author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
commit     19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree       42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/spdk/intel-ipsec-mb/avx512
parent     Initial commit. (diff)
download   ceph-upstream/16.2.11+ds.tar.xz, ceph-upstream/16.2.11+ds.zip

Adding upstream version 16.2.11+ds. (tags: upstream/16.2.11+ds, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx512')
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/aes_cbc_dec_vaes_avx512.asm            |  477
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm            |  727
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm                   | 1524
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/des_x16_avx512.asm                     | 2382
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/gcm128_avx512.asm                      |   31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/gcm128_vaes_avx512.asm                 |   32
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/gcm192_avx512.asm                      |   31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/gcm192_vaes_avx512.asm                 |   32
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/gcm256_avx512.asm                      |   31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/gcm256_vaes_avx512.asm                 |   32
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/gcm_avx512.asm                         | 3536
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm                    | 4272
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_flush_avx512.asm         |   31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_submit_avx512.asm        |   31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_flush_avx512.asm         |   31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_submit_avx512.asm        |   31
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_flush_avx512.asm            |  320
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_submit_avx512.asm           |  280
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_avx512.c                        | 1066
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm                  |  524
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_flush_avx512.asm           |  367
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_flush_avx512.asm   |   28
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_submit_avx512.asm  |   28
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_flush_avx512.asm   |  433
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_submit_avx512.asm  |  445
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_flush_avx512.asm   |   29
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_submit_avx512.asm  |   29
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_flush_avx512.asm   |  384
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_submit_avx512.asm  |  413
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_submit_avx512.asm          |  402
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/sha1_x16_avx512.asm                    |  439
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/sha256_x16_avx512.asm                  |  758
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/sha512_x8_avx512.asm                   |  595
33 files changed, 19771 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/avx512/aes_cbc_dec_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/aes_cbc_dec_vaes_avx512.asm
new file mode 100644
index 000000000..ce33caa92
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/aes_cbc_dec_vaes_avx512.asm
@@ -0,0 +1,477 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "include/aes_common.asm"
+
+%define zIV zmm0
+%define zBLK_0_3 zmm1
+%define zBLK_4_7 zmm2
+%define zBLK_8_11 zmm3
+%define zBLK_12_15 zmm4
+%define zTMP0 zmm5
+%define zTMP1 zmm6
+%define zTMP2 zmm7
+%define zTMP3 zmm8
+
+%define ZKEY0 zmm17
+%define ZKEY1 zmm18
+%define ZKEY2 zmm19
+%define ZKEY3 zmm20
+%define ZKEY4 zmm21
+%define ZKEY5 zmm22
+%define ZKEY6 zmm23
+%define ZKEY7 zmm24
+%define ZKEY8 zmm25
+%define ZKEY9 zmm26
+%define ZKEY10 zmm27
+%define ZKEY11 zmm28
+%define ZKEY12 zmm29
+%define ZKEY13 zmm30
+%define ZKEY14 zmm31
+
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes rax
+%endif
+
+%define tmp r10
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; macro to preload keys
+;;; - uses ZKEY[0-14] registers (ZMM)
+%macro LOAD_KEYS 2
+%define %%KEYS %1 ; [in] key pointer
+%define %%NROUNDS %2 ; [in] numerical value, number of AES rounds
+ ; excluding 1st and last rounds.
+ ; Example: AES-128 -> value 9
+
+%assign i 0
+%rep (%%NROUNDS + 2)
+ vbroadcastf64x2 ZKEY %+ i, [%%KEYS + 16*i]
+%assign i (i + 1)
+%endrep
+
+%endmacro
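
For reference, the key preload above corresponds to the following C intrinsics sketch; the helper name and the 16-bytes-per-round expanded key layout are illustrative, and the broadcast needs AVX512DQ (vbroadcasti64x2):

    #include <immintrin.h>

    /* Broadcast each 16-byte round key into a full ZMM register so that a
     * single vaesdec/vaesenc can later apply it to four blocks at once. */
    static void load_keys_zmm(const void *expanded_keys, int nrounds, __m512i zkeys[15])
    {
        const __m128i *k = (const __m128i *)expanded_keys;
        for (int i = 0; i < nrounds + 2; i++)   /* e.g. AES-128: nrounds = 9 -> 11 keys */
            zkeys[i] = _mm512_broadcast_i64x2(_mm_loadu_si128(&k[i]));
    }
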
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro is used to "cool down" pipeline after DECRYPT_16_PARALLEL macro
+;;; code as the number of final blocks is variable.
+;;; Processes the last %%num_final_blocks blocks (1 to 15, can't be 0)
+
+%macro FINAL_BLOCKS 14
+%define %%PLAIN_OUT %1 ; [in] output buffer
+%define %%CIPH_IN %2 ; [in] input buffer
+%define %%LAST_CIPH_BLK %3 ; [in/out] ZMM with IV/last cipher blk (in idx 3)
+%define %%num_final_blocks %4 ; [in] numerical value (1 - 15)
+%define %%CIPHER_PLAIN_0_3 %5 ; [out] ZMM next 0-3 cipher blocks
+%define %%CIPHER_PLAIN_4_7 %6 ; [out] ZMM next 4-7 cipher blocks
+%define %%CIPHER_PLAIN_8_11 %7 ; [out] ZMM next 8-11 cipher blocks
+%define %%CIPHER_PLAIN_12_15 %8 ; [out] ZMM next 12-15 cipher blocks
+%define %%ZT1 %9 ; [clobbered] ZMM temporary
+%define %%ZT2 %10 ; [clobbered] ZMM temporary
+%define %%ZT3 %11 ; [clobbered] ZMM temporary
+%define %%ZT4 %12 ; [clobbered] ZMM temporary
+%define %%IA0 %13 ; [clobbered] GP temporary
+%define %%NROUNDS %14 ; [in] number of rounds; numerical value
+
+ ;; load plain/cipher text
+ ZMM_LOAD_BLOCKS_0_16 %%num_final_blocks, %%CIPH_IN, 0, \
+ %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+
+ ;; Prepare final cipher text blocks to
+ ;; be XOR'd later after AESDEC
+ valignq %%ZT1, %%CIPHER_PLAIN_0_3, %%LAST_CIPH_BLK, 6
+%if %%num_final_blocks > 4
+ valignq %%ZT2, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_0_3, 6
+%endif
+%if %%num_final_blocks > 8
+ valignq %%ZT3, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_4_7, 6
+%endif
+%if %%num_final_blocks > 12
+ valignq %%ZT4, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_8_11, 6
+%endif
+
+ ;; Update IV with last cipher block
+ ;; to be used later in DECRYPT_16_PARALLEL
+%if %%num_final_blocks == 1
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 2
+%elif %%num_final_blocks == 2
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 4
+%elif %%num_final_blocks == 3
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 6
+%elif %%num_final_blocks == 4
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3
+%elif %%num_final_blocks == 5
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 2
+%elif %%num_final_blocks == 6
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 4
+%elif %%num_final_blocks == 7
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 6
+%elif %%num_final_blocks == 8
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7
+%elif %%num_final_blocks == 9
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 2
+%elif %%num_final_blocks == 10
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 4
+%elif %%num_final_blocks == 11
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 6
+%elif %%num_final_blocks == 12
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11
+%elif %%num_final_blocks == 13
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 2
+%elif %%num_final_blocks == 14
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 4
+%elif %%num_final_blocks == 15
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 6
+%endif
+
+ ;; AES rounds
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15, \
+ ZKEY %+ j, j, no_data, no_data, no_data, no_data, \
+ %%num_final_blocks, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; XOR with decrypted blocks to get plain text
+ vpxorq %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, %%ZT1
+%if %%num_final_blocks > 4
+ vpxorq %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, %%ZT2
+%endif
+%if %%num_final_blocks > 8
+ vpxorq %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, %%ZT3
+%endif
+%if %%num_final_blocks > 12
+ vpxorq %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, %%ZT4
+%endif
+
+ ;; write plain text back to output
+ ZMM_STORE_BLOCKS_0_16 %%num_final_blocks, %%PLAIN_OUT, 0, \
+ %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+
+%endmacro ; FINAL_BLOCKS
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main AES-CBC decrypt macro
+;;; - operates on single stream
+;;; - decrypts 16 blocks at a time
+%macro DECRYPT_16_PARALLEL 14
+%define %%PLAIN_OUT %1 ; [in] output buffer
+%define %%CIPH_IN %2 ; [in] input buffer
+%define %%LENGTH %3 ; [in/out] number of bytes to process
+%define %%LAST_CIPH_BLK %4 ; [in/out] ZMM with IV (first block) or last cipher block (idx 3)
+%define %%CIPHER_PLAIN_0_3 %5 ; [out] ZMM next 0-3 cipher blocks
+%define %%CIPHER_PLAIN_4_7 %6 ; [out] ZMM next 4-7 cipher blocks
+%define %%CIPHER_PLAIN_8_11 %7 ; [out] ZMM next 8-11 cipher blocks
+%define %%CIPHER_PLAIN_12_15 %8 ; [out] ZMM next 12-15 cipher blocks
+%define %%ZT1 %9 ; [clobbered] ZMM temporary
+%define %%ZT2 %10 ; [clobbered] ZMM temporary
+%define %%ZT3 %11 ; [clobbered] ZMM temporary
+%define %%ZT4 %12 ; [clobbered] ZMM temporary
+%define %%NROUNDS %13 ; [in] number of rounds; numerical value
+%define %%IA0 %14 ; [clobbered] GP temporary
+
+ vmovdqu8 %%CIPHER_PLAIN_0_3, [%%CIPH_IN]
+ vmovdqu8 %%CIPHER_PLAIN_4_7, [%%CIPH_IN + 64]
+ vmovdqu8 %%CIPHER_PLAIN_8_11, [%%CIPH_IN + 128]
+ vmovdqu8 %%CIPHER_PLAIN_12_15, [%%CIPH_IN + 192]
+
+ ;; prepare first set of cipher blocks for later XOR'ing
+ valignq %%ZT1, %%CIPHER_PLAIN_0_3, %%LAST_CIPH_BLK, 6
+ valignq %%ZT2, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_0_3, 6
+ valignq %%ZT3, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_4_7, 6
+ valignq %%ZT4, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_8_11, 6
+
+ ;; store last cipher text block to be used for next 16 blocks
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15
+
+ ;; AES rounds
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15, \
+ ZKEY %+ j, j, no_data, no_data, no_data, no_data, \
+ 16, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; XOR with decrypted blocks to get plain text
+ vpxorq %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, %%ZT1
+ vpxorq %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, %%ZT2
+ vpxorq %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, %%ZT3
+ vpxorq %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, %%ZT4
+
+ ;; write plain text back to output
+ vmovdqu8 [%%PLAIN_OUT], %%CIPHER_PLAIN_0_3
+ vmovdqu8 [%%PLAIN_OUT + 64], %%CIPHER_PLAIN_4_7
+ vmovdqu8 [%%PLAIN_OUT + 128], %%CIPHER_PLAIN_8_11
+ vmovdqu8 [%%PLAIN_OUT + 192], %%CIPHER_PLAIN_12_15
+
+ ;; adjust input pointer and length
+ sub %%LENGTH, (16 * 16)
+ add %%CIPH_IN, (16 * 16)
+ add %%PLAIN_OUT, (16 * 16)
+
+%endmacro ; DECRYPT_16_PARALLEL
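
Sixteen blocks can be decrypted in parallel because CBC decryption has no serial dependency on the plaintext side. A scalar C reference of the dataflow this macro implements is sketched below; the block-decrypt callback is a stand-in, not part of this file:

    #include <stdint.h>
    #include <string.h>

    typedef void (*aes_dec_block_fn)(const uint8_t in[16], uint8_t out[16],
                                     const void *expanded_keys);

    /* P[i] = AES_decrypt(C[i]) XOR C[i-1], with C[-1] = IV: every output block
     * depends only on two ciphertext blocks, so blocks can be processed in
     * parallel; the valignq above builds the "previous block" stream. */
    static void cbc_dec_reference(aes_dec_block_fn dec, const void *keys,
                                  const uint8_t *in, uint8_t *out,
                                  size_t nblocks, const uint8_t iv[16])
    {
        uint8_t prev[16], tmp[16];

        memcpy(prev, iv, 16);
        for (size_t i = 0; i < nblocks; i++) {
            dec(&in[16 * i], tmp, keys);
            for (int b = 0; b < 16; b++)
                out[16 * i + b] = tmp[b] ^ prev[b];
            memcpy(prev, &in[16 * i], 16);  /* last ciphertext block chains on */
        }
    }
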
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; AES_CBC_DEC macro decrypts given data.
+;;; Flow:
+;;; - Decrypt all blocks (multiple of 16) up to final 1-15 blocks
+;;; - Decrypt final blocks (1-15 blocks)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro AES_CBC_DEC 7
+%define %%CIPH_IN %1 ;; [in] pointer to input buffer
+%define %%PLAIN_OUT %2 ;; [in] pointer to output buffer
+%define %%KEYS %3 ;; [in] pointer to expanded keys
+%define %%IV %4 ;; [in] pointer to IV
+%define %%LENGTH %5 ;; [in/out] GP register with length in bytes
+%define %%NROUNDS %6 ;; [in] Number of AES rounds; numerical value
+%define %%TMP %7 ;; [clobbered] GP register
+
+ cmp %%LENGTH, 0
+ je %%cbc_dec_done
+
+ vinserti64x2 zIV, zIV, [%%IV], 3
+
+ ;; preload keys
+ LOAD_KEYS %%KEYS, %%NROUNDS
+
+%%decrypt_16_parallel:
+ cmp %%LENGTH, 256
+ jb %%final_blocks
+
+ DECRYPT_16_PARALLEL %%PLAIN_OUT, %%CIPH_IN, %%LENGTH, zIV, \
+ zBLK_0_3, zBLK_4_7, zBLK_8_11, zBLK_12_15, \
+ zTMP0, zTMP1, zTMP2, zTMP3, %%NROUNDS, %%TMP
+ jmp %%decrypt_16_parallel
+
+%%final_blocks:
+ ;; get num final blocks
+ shr %%LENGTH, 4
+ and %%LENGTH, 0xf
+ je %%cbc_dec_done
+
+ cmp %%LENGTH, 8
+ je %%final_num_blocks_is_8
+ jl %%final_blocks_is_1_7
+
+ ; Final blocks 9-15
+ cmp %%LENGTH, 12
+ je %%final_num_blocks_is_12
+ jl %%final_blocks_is_9_11
+
+ ; Final blocks 13-15
+ cmp %%LENGTH, 15
+ je %%final_num_blocks_is_15
+ cmp %%LENGTH, 14
+ je %%final_num_blocks_is_14
+ cmp %%LENGTH, 13
+ je %%final_num_blocks_is_13
+
+%%final_blocks_is_9_11:
+ cmp %%LENGTH, 11
+ je %%final_num_blocks_is_11
+ cmp %%LENGTH, 10
+ je %%final_num_blocks_is_10
+ cmp %%LENGTH, 9
+ je %%final_num_blocks_is_9
+
+%%final_blocks_is_1_7:
+ cmp %%LENGTH, 4
+ je %%final_num_blocks_is_4
+ jl %%final_blocks_is_1_3
+
+ ; Final blocks 5-7
+ cmp %%LENGTH, 7
+ je %%final_num_blocks_is_7
+ cmp %%LENGTH, 6
+ je %%final_num_blocks_is_6
+ cmp %%LENGTH, 5
+ je %%final_num_blocks_is_5
+
+%%final_blocks_is_1_3:
+ cmp %%LENGTH, 3
+ je %%final_num_blocks_is_3
+ cmp %%LENGTH, 2
+ je %%final_num_blocks_is_2
+ jmp %%final_num_blocks_is_1
+
+
+%%final_num_blocks_is_15:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 15, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_14:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 14, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_13:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 13, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_12:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 12, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_11:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 11, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_10:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 10, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_9:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 9, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_8:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 8, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_7:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 7, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_6:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 6, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_5:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 5, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_4:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 4, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_3:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 3, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_2:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 2, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_1:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 1, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+
+%%cbc_dec_done:
+%endmacro
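
The control flow of AES_CBC_DEC, a bulk 16-block loop followed by the 1-15 block tail dispatch, reduces to the C sketch below; decrypt_blocks is a placeholder standing in for DECRYPT_16_PARALLEL / FINAL_BLOCKS:

    #include <stddef.h>

    static void aes_cbc_dec_flow(size_t num_bytes,
                                 void (*decrypt_blocks)(size_t nblocks, void *st),
                                 void *st)
    {
        while (num_bytes >= 16 * 16) {            /* DECRYPT_16_PARALLEL */
            decrypt_blocks(16, st);
            num_bytes -= 16 * 16;
        }
        size_t tail = (num_bytes >> 4) & 0xf;     /* 1-15 leftover blocks */
        if (tail != 0)
            decrypt_blocks(tail, st);             /* FINAL_BLOCKS */
    }
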
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_128_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_dec_128_vaes_avx512,function,internal)
+aes_cbc_dec_128_vaes_avx512:
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5]
+%endif
+ AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 9, tmp
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_192_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_dec_192_vaes_avx512,function,internal)
+aes_cbc_dec_192_vaes_avx512:
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5]
+%endif
+ AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 11, tmp
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_256_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_dec_256_vaes_avx512,function,internal)
+aes_cbc_dec_256_vaes_avx512:
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5]
+%endif
+ AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 13, tmp
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
diff --git a/src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm
new file mode 100644
index 000000000..c4b1dd561
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm
@@ -0,0 +1,727 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routines to do 128/192/256 bit CBC AES encrypt
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+struc STACK
+_gpr_save: resq 3
+endstruc
+
+%define GPR_SAVE_AREA rsp + _gpr_save
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi
+%define arg4 rsi
+%endif
+
+%define ARG arg1
+%define LEN arg2
+
+%define IA0 rax
+%define IA1 rbx
+%define IA2 arg3
+%define IA3 arg4
+%define IA4 rbp
+%define IA5 r8
+%define IA6 r9
+%define IA7 r10
+%define IA8 r11
+%define IA9 r13
+%define IA10 r14
+%define IA11 r15
+%define IA12 r12
+
+%define ZIV00_03 zmm8
+%define ZIV04_07 zmm9
+%define ZIV08_11 zmm10
+%define ZIV12_15 zmm11
+
+%define ZT0 zmm16
+%define ZT1 zmm17
+%define ZT2 zmm18
+%define ZT3 zmm19
+%define ZT4 zmm20
+%define ZT5 zmm21
+%define ZT6 zmm22
+%define ZT7 zmm23
+%define ZT8 zmm24
+%define ZT9 zmm25
+%define ZT10 zmm26
+%define ZT11 zmm27
+%define ZT12 zmm28
+%define ZT13 zmm29
+%define ZT14 zmm30
+%define ZT15 zmm31
+
+%define ZT16 zmm12
+%define ZT17 zmm13
+%define ZT18 zmm14
+%define ZT19 zmm15
+
+%define TAB_A0B0A1B1 zmm6
+%define TAB_A2B2A3B3 zmm7
+
+;; Save registers states
+%macro FUNC_SAVE 0
+ sub rsp, STACK_size
+ mov [GPR_SAVE_AREA + 8*0], rbp
+%ifndef LINUX
+ mov [GPR_SAVE_AREA + 8*1], rsi
+ mov [GPR_SAVE_AREA + 8*2], rdi
+%endif
+%endmacro
+
+;; Restore register states
+%macro FUNC_RESTORE 0
+ ;; XMMs are saved at a higher level
+ mov rbp, [GPR_SAVE_AREA + 8*0]
+%ifndef LINUX
+ mov rsi, [GPR_SAVE_AREA + 8*1]
+ mov rdi, [GPR_SAVE_AREA + 8*2]
+%endif
+ add rsp, STACK_size
+ vzeroupper
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Transpose macro - executes 4x4 transpose of 4 ZMM registers
+; in: L0B0-3 out: B0L0-3
+; L1B0-3 B1L0-3
+; L2B0-3 B2L0-3
+; L3B0-3 B3L0-3
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro TRANSPOSE_4x4 8
+%define %%IN_OUT_0 %1
+%define %%IN_OUT_1 %2
+%define %%IN_OUT_2 %3
+%define %%IN_OUT_3 %4
+%define %%ZTMP_0 %5
+%define %%ZTMP_1 %6
+%define %%ZTMP_2 %7
+%define %%ZTMP_3 %8
+
+ vmovdqa64 %%ZTMP_0, TAB_A0B0A1B1
+ vmovdqa64 %%ZTMP_1, %%ZTMP_0
+ vmovdqa64 %%ZTMP_2, TAB_A2B2A3B3
+ vmovdqa64 %%ZTMP_3, %%ZTMP_2
+
+ vpermi2q %%ZTMP_0, %%IN_OUT_0, %%IN_OUT_1
+ vpermi2q %%ZTMP_1, %%IN_OUT_2, %%IN_OUT_3
+ vpermi2q %%ZTMP_2, %%IN_OUT_0, %%IN_OUT_1
+ vpermi2q %%ZTMP_3, %%IN_OUT_2, %%IN_OUT_3
+
+ vshufi64x2 %%IN_OUT_0, %%ZTMP_0, %%ZTMP_1, 0x44
+ vshufi64x2 %%IN_OUT_2, %%ZTMP_2, %%ZTMP_3, 0x44
+ vshufi64x2 %%IN_OUT_1, %%ZTMP_0, %%ZTMP_1, 0xee
+ vshufi64x2 %%IN_OUT_3, %%ZTMP_2, %%ZTMP_3, 0xee
+%endmacro
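
As a cross-check of the permute tables (A0B0A1B1 / A2B2A3B3, defined in the .data section of this file) and the vshufi64x2 immediates, the same 4x4 transpose of 128-bit blocks can be written with AVX512F intrinsics; the function name is illustrative:

    #include <immintrin.h>

    /* Input rows r[0..3] each hold blocks 0-3 of one lane; on return, row i
     * holds block i of all four lanes (mirrors TRANSPOSE_4x4 above). */
    static void transpose_4x4_blocks(__m512i r[4])
    {
        const __m512i tab0 = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);   /* A0B0A1B1 */
        const __m512i tab1 = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); /* A2B2A3B3 */

        __m512i t0 = _mm512_permutex2var_epi64(r[0], tab0, r[1]);
        __m512i t1 = _mm512_permutex2var_epi64(r[2], tab0, r[3]);
        __m512i t2 = _mm512_permutex2var_epi64(r[0], tab1, r[1]);
        __m512i t3 = _mm512_permutex2var_epi64(r[2], tab1, r[3]);

        r[0] = _mm512_shuffle_i64x2(t0, t1, 0x44);
        r[2] = _mm512_shuffle_i64x2(t2, t3, 0x44);
        r[1] = _mm512_shuffle_i64x2(t0, t1, 0xee);
        r[3] = _mm512_shuffle_i64x2(t2, t3, 0xee);
    }
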
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LOAD_STORE - loads/stores 1-4 blocks (16 bytes) for 4 lanes into ZMM registers
+; - Loads 4 blocks by default
+; - Pass %%MASK_REG argument to load/store 1-3 blocks (optional)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro LOAD_STORE_x4 15-16
+%define %%LANE_A %1 ; [in] lane index to load/store (numerical)
+%define %%LANE_B %2 ; [in] lane index to load/store (numerical)
+%define %%LANE_C %3 ; [in] lane index to load/store (numerical)
+%define %%LANE_D %4 ; [in] lane index to load/store (numerical)
+%define %%DATA_PTR %5 ; [in] GP reg with ptr to lane input table
+%define %%OFFSET %6 ; [in] GP reg input/output buffer offset
+%define %%ZDATA0 %7 ; [in/out] ZMM reg to load/store data
+%define %%ZDATA1 %8 ; [in/out] ZMM reg to load/store data
+%define %%ZDATA2 %9 ; [in/out] ZMM reg to load/store data
+%define %%ZDATA3 %10 ; [in/out] ZMM reg to load/store data
+%define %%GP0 %11 ; [clobbered] tmp GP reg
+%define %%GP1 %12 ; [clobbered] tmp GP reg
+%define %%GP2 %13 ; [clobbered] tmp GP reg
+%define %%GP3 %14 ; [clobbered] tmp GP reg
+%define %%LOAD_STORE %15 ; [in] string value to select LOAD or STORE
+%define %%MASK_REG %16 ; [in] mask reg used for load/store mask
+%define %%NUM_ARGS %0
+
+ mov %%GP0, [%%DATA_PTR + 8*(%%LANE_A)]
+ mov %%GP1, [%%DATA_PTR + 8*(%%LANE_B)]
+ mov %%GP2, [%%DATA_PTR + 8*(%%LANE_C)]
+ mov %%GP3, [%%DATA_PTR + 8*(%%LANE_D)]
+
+%if %%NUM_ARGS <= 15 ;; %%MASK_REG not set, assume 4 block load/store
+%ifidn %%LOAD_STORE, LOAD
+ vmovdqu8 %%ZDATA0, [%%GP0 + %%OFFSET]
+ vmovdqu8 %%ZDATA1, [%%GP1 + %%OFFSET]
+ vmovdqu8 %%ZDATA2, [%%GP2 + %%OFFSET]
+ vmovdqu8 %%ZDATA3, [%%GP3 + %%OFFSET]
+%else ; STORE
+ vmovdqu8 [%%GP0 + %%OFFSET], %%ZDATA0
+ vmovdqu8 [%%GP1 + %%OFFSET], %%ZDATA1
+ vmovdqu8 [%%GP2 + %%OFFSET], %%ZDATA2
+ vmovdqu8 [%%GP3 + %%OFFSET], %%ZDATA3
+%endif
+%else ;; %%MASK_REG argument passed - 1, 2, or 3 block load/store
+%ifidn %%LOAD_STORE, LOAD
+ vmovdqu8 %%ZDATA0{%%MASK_REG}{z}, [%%GP0 + %%OFFSET]
+ vmovdqu8 %%ZDATA1{%%MASK_REG}{z}, [%%GP1 + %%OFFSET]
+ vmovdqu8 %%ZDATA2{%%MASK_REG}{z}, [%%GP2 + %%OFFSET]
+ vmovdqu8 %%ZDATA3{%%MASK_REG}{z}, [%%GP3 + %%OFFSET]
+%else ; STORE
+ vmovdqu8 [%%GP0 + %%OFFSET]{%%MASK_REG}, %%ZDATA0
+ vmovdqu8 [%%GP1 + %%OFFSET]{%%MASK_REG}, %%ZDATA1
+ vmovdqu8 [%%GP2 + %%OFFSET]{%%MASK_REG}, %%ZDATA2
+ vmovdqu8 [%%GP3 + %%OFFSET]{%%MASK_REG}, %%ZDATA3
+%endif
+%endif ;; %%NUM_ARGS
+%endmacro
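
The optional %%MASK_REG path above is a byte-granular masked vmovdqu8. In C intrinsics (AVX512BW assumed, helper names illustrative) a 1-4 block load/store of one lane looks like this:

    #include <immintrin.h>
    #include <stdint.h>

    static __mmask64 blocks_to_mask(unsigned nblocks /* 1-4 */)
    {
        return (nblocks >= 4) ? ~(__mmask64)0
                              : (((__mmask64)1 << (16 * nblocks)) - 1);
    }

    static __m512i load_blocks(const void *p, unsigned nblocks)
    {
        return _mm512_maskz_loadu_epi8(blocks_to_mask(nblocks), p);
    }

    static void store_blocks(void *p, __m512i data, unsigned nblocks)
    {
        _mm512_mask_storeu_epi8(p, blocks_to_mask(nblocks), data);
    }
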
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; AESENC_ROUNDS_x16 macro
+; - 16 lanes, 1 block per lane
+; - it handles special cases: the last and zero rounds
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro AESENC_ROUNDS_x16 5
+%define %%L00_03 %1 ; [in/out] ZMM with lane 0-3 blocks
+%define %%L04_07 %2 ; [in/out] ZMM with lane 4-7 blocks
+%define %%L08_11 %3 ; [in/out] ZMM with lane 8-11 blocks
+%define %%L12_15 %4 ; [in/out] ZMM with lane 12-15 blocks
+%define %%NROUNDS %5 ; [in] number of aes rounds
+
+%define %%KP ARG + _aesarg_key_tab
+%define K00_03_OFFSET 0
+%define K04_07_OFFSET 64
+%define K08_11_OFFSET 128
+%define K12_15_OFFSET 192
+
+%assign ROUND 0
+%rep (%%NROUNDS + 2)
+
+%if ROUND < 1
+ ;; XOR with key 0 before doing aesenc
+ vpxorq %%L00_03, [%%KP + K00_03_OFFSET + ROUND * (16*16)]
+ vpxorq %%L04_07, [%%KP + K04_07_OFFSET + ROUND * (16*16)]
+ vpxorq %%L08_11, [%%KP + K08_11_OFFSET + ROUND * (16*16)]
+ vpxorq %%L12_15, [%%KP + K12_15_OFFSET + ROUND * (16*16)]
+%else
+%if ROUND <= %%NROUNDS
+
+ ;; rounds 1 to 9/11/13
+ vaesenc %%L00_03, %%L00_03, [%%KP + K00_03_OFFSET + ROUND * (16*16)]
+ vaesenc %%L04_07, %%L04_07, [%%KP + K04_07_OFFSET + ROUND * (16*16)]
+ vaesenc %%L08_11, %%L08_11, [%%KP + K08_11_OFFSET + ROUND * (16*16)]
+ vaesenc %%L12_15, %%L12_15, [%%KP + K12_15_OFFSET + ROUND * (16*16)]
+%else
+ ;; the last round
+ vaesenclast %%L00_03, %%L00_03, [%%KP + K00_03_OFFSET + ROUND * (16*16)]
+ vaesenclast %%L04_07, %%L04_07, [%%KP + K04_07_OFFSET + ROUND * (16*16)]
+ vaesenclast %%L08_11, %%L08_11, [%%KP + K08_11_OFFSET + ROUND * (16*16)]
+ vaesenclast %%L12_15, %%L12_15, [%%KP + K12_15_OFFSET + ROUND * (16*16)]
+%endif
+%endif
+
+%assign ROUND (ROUND + 1)
+%endrep
+
+%endmacro ; AESENC_ROUNDS_x16
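
Per 4-block ZMM register, the round structure above is: whitening XOR with round key 0, NROUNDS vaesenc rounds, then one vaesenclast. A simplified C sketch follows (VAES intrinsics; unlike the macro, which reads per-lane keys from the key table, a single key set per register is assumed here):

    #include <immintrin.h>

    static __m512i aesenc_rounds(__m512i blocks, const __m512i *round_keys, int nrounds)
    {
        blocks = _mm512_xor_si512(blocks, round_keys[0]);     /* round 0 */
        for (int r = 1; r <= nrounds; r++)                    /* rounds 1..9/11/13 */
            blocks = _mm512_aesenc_epi128(blocks, round_keys[r]);
        return _mm512_aesenclast_epi128(blocks, round_keys[nrounds + 1]);
    }
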
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; ENCRYPT_16_PARALLEL - Encrypts all blocks up to a multiple of 4
+; - Operation
+; - loop encrypting %%LENGTH bytes of input data
+; - each loop encrypts 4 blocks across 16 lanes
+; - stop when %%LENGTH is less than 64 bytes (4 blocks)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ENCRYPT_16_PARALLEL 31
+%define %%ZIV00_03 %1 ;; [in] lane 0-3 IVs
+%define %%ZIV04_07 %2 ;; [in] lane 4-7 IVs
+%define %%ZIV08_11 %3 ;; [in] lane 8-11 IVs
+%define %%ZIV12_15 %4 ;; [in] lane 12-15 IVs
+%define %%LENGTH %5 ;; [in/out] GP register with length in bytes
+%define %%NROUNDS %6 ;; [in] Number of AES rounds; numerical value
+%define %%IDX %7 ;; [clobbered] GP reg to maintain idx
+%define %%B0L00_03 %8 ;; [clobbered] tmp ZMM register
+%define %%B0L04_07 %9 ;; [clobbered] tmp ZMM register
+%define %%B0L08_11 %10 ;; [clobbered] tmp ZMM register
+%define %%B0L12_15 %11 ;; [clobbered] tmp ZMM register
+%define %%B1L00_03 %12 ;; [clobbered] tmp ZMM register
+%define %%B1L04_07 %13 ;; [clobbered] tmp ZMM register
+%define %%B1L08_11 %14 ;; [clobbered] tmp ZMM register
+%define %%B1L12_15 %15 ;; [clobbered] tmp ZMM register
+%define %%B2L00_03 %16 ;; [clobbered] tmp ZMM register
+%define %%B2L04_07 %17 ;; [clobbered] tmp ZMM register
+%define %%B2L08_11 %18 ;; [clobbered] tmp ZMM register
+%define %%B2L12_15 %19 ;; [clobbered] tmp ZMM register
+%define %%B3L00_03 %20 ;; [clobbered] tmp ZMM register
+%define %%B3L04_07 %21 ;; [clobbered] tmp ZMM register
+%define %%B3L08_11 %22 ;; [clobbered] tmp ZMM register
+%define %%B3L12_15 %23 ;; [clobbered] tmp ZMM register
+%define %%ZTMP0 %24 ;; [clobbered] tmp ZMM register
+%define %%ZTMP1 %25 ;; [clobbered] tmp ZMM register
+%define %%ZTMP2 %26 ;; [clobbered] tmp ZMM register
+%define %%ZTMP3 %27 ;; [clobbered] tmp ZMM register
+%define %%TMP0 %28 ;; [clobbered] tmp GP register
+%define %%TMP1 %29 ;; [clobbered] tmp GP register
+%define %%TMP2 %30 ;; [clobbered] tmp GP register
+%define %%TMP3 %31 ;; [clobbered] tmp GP register
+
+%define %%IN ARG + _aesarg_in
+%define %%OUT ARG + _aesarg_out
+
+ ;; check for at least 4 blocks
+ cmp %%LENGTH, 64
+ jl %%encrypt_16_done
+
+ xor %%IDX, %%IDX
+ ;; skip length check on first loop
+ jmp %%encrypt_16_first
+
+%%encrypt_16_start:
+ cmp %%LENGTH, 64
+ jl %%encrypt_16_end
+
+%%encrypt_16_first:
+ ;; load 4 plaintext blocks for lanes 0-3
+ LOAD_STORE_x4 0, 1, 2, 3, %%IN, %%IDX, %%B0L00_03, %%B1L00_03, \
+ %%B2L00_03, %%B3L00_03, %%TMP0, %%TMP1, %%TMP2, %%TMP3, LOAD
+
+ TRANSPOSE_4x4 %%B0L00_03, %%B1L00_03, %%B2L00_03, %%B3L00_03, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 4-7
+ LOAD_STORE_x4 4, 5, 6, 7, %%IN, %%IDX, %%B0L04_07, %%B1L04_07, \
+ %%B2L04_07, %%B3L04_07, %%TMP0, %%TMP1, %%TMP2, %%TMP3, LOAD
+
+ TRANSPOSE_4x4 %%B0L04_07, %%B1L04_07, %%B2L04_07, %%B3L04_07, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 8-11
+ LOAD_STORE_x4 8, 9, 10, 11, %%IN, %%IDX, %%B0L08_11, %%B1L08_11, \
+ %%B2L08_11, %%B3L08_11, %%TMP0, %%TMP1, %%TMP2, %%TMP3, LOAD
+
+ TRANSPOSE_4x4 %%B0L08_11, %%B1L08_11, %%B2L08_11, %%B3L08_11, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 12-15
+ LOAD_STORE_x4 12, 13, 14, 15, %%IN, %%IDX, %%B0L12_15, %%B1L12_15, \
+ %%B2L12_15, %%B3L12_15, %%TMP0, %%TMP1, %%TMP2, %%TMP3, LOAD
+
+ TRANSPOSE_4x4 %%B0L12_15, %%B1L12_15, %%B2L12_15, %%B3L12_15, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; xor first plaintext block with IV
+ vpxorq %%B0L00_03, %%ZIV00_03
+ vpxorq %%B0L04_07, %%ZIV04_07
+ vpxorq %%B0L08_11, %%ZIV08_11
+ vpxorq %%B0L12_15, %%ZIV12_15
+
+ ;; encrypt block 0 lanes
+ AESENC_ROUNDS_x16 %%B0L00_03, %%B0L04_07, %%B0L08_11, %%B0L12_15, %%NROUNDS
+
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B1L00_03, %%B0L00_03
+ vpxorq %%B1L04_07, %%B0L04_07
+ vpxorq %%B1L08_11, %%B0L08_11
+ vpxorq %%B1L12_15, %%B0L12_15
+
+ ;; encrypt block 1 lanes
+ AESENC_ROUNDS_x16 %%B1L00_03, %%B1L04_07, %%B1L08_11, %%B1L12_15, %%NROUNDS
+
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B2L00_03, %%B1L00_03
+ vpxorq %%B2L04_07, %%B1L04_07
+ vpxorq %%B2L08_11, %%B1L08_11
+ vpxorq %%B2L12_15, %%B1L12_15
+
+ ;; encrypt block 2 lanes
+ AESENC_ROUNDS_x16 %%B2L00_03, %%B2L04_07, %%B2L08_11, %%B2L12_15, %%NROUNDS
+
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B3L00_03, %%B2L00_03
+ vpxorq %%B3L04_07, %%B2L04_07
+ vpxorq %%B3L08_11, %%B2L08_11
+ vpxorq %%B3L12_15, %%B2L12_15
+
+ ;; encrypt block 3 lanes
+ AESENC_ROUNDS_x16 %%B3L00_03, %%B3L04_07, %%B3L08_11, %%B3L12_15, %%NROUNDS
+
+ ;; store last cipher block
+ vmovdqa64 %%ZIV00_03, %%B3L00_03
+ vmovdqa64 %%ZIV04_07, %%B3L04_07
+ vmovdqa64 %%ZIV08_11, %%B3L08_11
+ vmovdqa64 %%ZIV12_15, %%B3L12_15
+
+ ;; write back cipher text for lanes 0-3
+ TRANSPOSE_4x4 %%B0L00_03, %%B1L00_03, %%B2L00_03, %%B3L00_03, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 0, 1, 2, 3, %%OUT, %%IDX, %%B0L00_03, %%B1L00_03, \
+ %%B2L00_03, %%B3L00_03, %%TMP0, %%TMP1, %%TMP2, %%TMP3, STORE
+
+ ;; write back cipher text for lanes 4-7
+ TRANSPOSE_4x4 %%B0L04_07, %%B1L04_07, %%B2L04_07, %%B3L04_07, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 4, 5, 6, 7, %%OUT, %%IDX, %%B0L04_07, %%B1L04_07, \
+ %%B2L04_07, %%B3L04_07, %%TMP0, %%TMP1, %%TMP2, %%TMP3, STORE
+
+ ;; write back cipher text for lanes 8-11
+ TRANSPOSE_4x4 %%B0L08_11, %%B1L08_11, %%B2L08_11, %%B3L08_11, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 8, 9, 10, 11, %%OUT, %%IDX, %%B0L08_11, %%B1L08_11, \
+ %%B2L08_11, %%B3L08_11, %%TMP0, %%TMP1, %%TMP2, %%TMP3, STORE
+
+ ;; write back cipher text for lanes 12-15
+ TRANSPOSE_4x4 %%B0L12_15, %%B1L12_15, %%B2L12_15, %%B3L12_15, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 12, 13, 14, 15, %%OUT, %%IDX, %%B0L12_15, %%B1L12_15, \
+ %%B2L12_15, %%B3L12_15, %%TMP0, %%TMP1, %%TMP2, %%TMP3, STORE
+
+ sub %%LENGTH, 64
+ add %%IDX, 64
+ jmp %%encrypt_16_start
+
+%%encrypt_16_end:
+ ;; update in/out pointers
+ vpbroadcastq %%ZTMP2, %%IDX
+ vpaddq %%ZTMP0, %%ZTMP2, [%%IN]
+ vpaddq %%ZTMP1, %%ZTMP2, [%%IN + 64]
+ vmovdqa64 [%%IN], %%ZTMP0
+ vmovdqa64 [%%IN + 64], %%ZTMP1
+
+ vpaddq %%ZTMP0, %%ZTMP2, [%%OUT]
+ vpaddq %%ZTMP1, %%ZTMP2, [%%OUT + 64]
+ vmovdqa64 [%%OUT], %%ZTMP0
+ vmovdqa64 [%%OUT + 64], %%ZTMP1
+
+%%encrypt_16_done:
+%endmacro
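
Unlike decryption, CBC encryption is serial within one stream, which is why the macro above interleaves 16 independent lanes and walks block 0 of all lanes, then block 1, and so on. A scalar C reference of that ordering is sketched below; the block-encrypt callback is a stand-in:

    #include <stdint.h>
    #include <string.h>

    typedef void (*aes_enc_block_fn)(const uint8_t in[16], uint8_t out[16],
                                     const void *keys);

    /* Per lane: C[b] = AES_encrypt(P[b] XOR C[b-1]), with C[-1] = IV. */
    static void cbc_enc_lanes_reference(aes_enc_block_fn enc,
                                        const void *keys[16],
                                        const uint8_t *in[16], uint8_t *out[16],
                                        uint8_t iv[16][16], size_t nblocks)
    {
        for (size_t b = 0; b < nblocks; b++)            /* blocks: serial per lane */
            for (int lane = 0; lane < 16; lane++) {     /* lanes: independent */
                uint8_t x[16];
                for (int i = 0; i < 16; i++)
                    x[i] = in[lane][16 * b + i] ^ iv[lane][i];
                enc(x, &out[lane][16 * b], keys[lane]);
                memcpy(iv[lane], &out[lane][16 * b], 16);   /* chain */
            }
    }
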
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; ENCRYPT_16_FINAL - Encrypts the final blocks (fewer than 4) across 16 lanes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ENCRYPT_16_FINAL 31
+%define %%ZIV00_03 %1 ;; [in] lane 0-3 IVs
+%define %%ZIV04_07 %2 ;; [in] lane 4-7 IVs
+%define %%ZIV08_11 %3 ;; [in] lane 8-11 IVs
+%define %%ZIV12_15 %4 ;; [in] lane 12-15 IVs
+%define %%NROUNDS %5 ;; [in] Number of AES rounds; numerical value
+%define %%IDX %6 ;; [clobbered] GP reg to maintain idx
+%define %%B0L00_03 %7 ;; [clobbered] tmp ZMM register
+%define %%B0L04_07 %8 ;; [clobbered] tmp ZMM register
+%define %%B0L08_11 %9 ;; [clobbered] tmp ZMM register
+%define %%B0L12_15 %10 ;; [clobbered] tmp ZMM register
+%define %%B1L00_03 %11 ;; [clobbered] tmp ZMM register
+%define %%B1L04_07 %12 ;; [clobbered] tmp ZMM register
+%define %%B1L08_11 %13 ;; [clobbered] tmp ZMM register
+%define %%B1L12_15 %14 ;; [clobbered] tmp ZMM register
+%define %%B2L00_03 %15 ;; [clobbered] tmp ZMM register
+%define %%B2L04_07 %16 ;; [clobbered] tmp ZMM register
+%define %%B2L08_11 %17 ;; [clobbered] tmp ZMM register
+%define %%B2L12_15 %18 ;; [clobbered] tmp ZMM register
+%define %%B3L00_03 %19 ;; [clobbered] tmp ZMM register
+%define %%B3L04_07 %20 ;; [clobbered] tmp ZMM register
+%define %%B3L08_11 %21 ;; [clobbered] tmp ZMM register
+%define %%B3L12_15 %22 ;; [clobbered] tmp ZMM register
+%define %%ZTMP0 %23 ;; [clobbered] tmp ZMM register
+%define %%ZTMP1 %24 ;; [clobbered] tmp ZMM register
+%define %%ZTMP2 %25 ;; [clobbered] tmp ZMM register
+%define %%ZTMP3 %26 ;; [clobbered] tmp ZMM register
+%define %%TMP0 %27 ;; [clobbered] tmp GP register
+%define %%TMP1 %28 ;; [clobbered] tmp GP register
+%define %%TMP2 %29 ;; [clobbered] tmp GP register
+%define %%TMP3 %30 ;; [clobbered] tmp GP register
+%define %%NUM_BLKS %31 ;; [in] number of blocks (numerical value)
+
+%define %%IN ARG + _aesarg_in
+%define %%OUT ARG + _aesarg_out
+
+%if %%NUM_BLKS == 1
+ mov %%TMP0, 0x0000_0000_0000_ffff
+ kmovq k1, %%TMP0
+%elif %%NUM_BLKS == 2
+ mov %%TMP0, 0x0000_0000_ffff_ffff
+ kmovq k1, %%TMP0
+%elif %%NUM_BLKS == 3
+ mov %%TMP0, 0x0000_ffff_ffff_ffff
+ kmovq k1, %%TMP0
+%endif
+ xor %%IDX, %%IDX
+
+ ;; load 4 plaintext blocks for lanes 0-3
+ LOAD_STORE_x4 0, 1, 2, 3, %%IN, %%IDX, %%B0L00_03, %%B1L00_03, \
+ %%B2L00_03, %%B3L00_03, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, LOAD, k1
+
+ TRANSPOSE_4x4 %%B0L00_03, %%B1L00_03, %%B2L00_03, %%B3L00_03, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 4-7
+ LOAD_STORE_x4 4, 5, 6, 7, %%IN, %%IDX, %%B0L04_07, %%B1L04_07, \
+ %%B2L04_07, %%B3L04_07, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, LOAD, k1
+
+ TRANSPOSE_4x4 %%B0L04_07, %%B1L04_07, %%B2L04_07, %%B3L04_07, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 8-11
+ LOAD_STORE_x4 8, 9, 10, 11, %%IN, %%IDX, %%B0L08_11, %%B1L08_11, \
+ %%B2L08_11, %%B3L08_11, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, LOAD, k1
+
+ TRANSPOSE_4x4 %%B0L08_11, %%B1L08_11, %%B2L08_11, %%B3L08_11, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 12-15
+ LOAD_STORE_x4 12, 13, 14, 15, %%IN, %%IDX, %%B0L12_15, %%B1L12_15, \
+ %%B2L12_15, %%B3L12_15, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, LOAD, k1
+
+ TRANSPOSE_4x4 %%B0L12_15, %%B1L12_15, %%B2L12_15, %%B3L12_15, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; xor plaintext block with IV
+ vpxorq %%B0L00_03, %%ZIV00_03
+ vpxorq %%B0L04_07, %%ZIV04_07
+ vpxorq %%B0L08_11, %%ZIV08_11
+ vpxorq %%B0L12_15, %%ZIV12_15
+
+ ;; encrypt block 0 lanes
+ AESENC_ROUNDS_x16 %%B0L00_03, %%B0L04_07, %%B0L08_11, %%B0L12_15, %%NROUNDS
+
+%if %%NUM_BLKS == 1
+ ;; store last cipher block
+ vmovdqa64 %%ZIV00_03, %%B0L00_03
+ vmovdqa64 %%ZIV04_07, %%B0L04_07
+ vmovdqa64 %%ZIV08_11, %%B0L08_11
+ vmovdqa64 %%ZIV12_15, %%B0L12_15
+%endif
+
+%if %%NUM_BLKS >= 2
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B1L00_03, %%B0L00_03
+ vpxorq %%B1L04_07, %%B0L04_07
+ vpxorq %%B1L08_11, %%B0L08_11
+ vpxorq %%B1L12_15, %%B0L12_15
+
+ ;; encrypt block 1 lanes
+ AESENC_ROUNDS_x16 %%B1L00_03, %%B1L04_07, %%B1L08_11, %%B1L12_15, %%NROUNDS
+%endif
+%if %%NUM_BLKS == 2
+ ;; store last cipher block
+ vmovdqa64 %%ZIV00_03, %%B1L00_03
+ vmovdqa64 %%ZIV04_07, %%B1L04_07
+ vmovdqa64 %%ZIV08_11, %%B1L08_11
+ vmovdqa64 %%ZIV12_15, %%B1L12_15
+%endif
+
+%if %%NUM_BLKS >= 3
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B2L00_03, %%B1L00_03
+ vpxorq %%B2L04_07, %%B1L04_07
+ vpxorq %%B2L08_11, %%B1L08_11
+ vpxorq %%B2L12_15, %%B1L12_15
+
+ ;; encrypt block 2 lanes
+ AESENC_ROUNDS_x16 %%B2L00_03, %%B2L04_07, %%B2L08_11, %%B2L12_15, %%NROUNDS
+%endif
+%if %%NUM_BLKS == 3
+ ;; store last cipher block
+ vmovdqa64 %%ZIV00_03, %%B2L00_03
+ vmovdqa64 %%ZIV04_07, %%B2L04_07
+ vmovdqa64 %%ZIV08_11, %%B2L08_11
+ vmovdqa64 %%ZIV12_15, %%B2L12_15
+%endif
+ ;; write back cipher text for lanes 0-3
+ TRANSPOSE_4x4 %%B0L00_03, %%B1L00_03, %%B2L00_03, %%B3L00_03, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 0, 1, 2, 3, %%OUT, %%IDX, %%B0L00_03, %%B1L00_03, \
+ %%B2L00_03, %%B3L00_03, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, STORE, k1
+
+ ;; write back cipher text for lanes 4-7
+ TRANSPOSE_4x4 %%B0L04_07, %%B1L04_07, %%B2L04_07, %%B3L04_07, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 4, 5, 6, 7, %%OUT, %%IDX, %%B0L04_07, %%B1L04_07, \
+ %%B2L04_07, %%B3L04_07, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, STORE, k1
+
+ ;; write back cipher text for lanes 8-11
+ TRANSPOSE_4x4 %%B0L08_11, %%B1L08_11, %%B2L08_11, %%B3L08_11, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 8, 9, 10, 11, %%OUT, %%IDX, %%B0L08_11, %%B1L08_11, \
+ %%B2L08_11, %%B3L08_11, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, STORE, k1
+
+ ;; write back cipher text for lanes 12-15
+ TRANSPOSE_4x4 %%B0L12_15, %%B1L12_15, %%B2L12_15, %%B3L12_15, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 12, 13, 14, 15, %%OUT, %%IDX, %%B0L12_15, %%B1L12_15, \
+ %%B2L12_15, %%B3L12_15, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, STORE, k1
+
+ ;; update in/out pointers
+ mov %%IDX, %%NUM_BLKS
+ shl %%IDX, 4
+ vpbroadcastq %%ZTMP2, %%IDX
+ vpaddq %%ZTMP0, %%ZTMP2, [%%IN]
+ vpaddq %%ZTMP1, %%ZTMP2, [%%IN + 64]
+ vmovdqa64 [%%IN], %%ZTMP0
+ vmovdqa64 [%%IN + 64], %%ZTMP1
+
+ vpaddq %%ZTMP0, %%ZTMP2, [%%OUT]
+ vpaddq %%ZTMP1, %%ZTMP2, [%%OUT + 64]
+ vmovdqa64 [%%OUT], %%ZTMP0
+ vmovdqa64 [%%OUT + 64], %%ZTMP1
+
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CBC_ENC encrypts the given data.
+; Requires the input data to be at least 1 block (16 bytes) long.
+; Input: number of AES rounds
+;
+; First encrypts blocks up to a multiple of 4,
+; then encrypts the final blocks (fewer than 4)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CBC_ENC 1
+%define %%ROUNDS %1
+
+ ;; load transpose tables
+ vmovdqa64 TAB_A0B0A1B1, [rel A0B0A1B1]
+ vmovdqa64 TAB_A2B2A3B3, [rel A2B2A3B3]
+
+ ;; load IV's per lane
+ vmovdqa64 ZIV00_03, [ARG + _aesarg_IV + 16*0]
+ vmovdqa64 ZIV04_07, [ARG + _aesarg_IV + 16*4]
+ vmovdqa64 ZIV08_11, [ARG + _aesarg_IV + 16*8]
+ vmovdqa64 ZIV12_15, [ARG + _aesarg_IV + 16*12]
+
+ ENCRYPT_16_PARALLEL ZIV00_03, ZIV04_07, ZIV08_11, ZIV12_15, \
+ LEN, %%ROUNDS, IA12, ZT0, ZT1, ZT2, ZT3, ZT4, ZT5, \
+ ZT6, ZT7, ZT8, ZT9, ZT10, ZT11, ZT12, ZT13, ZT14, \
+ ZT15, ZT16, ZT17, ZT18, ZT19, IA2, IA3, IA4, IA5
+
+ ;; get num remaining blocks
+ shr LEN, 4
+ and LEN, 3
+ je %%_cbc_enc_done
+ cmp LEN, 1
+ je %%_final_blocks_1
+ cmp LEN, 2
+ je %%_final_blocks_2
+
+%%_final_blocks_3:
+ ENCRYPT_16_FINAL ZIV00_03, ZIV04_07, ZIV08_11, ZIV12_15, \
+ %%ROUNDS, IA12, ZT0, ZT1, ZT2, ZT3, ZT4, ZT5, ZT6, ZT7, \
+ ZT8, ZT9, ZT10, ZT11, ZT12, ZT13, ZT14, ZT15, ZT16, ZT17, \
+ ZT18, ZT19, IA2, IA3, IA4, IA5, 3
+ jmp %%_cbc_enc_done
+%%_final_blocks_1:
+ ENCRYPT_16_FINAL ZIV00_03, ZIV04_07, ZIV08_11, ZIV12_15, \
+ %%ROUNDS, IA12, ZT0, ZT1, ZT2, ZT3, ZT4, ZT5, ZT6, ZT7, \
+ ZT8, ZT9, ZT10, ZT11, ZT12, ZT13, ZT14, ZT15, ZT16, ZT17, \
+ ZT18, ZT19, IA2, IA3, IA4, IA5, 1
+ jmp %%_cbc_enc_done
+%%_final_blocks_2:
+ ENCRYPT_16_FINAL ZIV00_03, ZIV04_07, ZIV08_11, ZIV12_15, \
+ %%ROUNDS, IA12, ZT0, ZT1, ZT2, ZT3, ZT4, ZT5, ZT6, ZT7, \
+ ZT8, ZT9, ZT10, ZT11, ZT12, ZT13, ZT14, ZT15, ZT16, ZT17, \
+ ZT18, ZT19, IA2, IA3, IA4, IA5, 2
+%%_cbc_enc_done:
+ ;; store IV's per lane
+ vmovdqa64 [ARG + _aesarg_IV + 16*0], ZIV00_03
+ vmovdqa64 [ARG + _aesarg_IV + 16*4], ZIV04_07
+ vmovdqa64 [ARG + _aesarg_IV + 16*8], ZIV08_11
+ vmovdqa64 [ARG + _aesarg_IV + 16*12], ZIV12_15
+%endmacro
+
+
+section .data
+;;;;;;;;;;;;;;;;;;
+; Transpose tables
+;;;;;;;;;;;;;;;;;;
+default rel
+
+align 64
+A0B0A1B1:
+ dq 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb
+
+align 64
+A2B2A3B3:
+ dq 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, 0xf
+
+
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_128_vaes_avx512(AES_ARGS *args, uint64_t len_in_bytes);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_enc_128_vaes_avx512,function,internal)
+aes_cbc_enc_128_vaes_avx512:
+ FUNC_SAVE
+ CBC_ENC 9
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_192_vaes_avx512(AES_ARGS *args, uint64_t len_in_bytes);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_enc_192_vaes_avx512,function,internal)
+aes_cbc_enc_192_vaes_avx512:
+ FUNC_SAVE
+ CBC_ENC 11
+ FUNC_RESTORE
+ ret
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_256_vaes_avx512(AES_ARGS *args, uint64_t len_in_bytes);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_enc_256_vaes_avx512,function,internal)
+aes_cbc_enc_256_vaes_avx512:
+ FUNC_SAVE
+ CBC_ENC 13
+ FUNC_RESTORE
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm
new file mode 100644
index 000000000..50ff86b6e
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm
@@ -0,0 +1,1524 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "mb_mgr_datastruct.asm"
+%include "job_aes_hmac.asm"
+%include "include/memcpy.asm"
+
+%include "include/aes_common.asm"
+%include "include/const.inc"
+
+section .data
+default rel
+
+align 16
+ONE:
+ dq 0x0000000000000001, 0x0000000000000000
+
+align 64
+SHUF_MASK:
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+align 64
+ddq_add_13_16:
+ dq 0x000000000000000d, 0x0000000000000000
+ dq 0x000000000000000e, 0x0000000000000000
+ dq 0x000000000000000f, 0x0000000000000000
+ dq 0x0000000000000010, 0x0000000000000000
+
+align 64
+ddq_add_9_12:
+ dq 0x0000000000000009, 0x0000000000000000
+ dq 0x000000000000000a, 0x0000000000000000
+ dq 0x000000000000000b, 0x0000000000000000
+ dq 0x000000000000000c, 0x0000000000000000
+
+align 64
+ddq_add_5_8:
+ dq 0x0000000000000005, 0x0000000000000000
+ dq 0x0000000000000006, 0x0000000000000000
+ dq 0x0000000000000007, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+
+align 64
+ddq_add_1_4:
+ dq 0x0000000000000001, 0x0000000000000000
+ dq 0x0000000000000002, 0x0000000000000000
+ dq 0x0000000000000003, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+
+align 64
+ddq_add_12_15:
+ dq 0x000000000000000c, 0x0000000000000000
+ dq 0x000000000000000d, 0x0000000000000000
+ dq 0x000000000000000e, 0x0000000000000000
+ dq 0x000000000000000f, 0x0000000000000000
+
+align 64
+ddq_add_8_11:
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000009, 0x0000000000000000
+ dq 0x000000000000000a, 0x0000000000000000
+ dq 0x000000000000000b, 0x0000000000000000
+
+align 64
+ddq_add_4_7:
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000005, 0x0000000000000000
+ dq 0x0000000000000006, 0x0000000000000000
+ dq 0x0000000000000007, 0x0000000000000000
+
+align 64
+ddq_add_0_3:
+ dq 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000001, 0x0000000000000000
+ dq 0x0000000000000002, 0x0000000000000000
+ dq 0x0000000000000003, 0x0000000000000000
+
+align 64
+ddq_add_16:
+ dq 0x0000000000000010, 0x0000000000000000
+ dq 0x0000000000000010, 0x0000000000000000
+ dq 0x0000000000000010, 0x0000000000000000
+ dq 0x0000000000000010, 0x0000000000000000
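
For orientation, the ddq_add_* constants above are per-lane counter offsets: the current counter block is broadcast to all four 128-bit lanes and the offsets are added to obtain consecutive counter blocks, with ddq_add_16 advancing a whole 16-block batch. A minimal sketch (function name illustrative; mirrors the vpaddd used on the CNTR path, counter-carry handling not shown):

    #include <immintrin.h>

    static __m512i make_ctr_x4(__m128i ctr, __m512i offsets /* e.g. ddq_add_0_3 */)
    {
        return _mm512_add_epi32(_mm512_broadcast_i64x2(ctr), offsets);
    }
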
+
+align 64
+byte64_len_to_mask_table:
+ dq 0x0000000000000000, 0x0000000000000001
+ dq 0x0000000000000003, 0x0000000000000007
+ dq 0x000000000000000f, 0x000000000000001f
+ dq 0x000000000000003f, 0x000000000000007f
+ dq 0x00000000000000ff, 0x00000000000001ff
+ dq 0x00000000000003ff, 0x00000000000007ff
+ dq 0x0000000000000fff, 0x0000000000001fff
+ dq 0x0000000000003fff, 0x0000000000007fff
+ dq 0x000000000000ffff, 0x000000000001ffff
+ dq 0x000000000003ffff, 0x000000000007ffff
+ dq 0x00000000000fffff, 0x00000000001fffff
+ dq 0x00000000003fffff, 0x00000000007fffff
+ dq 0x0000000000ffffff, 0x0000000001ffffff
+ dq 0x0000000003ffffff, 0x0000000007ffffff
+ dq 0x000000000fffffff, 0x000000001fffffff
+ dq 0x000000003fffffff, 0x000000007fffffff
+ dq 0x00000000ffffffff, 0x00000001ffffffff
+ dq 0x00000003ffffffff, 0x00000007ffffffff
+ dq 0x0000000fffffffff, 0x0000001fffffffff
+ dq 0x0000003fffffffff, 0x0000007fffffffff
+ dq 0x000000ffffffffff, 0x000001ffffffffff
+ dq 0x000003ffffffffff, 0x000007ffffffffff
+ dq 0x00000fffffffffff, 0x00001fffffffffff
+ dq 0x00003fffffffffff, 0x00007fffffffffff
+ dq 0x0000ffffffffffff, 0x0001ffffffffffff
+ dq 0x0003ffffffffffff, 0x0007ffffffffffff
+ dq 0x000fffffffffffff, 0x001fffffffffffff
+ dq 0x003fffffffffffff, 0x007fffffffffffff
+ dq 0x00ffffffffffffff, 0x01ffffffffffffff
+ dq 0x03ffffffffffffff, 0x07ffffffffffffff
+ dq 0x0fffffffffffffff, 0x1fffffffffffffff
+ dq 0x3fffffffffffffff, 0x7fffffffffffffff
+ dq 0xffffffffffffffff
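
The table above simply encodes, for each length N from 0 to 64, a 64-bit store mask with the low N bits set, so that a masked vmovdqu8 touches exactly N bytes of a 64-byte block. In C terms (helper name illustrative):

    #include <stdint.h>

    static uint64_t byte64_len_to_mask(unsigned len /* 0-64 */)
    {
        return (len >= 64) ? ~0ULL : ((1ULL << len) - 1);
    }
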
+
+align 16
+initial_12_IV_counter:
+ dq 0x0000000000000000, 0x0100000000000000
+
+mask_16_bytes:
+ dq 0x000000000000ffff
+
+section .text
+default rel
+
+%ifdef LINUX
+%define arg1 rdi
+%else
+%define arg1 rcx
+%endif
+
+%define ZKEY0 zmm17
+%define ZKEY1 zmm18
+%define ZKEY2 zmm19
+%define ZKEY3 zmm20
+%define ZKEY4 zmm21
+%define ZKEY5 zmm22
+%define ZKEY6 zmm23
+%define ZKEY7 zmm24
+%define ZKEY8 zmm25
+%define ZKEY9 zmm26
+%define ZKEY10 zmm27
+%define ZKEY11 zmm28
+%define ZKEY12 zmm29
+%define ZKEY13 zmm30
+%define ZKEY14 zmm31
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Stack frame definition
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, win64
+ %define GP_STORAGE (7*8) ; space for 7 GP registers
+%else
+ %define GP_STORAGE (5*8) ; space for 5 GP registers
+%endif
+
+%define STACK_FRAME_SIZE GP_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro is used to maintain the bits from the output text
+;;; when writing out the output blocks, in case there are some bits
+;;; that do not require encryption
+%macro PRESERVE_BITS 12-13
+%define %%RBITS %1 ; [in] Remaining bits in last byte
+%define %%LENGTH %2 ; [in] Length of the last set of blocks
+%define %%CYPH_PLAIN_OUT %3 ; [in] Pointer to output buffer
+%define %%ZIN_OUT %4 ; [in/out] ZMM with last set of output blocks
+%define %%ZTMP0 %5 ; [clobbered] ZMM temporary
+%define %%ZTMP1 %6 ; [clobbered] ZMM temporary
+%define %%ZTMP2 %7 ; [clobbered] ZMM temporary
+%define %%IA0 %8 ; [clobbered] GP temporary
+%define %%IA1 %9 ; [clobbered] GP temporary
+%define %%blocks_to_skip %10 ; [in] Number of blocks to skip from output
+%define %%FULL_PARTIAL %11 ; [in] Last block type selection "full" or "partial"
+%define %%MASKREG %12 ; [clobbered] Mask register
+%define %%DATA_OFFSET %13 ; [in/out] Data offset
+%define %%NUM_ARGS %0
+
+;; offset = number of sets of 4 blocks to skip
+%assign offset (((%%blocks_to_skip) / 4) * 64)
+;; num_left_blocks = number of blocks in the last set
+%assign num_left_blocks (((%%blocks_to_skip) & 3) + 1) ;; Range 1-4 blocks
+
+%if %%NUM_ARGS == 13
+ ;; Load output to get last partial byte
+%ifidn %%FULL_PARTIAL, partial
+ vmovdqu8 %%ZTMP0{%%MASKREG}, [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + offset]
+%else
+ vmovdqu8 %%ZTMP0, [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + offset]
+%endif ; %%FULL_PARTIAL == partial
+%else
+ ;; Load output to get last partial byte (loading up to the last 4 blocks)
+ ZMM_LOAD_MASKED_BLOCKS_0_16 num_left_blocks, %%CYPH_PLAIN_OUT, offset, \
+ %%ZTMP0, no_zmm, no_zmm, no_zmm, %%MASKREG
+%endif ;; %%NUM_ARGS == 13
+
+ ;; Save RCX in temporary GP register
+ mov %%IA0, rcx
+ mov DWORD(%%IA1), 0xff
+ mov cl, BYTE(%%RBITS)
+ shr DWORD(%%IA1), cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, %%IA0
+
+ vmovq XWORD(%%ZTMP1), %%IA1
+
+        ;; Get the number of full bytes in the last block.
+        ;; Subtracting the bytes in the blocks to skip from the length of the
+        ;; whole set of blocks gives the number of bytes in the last block,
+        ;; but the last block ends with a partial byte, so an extra byte
+        ;; needs to be subtracted
+ mov %%IA1, %%LENGTH
+ sub %%IA1, (%%blocks_to_skip * 16 + 1)
+ XVPSLLB XWORD(%%ZTMP1), %%IA1, XWORD(%%ZTMP2), %%IA0
+%if num_left_blocks == 4
+ vshufi64x2 %%ZTMP1, %%ZTMP1, %%ZTMP1, 0x15
+%elif num_left_blocks == 3
+ vshufi64x2 %%ZTMP1, %%ZTMP1, %%ZTMP1, 0x45
+%elif num_left_blocks == 2
+ vshufi64x2 %%ZTMP1, %%ZTMP1, %%ZTMP1, 0x51
+%endif ;; No need to shift if there is only one block
+
+ ;; At this point, ZTMP1 contains a mask with all 0s, but with some ones
+ ;; in the partial byte
+
+ ;; First, clear the last bits (not to be ciphered) of the last output block
+ ;; %%ZIN_OUT = %%ZIN_OUT AND NOT %%ZTMP1 (0x50 = andA!C)
+ vpternlogq %%ZIN_OUT, %%ZTMP1, %%ZTMP1, 0x50
+
+ ;; Then, set these last bits to the last bits coming from the output
+ ;; %%ZIN_OUT = %%ZIN_OUT OR (%%ZTMP0 AND %%ZTMP1) (0xF8 = orAandBC)
+ vpternlogq %%ZIN_OUT, %%ZTMP0, %%ZTMP1, 0xF8
+
+%endmacro
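
In scalar terms, the bit merge performed on the partial byte by the two vpternlogq operations above is the following (the rbits convention follows the "3 remaining bits -> mask = 00011111" comment; function name illustrative):

    #include <stdint.h>

    /* result = (ciphered & ~mask) | (existing & mask), where mask marks the
     * bits kept from the data already in the output buffer; imm8 0x50
     * (A AND NOT C) then 0xF8 (A OR (B AND C)) compute the same thing on a
     * full ZMM register. */
    static uint8_t preserve_partial_byte(uint8_t ciphered, uint8_t existing,
                                         unsigned rbits /* 1-7 */)
    {
        uint8_t mask = (uint8_t)(0xffu >> rbits);
        return (uint8_t)((ciphered & (uint8_t)~mask) | (existing & mask));
    }
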
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro is used to "warm-up" pipeline for ENCRYPT_16_PARALLEL
+;;; macro code. It is called only for data lengths 256 and above.
+;;; The flow is as follows:
+;;; - encrypt the initial %%num_initial_blocks blocks (can be 0)
+;;; - encrypt the next 16 blocks
+;;; - the last 16th block can be partial (lengths between 257 and 367)
+;;; - partial block ciphering is handled within this macro
+
+%macro INITIAL_BLOCKS 26
+%define %%KEY %1 ; [in] pointer to key
+%define %%CYPH_PLAIN_OUT %2 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] input buffer
+%define %%LENGTH %4 ; [in/out] number of bytes to process
+%define %%DATA_OFFSET %5 ; [in/out] data offset
+%define %%num_initial_blocks %6 ; [in] can be between 0 and 15
+%define %%CTR %7 ; [in] XMM first counter block
+%define %%CTR_1_4 %8 ; [out] ZMM next 1-4 counter blocks
+%define %%CTR_5_8 %9 ; [out] ZMM next 5-8 counter blocks
+%define %%CTR_9_12 %10 ; [out] ZMM next 9-12 counter blocks
+%define %%CTR_13_16 %11 ; [out] ZMM next 13-16 counter blocks
+%define %%ZT1 %12 ; [clobbered] ZMM temporary
+%define %%ZT2 %13 ; [clobbered] ZMM temporary
+%define %%ZT3 %14 ; [clobbered] ZMM temporary
+%define %%ZT4 %15 ; [clobbered] ZMM temporary
+%define %%ZT5 %16 ; [clobbered] ZMM temporary
+%define %%ZT6 %17 ; [clobbered] ZMM temporary
+%define %%ZT7 %18 ; [clobbered] ZMM temporary
+%define %%ZT8 %19 ; [clobbered] ZMM temporary
+%define %%IA0 %20 ; [clobbered] GP temporary
+%define %%IA1 %21 ; [clobbered] GP temporary
+%define %%MASKREG %22 ; [clobbered] mask register
+%define %%SHUFREG %23 ; [in] ZMM register with shuffle mask
+%define %%NROUNDS %24 ; [in] number of rounds; numerical value
+%define %%CNTR_TYPE %25 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+%define %%RBITS %26 ; [in] Number of remaining bits in last byte
+
+%define %%T1 XWORD(%%ZT1)
+%define %%T2 XWORD(%%ZT2)
+%define %%T3 XWORD(%%ZT3)
+%define %%T4 XWORD(%%ZT4)
+%define %%T5 XWORD(%%ZT5)
+%define %%T6 XWORD(%%ZT6)
+%define %%T7 XWORD(%%ZT7)
+%define %%T8 XWORD(%%ZT8)
+
+%ifidn %%CNTR_TYPE, CNTR
+%define %%VPADD vpaddd
+%else
+%define %%VPADD vpaddq
+%endif
+
+%if %%num_initial_blocks > 0
+ ;; load plain/cipher text
+ ZMM_LOAD_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, 0, \
+ %%ZT5, %%ZT6, %%ZT7, %%ZT8, load_4_instead_of_3
+
+ ;; prepare AES counter blocks
+%if %%num_initial_blocks > 1
+%if %%num_initial_blocks == 2
+ vshufi64x2 YWORD(%%ZT1), YWORD(%%CTR), YWORD(%%CTR), 0
+ %%VPADD YWORD(%%ZT1), YWORD(%%ZT1), [rel ddq_add_0_3]
+%elif %%num_initial_blocks <= 4
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ %%VPADD %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+%elif %%num_initial_blocks <= 8
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ %%VPADD %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+ %%VPADD %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7]
+%elif %%num_initial_blocks <= 12
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ %%VPADD %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+ %%VPADD %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7]
+ %%VPADD %%ZT3, ZWORD(%%CTR), [rel ddq_add_8_11]
+%else
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ %%VPADD %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+ %%VPADD %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7]
+ %%VPADD %%ZT3, ZWORD(%%CTR), [rel ddq_add_8_11]
+ %%VPADD %%ZT4, ZWORD(%%CTR), [rel ddq_add_12_15]
+%endif
+%endif
+
+ ;; extract new counter value (%%T1)
+ ;; shuffle the counters for AES rounds
+%if %%num_initial_blocks == 1
+ vpshufb %%T1, %%CTR, XWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 2
+ vextracti32x4 %%CTR, YWORD(%%ZT1), 1
+ vpshufb YWORD(%%ZT1), YWORD(%%SHUFREG)
+%elif %%num_initial_blocks <= 4
+ vextracti32x4 %%CTR, %%ZT1, (%%num_initial_blocks - 1)
+ vpshufb %%ZT1, %%SHUFREG
+%elif %%num_initial_blocks == 5
+ vmovdqa64 %%CTR, %%T2
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%T2, XWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 6
+ vextracti32x4 %%CTR, YWORD(%%ZT2), 1
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb YWORD(%%ZT2), YWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 7
+ vextracti32x4 %%CTR, %%ZT2, 2
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+%elif %%num_initial_blocks == 8
+ vextracti32x4 %%CTR, %%ZT2, 3
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+%elif %%num_initial_blocks == 9
+ vmovdqa64 %%CTR, %%T3
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb %%T3, XWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 10
+ vextracti32x4 %%CTR, YWORD(%%ZT3), 1
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb YWORD(%%ZT3), YWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 11
+ vextracti32x4 %%CTR, %%ZT3, 2
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb %%ZT3, %%SHUFREG
+%elif %%num_initial_blocks == 12
+ vextracti32x4 %%CTR, %%ZT3, 3
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb %%ZT3, %%SHUFREG
+%elif %%num_initial_blocks == 13
+ vmovdqa64 %%CTR, %%T4
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb %%ZT3, %%SHUFREG
+ vpshufb %%T4, XWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 14
+ vextracti32x4 %%CTR, YWORD(%%ZT4), 1
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb %%ZT3, %%SHUFREG
+ vpshufb YWORD(%%ZT4), YWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 15
+ vextracti32x4 %%CTR, %%ZT4, 2
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb %%ZT3, %%SHUFREG
+ vpshufb %%ZT4, %%SHUFREG
+%endif
+
+ ;; AES rounds and XOR with plain/cipher text
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESENC_ROUND_BLOCKS_0_16 \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, ZKEY %+ j, j, \
+ %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%num_initial_blocks, \
+ %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; write cipher/plain text back to output
+ ZMM_STORE_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, 0, \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4
+
+ ;; adjust data offset and length
+ sub %%LENGTH, (%%num_initial_blocks * 16)
+ add %%DATA_OFFSET, (%%num_initial_blocks * 16)
+%endif ; %%num_initial_blocks > 0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; - cipher of %%num_initial_blocks is done
+ ;; - prepare counter blocks for the next 16 blocks (ZT5-ZT8)
+ ;; - shuffle the blocks for AES
+ ;; - encrypt the next 16 blocks
+
+ ;; get text load/store mask (assume full mask by default)
+ mov %%IA0, 0xffff_ffff_ffff_ffff
+%if %%num_initial_blocks > 0
+ ;; NOTE: 'jge' is always taken for %%num_initial_blocks = 0
+ ;; This macro is executed for length 256 and up,
+ ;; zero length is checked in CNTR_ENC_DEC.
+ ;; We know there is partial block if:
+ ;; LENGTH - 16*num_initial_blocks < 256
+ cmp %%LENGTH, 256
+ jge %%_initial_partial_block_continue
+ mov %%IA1, rcx
+ mov rcx, 256
+ sub rcx, %%LENGTH
+ shr %%IA0, cl
+ mov rcx, %%IA1
+%%_initial_partial_block_continue:
+%endif
+ kmovq %%MASKREG, %%IA0
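+ ;; the mask enables one byte per bit for the last 64-byte load/store below;
+ ;; e.g. with 252 bytes left it is shifted right by 4, so only 60 bytes of
+ ;; the fourth 64-byte chunk are read and written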
+ ;; load plain or cipher text
+ vmovdqu8 %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vmovdqu8 %%ZT6, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+ vmovdqu8 %%ZT7, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 128]
+ vmovdqu8 %%ZT8{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 192]
+
+ ;; prepare next counter blocks
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+%if %%num_initial_blocks > 0
+ vpaddd %%CTR_1_4, ZWORD(%%CTR), [rel ddq_add_1_4]
+ vpaddd %%CTR_5_8, ZWORD(%%CTR), [rel ddq_add_5_8]
+ vpaddd %%CTR_9_12, ZWORD(%%CTR), [rel ddq_add_9_12]
+ vpaddd %%CTR_13_16, ZWORD(%%CTR), [rel ddq_add_13_16]
+%else
+ vpaddd %%CTR_1_4, ZWORD(%%CTR), [rel ddq_add_0_3]
+ vpaddd %%CTR_5_8, ZWORD(%%CTR), [rel ddq_add_4_7]
+ vpaddd %%CTR_9_12, ZWORD(%%CTR), [rel ddq_add_8_11]
+ vpaddd %%CTR_13_16, ZWORD(%%CTR), [rel ddq_add_12_15]
+%endif
+
+ vpshufb %%ZT1, %%CTR_1_4, %%SHUFREG
+ vpshufb %%ZT2, %%CTR_5_8, %%SHUFREG
+ vpshufb %%ZT3, %%CTR_9_12, %%SHUFREG
+ vpshufb %%ZT4, %%CTR_13_16, %%SHUFREG
+
+ ;; AES rounds and XOR with plain/cipher text
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESENC_ROUND_BLOCKS_0_16 \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, ZKEY %+ j, j, \
+ %%ZT5, %%ZT6, %%ZT7, %%ZT8, 16, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ ;; check if this is the end of the message
+ cmp %%LENGTH, 256
+ jg %%store_output
+ ;; Check if there is a partial byte
+ or %%RBITS, %%RBITS
+ jz %%store_output
+
+ ;; Copy the bits that are not ciphered from the output text,
+ ;; into the last bits of the output block, before writing it out
+ PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%IA0, %%IA1, 15, partial, %%MASKREG, %%DATA_OFFSET
+
+%endif
+
+%%store_output:
+ ;; write cipher/plain text back to output
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 128], %%ZT3
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 192]{%%MASKREG}, %%ZT4
+
+ ;; check if there is partial block
+ cmp %%LENGTH, 256
+ jl %%_initial_partial_done
+ ;; adjust offset and length
+ add %%DATA_OFFSET, 256
+ sub %%LENGTH, 256
+ jmp %%_initial_blocks_done
+%%_initial_partial_done:
+ ;; zero the length (all encryption is complete)
+ xor %%LENGTH, %%LENGTH
+%%_initial_blocks_done:
+
+%endmacro ; INITIAL_BLOCKS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
+;;; It may look similar to INITIAL_BLOCKS but its usage is different:
+;;; - It does not prepare counter blocks for the main by-16 loop;
+;;;   it just ciphers the requested number of blocks.
+;;; - Small packets (<256 bytes)
+;;;
+;;; num_initial_blocks is expected to include the partial final block
+;;; in the count.
+%macro INITIAL_BLOCKS_PARTIAL 21
+%define %%KEY %1 ; [in] key pointer
+%define %%CYPH_PLAIN_OUT %2 ; [in] text out pointer
+%define %%PLAIN_CYPH_IN %3 ; [in] text in pointer
+%define %%LENGTH %4 ; [in/clobbered] length in bytes
+%define %%num_initial_blocks %5 ; [in] can be from 1 to 16 (not 0)
+%define %%CTR %6 ; [in/out] current counter value
+%define %%ZT1 %7 ; [clobbered] ZMM temporary
+%define %%ZT2 %8 ; [clobbered] ZMM temporary
+%define %%ZT3 %9 ; [clobbered] ZMM temporary
+%define %%ZT4 %10 ; [clobbered] ZMM temporary
+%define %%ZT5 %11 ; [clobbered] ZMM temporary
+%define %%ZT6 %12 ; [clobbered] ZMM temporary
+%define %%ZT7 %13 ; [clobbered] ZMM temporary
+%define %%ZT8 %14 ; [clobbered] ZMM temporary
+%define %%IA0 %15 ; [clobbered] GP temporary
+%define %%IA1 %16 ; [clobbered] GP temporary
+%define %%MASKREG %17 ; [clobbered] mask register
+%define %%SHUFREG %18 ; [in] ZMM register with shuffle mask
+%define %%NROUNDS %19 ; [in] number of rounds; numerical value
+%define %%CNTR_TYPE %20 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+%define %%RBITS %21 ; [in] Number of remaining bits in last byte
+
+%define %%T1 XWORD(%%ZT1)
+%define %%T2 XWORD(%%ZT2)
+%define %%T3 XWORD(%%ZT3)
+%define %%T4 XWORD(%%ZT4)
+%define %%T5 XWORD(%%ZT5)
+%define %%T6 XWORD(%%ZT6)
+%define %%T7 XWORD(%%ZT7)
+%define %%T8 XWORD(%%ZT8)
+
+ ;; get load/store mask
+ lea %%IA0, [rel byte64_len_to_mask_table]
+ mov %%IA1, %%LENGTH
+%if %%num_initial_blocks > 12
+ sub %%IA1, 192
+%elif %%num_initial_blocks > 8
+ sub %%IA1, 128
+%elif %%num_initial_blocks > 4
+ sub %%IA1, 64
+%endif
+ kmovq %%MASKREG, [%%IA0 + %%IA1*8]
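+ ;; the mask covers the tail bytes of the last ZMM register in use; e.g. for
+ ;; %%num_initial_blocks = 6 and %%LENGTH = 90, %%IA1 = 90 - 64 = 26, which
+ ;; selects a 26 byte mask for the second (partially filled) register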
+
+ ;; load plain/cipher text
+ ZMM_LOAD_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, 0, \
+ %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%MASKREG
+
+ ;; prepare AES counter blocks
+%if %%num_initial_blocks == 1
+ vmovdqa64 XWORD(%%ZT1), XWORD(%%CTR)
+%elif %%num_initial_blocks == 2
+ vshufi64x2 YWORD(%%ZT1), YWORD(%%CTR), YWORD(%%CTR), 0
+ vpaddd YWORD(%%ZT1), YWORD(%%ZT1), [rel ddq_add_0_3]
+%elif %%num_initial_blocks <= 4
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+%elif %%num_initial_blocks <= 8
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+ vpaddd %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7]
+%elif %%num_initial_blocks <= 12
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+ vpaddd %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7]
+ vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_8_11]
+%else
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+ vpaddd %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7]
+ vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_8_11]
+ vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_12_15]
+%endif
+
+ ;; shuffle the counters for AES rounds
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, \
+ %%SHUFREG, %%SHUFREG, %%SHUFREG, %%SHUFREG
+
+ ;; AES rounds and XOR with plain/cipher text
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESENC_ROUND_BLOCKS_0_16 \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, ZKEY %+ j, j, \
+ %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%num_initial_blocks, \
+ %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ ;; Check if there is a partial byte
+ or %%RBITS, %%RBITS
+ jz %%store_output
+
+ ;; Copy the bits that are not ciphered from the output text,
+ ;; into the last bits of the output block, before writing it out
+%if %%num_initial_blocks <= 4
+ PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT1, %%ZT5, %%ZT6, %%ZT7, \
+ %%IA0, %%IA1, (%%num_initial_blocks - 1), \
+ partial, %%MASKREG
+%elif %%num_initial_blocks <= 8
+ PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT2, %%ZT5, %%ZT6, %%ZT7, \
+ %%IA0, %%IA1, (%%num_initial_blocks - 1), \
+ partial, %%MASKREG
+%elif %%num_initial_blocks <= 12
+ PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT3, %%ZT5, %%ZT6, %%ZT7, \
+ %%IA0, %%IA1, (%%num_initial_blocks - 1), \
+ partial, %%MASKREG
+%else
+ PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%IA0, %%IA1, (%%num_initial_blocks - 1), \
+ partial, %%MASKREG
+%endif
+
+%endif
+
+%%store_output:
+ ;; write cipher/plain text back to output
+ ZMM_STORE_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, 0, \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%MASKREG
+
+%endmacro ; INITIAL_BLOCKS_PARTIAL
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main CNTR macro
+;;; - operates on single stream
+;;; - encrypts 16 blocks at a time
+%macro ENCRYPT_16_PARALLEL 26
+%define %%KEY %1 ; [in] key pointer
+%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%CTR_1_4 %5 ; [in/out] ZMM next 1-4 counter blocks
+%define %%CTR_5_8 %6 ; [in/out] ZMM next 5-8 counter blocks
+%define %%CTR_9_12 %7 ; [in/out] ZMM next 9-12 counter blocks
+%define %%CTR_13_16 %8 ; [in/out] ZMM next 13-16 counter blocks
+%define %%FULL_PARTIAL %9 ; [in] last block type selection "full" or "partial"
+%define %%IA0 %10 ; [clobbered] temporary GP register
+%define %%IA1 %11 ; [clobbered] temporary GP register
+%define %%LENGTH %12 ; [in] length
+%define %%ZT1 %13 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT2 %14 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT3 %15 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT4 %16 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT5 %17 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT6 %18 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT7 %19 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT8 %20 ; [clobbered] temporary ZMM (cipher)
+%define %%MASKREG %21 ; [clobbered] mask register for partial loads/stores
+%define %%SHUFREG %22 ; [in] ZMM register with shuffle mask
+%define %%ADD8REG %23 ; [in] ZMM register with increment-by-16 counter value
+%define %%NROUNDS %24 ; [in] number of rounds; numerical value
+%define %%CNTR_TYPE %25 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+%define %%RBITS %26 ; [in] Number of remaining bits in last byte
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; load/store mask (partial case) and load the text data
+%ifidn %%FULL_PARTIAL, full
+ vmovdqu8 %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vmovdqu8 %%ZT6, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+ vmovdqu8 %%ZT7, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 128]
+ vmovdqu8 %%ZT8, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 192]
+%else
+ lea %%IA0, [rel byte64_len_to_mask_table]
+ mov %%IA1, %%LENGTH
+ sub %%IA1, (3*64)
+ kmovq %%MASKREG, [%%IA0 + 8*%%IA1]
+ vmovdqu8 %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vmovdqu8 %%ZT6, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+ vmovdqu8 %%ZT7, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 128]
+ vmovdqu8 %%ZT8{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 192]
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; populate counter blocks
+ ;; %%CTR is shuffled outside the scope of this macro
+ ;; it has to be kept in unshuffled form
+ vpaddd %%CTR_1_4, %%CTR_1_4, %%ADD8REG
+ vpaddd %%CTR_5_8, %%CTR_5_8, %%ADD8REG
+ vpaddd %%CTR_9_12, %%CTR_9_12, %%ADD8REG
+ vpaddd %%CTR_13_16, %%CTR_13_16, %%ADD8REG
+ vpshufb %%ZT1, %%CTR_1_4, %%SHUFREG
+ vpshufb %%ZT2, %%CTR_5_8, %%SHUFREG
+ vpshufb %%ZT3, %%CTR_9_12, %%SHUFREG
+ vpshufb %%ZT4, %%CTR_13_16, %%SHUFREG
+
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESENC_ROUND_BLOCKS_0_16 \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, ZKEY %+ j, j, \
+ %%ZT5, %%ZT6, %%ZT7, %%ZT8, 16, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ ;; Check if this is the last round
+ cmp %%LENGTH, 256
+ jg %%store_output
+ ;; Check if there is a partial byte
+ or %%RBITS, %%RBITS
+ jz %%store_output
+
+ ;; Copy the bits that are not ciphered from the output text,
+ ;; into the last bits of the output block, before writing it out
+ PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%IA0, %%IA1, 15, %%FULL_PARTIAL, %%MASKREG, %%DATA_OFFSET
+
+%endif
+
+%%store_output:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; store the text data
+%ifidn %%FULL_PARTIAL, full
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 128], %%ZT3
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 192], %%ZT4
+%else
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 128], %%ZT3
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 192]{%%MASKREG}, %%ZT4
+%endif
+
+%endmacro ; ENCRYPT_16_PARALLEL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Save register content for the caller
+%macro FUNC_SAVE 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+ mov rax, rsp
+
+ sub rsp, STACK_FRAME_SIZE
+ and rsp, ~63
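+ ;; RSP is now aligned to a 64-byte boundary; the original RSP was captured
+ ;; in RAX above and is preserved on the stack below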
+
+ mov [rsp + 0*8], r12
+ mov [rsp + 1*8], r13
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ mov [rsp + 2*8], r14
+%endif
+ mov [rsp + 3*8], rax ; stack
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 4*8], rdi
+ mov [rsp + 5*8], rsi
+%endif
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Restore register content for the caller
+%macro FUNC_RESTORE 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+
+ vzeroupper
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [rsp + 4*8]
+ mov rsi, [rsp + 5*8]
+%endif
+ mov r12, [rsp + 0*8]
+ mov r13, [rsp + 1*8]
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ mov r14, [rsp + 2*8]
+%endif
+ mov rsp, [rsp + 3*8] ; stack
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Cipher payloads shorter than 256 bytes
+;;; - the number of blocks in the message is passed as an argument
+;;; - depending on the number of blocks an optimized variant of
+;;; INITIAL_BLOCKS_PARTIAL is invoked
+%macro CNTR_ENC_DEC_SMALL 21
+%define %%KEY %1 ; [in] key pointer
+%define %%CYPH_PLAIN_OUT %2 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] input buffer
+%define %%LENGTH %4 ; [in] data length
+%define %%NUM_BLOCKS %5 ; [in] number of blocks to process 1 to 16
+%define %%CTR %6 ; [in/out] XMM counter block
+%define %%ZTMP1 %7 ; [clobbered] ZMM register
+%define %%ZTMP2 %8 ; [clobbered] ZMM register
+%define %%ZTMP3 %9 ; [clobbered] ZMM register
+%define %%ZTMP4 %10 ; [clobbered] ZMM register
+%define %%ZTMP5 %11 ; [clobbered] ZMM register
+%define %%ZTMP6 %12 ; [clobbered] ZMM register
+%define %%ZTMP7 %13 ; [clobbered] ZMM register
+%define %%ZTMP8 %14 ; [clobbered] ZMM register
+%define %%IA0 %15 ; [clobbered] GP register
+%define %%IA1 %16 ; [clobbered] GP register
+%define %%MASKREG %17 ; [clobbered] mask register
+%define %%SHUFREG %18 ; [in] ZMM register with shuffle mask
+%define %%NROUNDS %19 ; [in] number of rounds; numerical value
+%define %%CNTR_TYPE %20 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+%define %%RBITS %21 ; [in] Number of remaining bits in last byte
+
+ cmp %%NUM_BLOCKS, 8
+ je %%_small_initial_num_blocks_is_8
+ jl %%_small_initial_blocks_is_1_7
+
+ ; Initial blocks 9-16
+ cmp %%NUM_BLOCKS, 12
+ je %%_small_initial_num_blocks_is_12
+ jl %%_small_initial_blocks_is_9_11
+
+ ; Initial blocks 13-16
+ cmp %%NUM_BLOCKS, 16
+ je %%_small_initial_num_blocks_is_16
+ cmp %%NUM_BLOCKS, 15
+ je %%_small_initial_num_blocks_is_15
+ cmp %%NUM_BLOCKS, 14
+ je %%_small_initial_num_blocks_is_14
+ cmp %%NUM_BLOCKS, 13
+ je %%_small_initial_num_blocks_is_13
+
+%%_small_initial_blocks_is_9_11:
+ cmp %%NUM_BLOCKS, 11
+ je %%_small_initial_num_blocks_is_11
+ cmp %%NUM_BLOCKS, 10
+ je %%_small_initial_num_blocks_is_10
+ cmp %%NUM_BLOCKS, 9
+ je %%_small_initial_num_blocks_is_9
+
+%%_small_initial_blocks_is_1_7:
+ cmp %%NUM_BLOCKS, 4
+ je %%_small_initial_num_blocks_is_4
+ jl %%_small_initial_blocks_is_1_3
+
+ ; Initial blocks 5-7
+ cmp %%NUM_BLOCKS, 7
+ je %%_small_initial_num_blocks_is_7
+ cmp %%NUM_BLOCKS, 6
+ je %%_small_initial_num_blocks_is_6
+ cmp %%NUM_BLOCKS, 5
+ je %%_small_initial_num_blocks_is_5
+
+%%_small_initial_blocks_is_1_3:
+ cmp %%NUM_BLOCKS, 3
+ je %%_small_initial_num_blocks_is_3
+ cmp %%NUM_BLOCKS, 2
+ je %%_small_initial_num_blocks_is_2
+
+ jmp %%_small_initial_num_blocks_is_1
+
+
+%%_small_initial_num_blocks_is_16:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 16, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_15:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 15, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_14:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 14, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_13:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 13, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_12:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 12, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_11:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 11, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_10:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 10, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_9:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 9, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+%%_small_initial_num_blocks_is_8:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 8, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_7:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 7, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_6:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 6, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_5:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 5, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_4:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 4, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_3:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 3, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_2:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 2, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_1:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 1, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+%%_small_initial_blocks_encrypted:
+
+%endmacro ; CNTR_ENC_DEC_SMALL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CNTR_ENC_DEC Encrypts/Decrypts given data.
+; Requires the input data to be at least 1 byte long because of READ_SMALL_INPUT_DATA.
+; Input: job structure and number of AES rounds
+; Output: job structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CNTR_ENC_DEC 3
+%define %%JOB %1 ; [in/out] job
+%define %%NROUNDS %2 ; [in] number of rounds; numerical value
+%define %%CNTR_TYPE %3 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+
+%define %%KEY rax
+%define %%CYPH_PLAIN_OUT rdx
+%define %%PLAIN_CYPH_IN r8
+%define %%LENGTH r9
+%define %%DATA_OFFSET r13
+%define %%RBITS r14
+
+%define %%IA0 r10
+%define %%IA1 r11
+%define %%IA2 r12
+
+%define %%CTR_BLOCKx xmm0
+%define %%CTR_BLOCK_1_4 zmm1
+%define %%CTR_BLOCK_5_8 zmm2
+%define %%CTR_BLOCK_9_12 zmm3
+%define %%CTR_BLOCK_13_16 zmm4
+
+%define %%ZTMP0 zmm5
+%define %%ZTMP1 zmm6
+%define %%ZTMP2 zmm7
+%define %%ZTMP3 zmm8
+%define %%ZTMP4 zmm9
+%define %%ZTMP5 zmm10
+%define %%ZTMP6 zmm11
+%define %%ZTMP7 zmm12
+%define %%SHUFREG zmm13
+%define %%ADD8REG zmm14
+
+%define %%MASKREG k1
+
+;;; Macro flow:
+;;; - calculate the number of 16-byte blocks in the message
+;;; - process (number of 16-byte blocks) mod 16 in '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+;;; - process 16 x 16-byte blocks at a time until all are done in %%_encrypt_by_16
+
+ mov %%LENGTH, [%%JOB + _msg_len_to_cipher]
+ ;; calculate len
+ ;; convert bits to bytes (message length in bits for CNTR_BIT)
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ mov %%RBITS, %%LENGTH
+ add %%LENGTH, 7
+ shr %%LENGTH, 3 ; LENGTH will hold number of bytes (including partial byte)
+ and %%RBITS, 7 ; Get remainder bits in last byte (0-7)
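+ ;; e.g. a 27-bit message gives LENGTH = (27 + 7) >> 3 = 4 bytes and
+ ;; RBITS = 27 & 7 = 3 remainder bits in the last byte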
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ cmp %%LENGTH, 0
+%else
+ or %%LENGTH, %%LENGTH
+%endif
+ je %%_enc_dec_done
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+
+ mov %%PLAIN_CYPH_IN, [%%JOB + _src]
+ add %%PLAIN_CYPH_IN, [%%JOB + _cipher_start_src_offset_in_bytes]
+ mov %%CYPH_PLAIN_OUT, [%%JOB + _dst]
+ mov %%KEY, [%%JOB + _aes_enc_key_expanded]
+
+ ;; Prepare round keys (broadcast all NROUNDS + 2 of them into ZKEY registers)
+%assign i 0
+%rep (%%NROUNDS + 2)
+ vbroadcastf64x2 ZKEY %+ i, [%%KEY + 16*i]
+%assign i (i + 1)
+%endrep
+
+ mov %%IA1, [%%JOB + _iv]
+%ifidn %%CNTR_TYPE, CNTR
+ ;; Prepare initial mask to read 12 IV bytes
+ mov %%IA0, 0x0000_0000_0000_0fff
+ vmovdqa %%CTR_BLOCKx, [rel initial_12_IV_counter]
+ mov %%IA2, [%%JOB + _iv_len_in_bytes]
+ test %%IA2, 16
+ ;; Set mask to read 16 IV bytes if iv_len = 16
+ cmovnz %%IA0, [rel mask_16_bytes]
+
+ kmovq %%MASKREG, %%IA0
+ vmovdqu8 %%CTR_BLOCKx{%%MASKREG}, [%%IA1]
+%else ;; CNTR_BIT
+ ;; Read the full 16 bytes of IV
+ vmovdqu8 %%CTR_BLOCKx, [%%IA1]
+%endif ;; CNTR/CNTR_BIT
+
+ vmovdqa64 %%SHUFREG, [rel SHUF_MASK]
+ ;; store IV as counter in LE format
+ vpshufb %%CTR_BLOCKx, XWORD(%%SHUFREG)
+
+ ;; Determine how many blocks to process in INITIAL
+ mov %%IA1, %%LENGTH
+ shr %%IA1, 4
+ and %%IA1, 0xf
+
+ ;; Process one additional block in INITIAL if there is a partial block
+ mov %%IA0, %%LENGTH
+ and %%IA0, 0xf
+ add %%IA0, 0xf
+ shr %%IA0, 4
+ add %%IA1, %%IA0
+ ;; %%IA1 can be in the range from 0 to 16
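+ ;; e.g. LENGTH = 300: (300 >> 4) & 0xf = 2, plus 1 for the final
+ ;; 12 byte partial block, gives %%IA1 = 3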
+
+ ;; Less than 256B will be handled by the small message code, which
+ ;; can process up to 16 blocks (16 bytes each)
+ cmp %%LENGTH, 256
+ jge %%_large_message_path
+
+ CNTR_ENC_DEC_SMALL \
+ %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, \
+ %%IA1, %%CTR_BLOCKx, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%IA0, %%IA2, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+
+ jmp %%_enc_dec_done
+
+%%_large_message_path:
+ ;; Still, don't allow 16 INITIAL blocks since this
+ ;; can be handled by the x16 partial loop.
+ and %%IA1, 0xf
+ je %%_initial_num_blocks_is_0
+ cmp %%IA1, 15
+ je %%_initial_num_blocks_is_15
+ cmp %%IA1, 14
+ je %%_initial_num_blocks_is_14
+ cmp %%IA1, 13
+ je %%_initial_num_blocks_is_13
+ cmp %%IA1, 12
+ je %%_initial_num_blocks_is_12
+ cmp %%IA1, 11
+ je %%_initial_num_blocks_is_11
+ cmp %%IA1, 10
+ je %%_initial_num_blocks_is_10
+ cmp %%IA1, 9
+ je %%_initial_num_blocks_is_9
+ cmp %%IA1, 8
+ je %%_initial_num_blocks_is_8
+ cmp %%IA1, 7
+ je %%_initial_num_blocks_is_7
+ cmp %%IA1, 6
+ je %%_initial_num_blocks_is_6
+ cmp %%IA1, 5
+ je %%_initial_num_blocks_is_5
+ cmp %%IA1, 4
+ je %%_initial_num_blocks_is_4
+ cmp %%IA1, 3
+ je %%_initial_num_blocks_is_3
+ cmp %%IA1, 2
+ je %%_initial_num_blocks_is_2
+ jmp %%_initial_num_blocks_is_1
+
+ and %%IA1, 0xf
+ je %%_initial_num_blocks_is_0
+
+ cmp %%IA1, 8
+ je %%_initial_num_blocks_is_8
+ jl %%_initial_blocks_is_1_7
+
+ ; Initial blocks 9-15
+ cmp %%IA1, 12
+ je %%_initial_num_blocks_is_12
+ jl %%_initial_blocks_is_9_11
+
+ ; Initial blocks 13-15
+ cmp %%IA1, 15
+ je %%_initial_num_blocks_is_15
+ cmp %%IA1, 14
+ je %%_initial_num_blocks_is_14
+ cmp %%IA1, 13
+ je %%_initial_num_blocks_is_13
+
+%%_initial_blocks_is_9_11:
+ cmp %%IA1, 11
+ je %%_initial_num_blocks_is_11
+ cmp %%IA1, 10
+ je %%_initial_num_blocks_is_10
+ cmp %%IA1, 9
+ je %%_initial_num_blocks_is_9
+
+%%_initial_blocks_is_1_7:
+ cmp %%IA1, 4
+ je %%_initial_num_blocks_is_4
+ jl %%_initial_blocks_is_1_3
+
+ ; Initial blocks 5-7
+ cmp %%IA1, 7
+ je %%_initial_num_blocks_is_7
+ cmp %%IA1, 6
+ je %%_initial_num_blocks_is_6
+ cmp %%IA1, 5
+ je %%_initial_num_blocks_is_5
+
+%%_initial_blocks_is_1_3:
+ cmp %%IA1, 3
+ je %%_initial_num_blocks_is_3
+ cmp %%IA1, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_15:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 15, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_14:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 14, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_13:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 13, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_12:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 12, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_11:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 11, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_10:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 10, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_9:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 9, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_8:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 8, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 7, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 6, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 5, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 4, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 3, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 2, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 1, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 0, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+
+%%_initial_blocks_encrypted:
+ or %%LENGTH, %%LENGTH
+ je %%_enc_dec_done
+
+ vmovdqa64 %%ADD8REG, [rel ddq_add_16]
+ ;; Process 15 full blocks plus a partial block
+ cmp %%LENGTH, 256
+ jl %%_encrypt_by_16_partial
+
+%%_encrypt_by_16:
+ ENCRYPT_16_PARALLEL %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%DATA_OFFSET, %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, \
+ %%CTR_BLOCK_9_12, %%CTR_BLOCK_13_16, \
+ full, %%IA0, %%IA1, %%LENGTH, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%MASKREG, %%SHUFREG, %%ADD8REG, %%NROUNDS, %%CNTR_TYPE, \
+ %%RBITS
+ add %%DATA_OFFSET, 256
+ sub %%LENGTH, 256
+ cmp %%LENGTH, 256
+ jge %%_encrypt_by_16
+
+%%_encrypt_by_16_done:
+ ;; Check if a final by-16 iteration with a partial block is needed.
+ ;; At this point the remaining byte count is either zero or between 241 and 255.
+ or %%LENGTH, %%LENGTH
+ je %%_enc_dec_done
+
+%%_encrypt_by_16_partial:
+
+ ENCRYPT_16_PARALLEL %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%DATA_OFFSET, %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, \
+ %%CTR_BLOCK_9_12, %%CTR_BLOCK_13_16, \
+ partial, %%IA0, %%IA1, %%LENGTH, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%MASKREG, %%SHUFREG, %%ADD8REG, %%NROUNDS, %%CNTR_TYPE, \
+ %%RBITS
+
+%%_enc_dec_done:
+
+%endmacro ; CNTR_ENC_DEC
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_cntr_128_submit_vaes_avx512 (JOB_AES_HMAC *job)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cntr_128_submit_vaes_avx512,function,internal)
+aes_cntr_128_submit_vaes_avx512:
+ FUNC_SAVE CNTR
+ ;; arg1 - [in] job
+ ;; arg2 - [in] NROUNDS
+ ;; arg3 - [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+ CNTR_ENC_DEC arg1, 9, CNTR
+ FUNC_RESTORE CNTR
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_cntr_192_submit_vaes_avx512 (JOB_AES_HMAC *job)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cntr_192_submit_vaes_avx512,function,internal)
+aes_cntr_192_submit_vaes_avx512:
+ FUNC_SAVE CNTR
+ ;; arg1 - [in] job
+ ;; arg2 - [in] NROUNDS
+ ;; arg3 - [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+ CNTR_ENC_DEC arg1, 11, CNTR
+ FUNC_RESTORE CNTR
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_cntr_256_submit_vaes_avx512 (JOB_AES_HMAC *job)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cntr_256_submit_vaes_avx512,function,internal)
+aes_cntr_256_submit_vaes_avx512:
+ FUNC_SAVE CNTR
+ ;; arg1 - [in] job
+ ;; arg2 - [in] NROUNDS
+ ;; arg3 - [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+ CNTR_ENC_DEC arg1, 13, CNTR
+ FUNC_RESTORE CNTR
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_cntr_bit_128_submit_vaes_avx512 (JOB_AES_HMAC *job)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cntr_bit_128_submit_vaes_avx512,function,internal)
+aes_cntr_bit_128_submit_vaes_avx512:
+ FUNC_SAVE CNTR_BIT
+ ;; arg1 - [in] job
+ ;; arg2 - [in] NROUNDS
+ ;; arg3 - [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+ CNTR_ENC_DEC arg1, 9, CNTR_BIT
+ FUNC_RESTORE CNTR_BIT
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_cntr_bit_192_submit_vaes_avx512 (JOB_AES_HMAC *job)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cntr_bit_192_submit_vaes_avx512,function,internal)
+aes_cntr_bit_192_submit_vaes_avx512:
+ FUNC_SAVE CNTR_BIT
+ ;; arg1 - [in] job
+ ;; arg2 - [in] NROUNDS
+ ;; arg3 - [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+ CNTR_ENC_DEC arg1, 11, CNTR_BIT
+ FUNC_RESTORE CNTR_BIT
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_cntr_bit_256_submit_vaes_avx512 (JOB_AES_HMAC *job)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cntr_bit_256_submit_vaes_avx512,function,internal)
+aes_cntr_bit_256_submit_vaes_avx512:
+ FUNC_SAVE CNTR_BIT
+ ;; arg1 - [in] job
+ ;; arg2 - [in] NROUNDS
+ ;; arg3 - [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+ CNTR_ENC_DEC arg1, 13, CNTR_BIT
+ FUNC_RESTORE CNTR_BIT
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/des_x16_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/des_x16_avx512.asm
new file mode 100644
index 000000000..656752941
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/des_x16_avx512.asm
@@ -0,0 +1,2382 @@
+;;
+;; Copyright (c) 2017-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Authors:
+;; Shay Gueron (1, 2), Regev Shemy (2), Tomasz kantecki (2)
+;; (1) University of Haifa, Israel
+;; (2) Intel Corporation
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX R8 R9 R10 R11
+;; Windows preserves: RBX RCX RDX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX R10 R11
+;; Linux preserves: RBX RBP RSI RDI R8 R9 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31 and K1 to K7
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "mb_mgr_datastruct.asm"
+%include "constants.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%endif
+
+%define STATE arg1
+%define SIZE arg2
+
+%define OFFSET rax
+
+%define IA0 arg3
+%define IA1 arg4
+%define IA2 r10
+
+%define INP0 r11
+%define INP1 r12
+%define INP2 r13
+%define INP3 r14
+%define INP4 r15
+
+%define KSOFFSET r11
+
+%define ZW0 zmm0
+%define ZW1 zmm1
+%define ZW2 zmm2
+%define ZW3 zmm3
+%define ZW4 zmm4
+%define ZW5 zmm5
+%define ZW6 zmm6
+%define ZW7 zmm7
+%define ZW8 zmm8
+%define ZW9 zmm9
+%define ZW10 zmm10
+%define ZW11 zmm11
+%define ZW12 zmm12
+%define ZW13 zmm13
+%define ZW14 zmm14
+%define ZW15 zmm15
+
+%define ZIV0 zmm16
+%define ZIV1 zmm17
+
+%define ZTMP0 zmm18
+%define ZTMP1 zmm19
+%define ZTMP2 zmm20
+%define ZTMP3 zmm21
+%define ZTMP4 zmm22
+%define ZTMP5 zmm23
+%define ZTMP6 zmm24
+%define ZTMP7 zmm25
+%define ZTMP8 zmm26
+%define ZTMP9 zmm27
+%define ZTMP10 zmm28
+%define ZTMP11 zmm29
+%define ZTMP12 zmm30
+%define ZTMP13 zmm31
+
+struc STACKFRAME
+_key_sched: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048
+_key_sched2: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048
+_key_sched3: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048
+_tmp_iv: resq 16 ; 2 x 64 bytes
+_tmp_in: resq 16 ; 2 x 64 bytes
+_tmp_out: resq 16 ; 2 x 64 bytes
+_tmp_mask: resd 16 ; 1 x 64 bytes
+_gpr_save: resq 4 ; r12 to r15
+_rsp_save: resq 1
+_mask_save: resq 1
+_size_save: resq 1
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+;;; ===========================================================================
+;;; CLEAR TRANSPOSED KEY SCHEDULE (if SAFE_DATA is selected)
+;;; ===========================================================================
+%macro CLEAR_KEY_SCHEDULE 2
+%define %%ALG %1 ; [in] DES or 3DES
+%define %%ZT %2 ; [clobbered] temporary ZMM register
+
+%ifdef SAFE_DATA
+ vpxorq %%ZT, %%ZT
+%assign rep_num (2048 / 64)
+%ifidn %%ALG, 3DES
+%assign rep_num (rep_num * 3)
+%endif
+
+%assign offset 0
+%rep rep_num
+ vmovdqa64 [rsp + _key_sched + offset], %%ZT
+%assign offset (offset + 64)
+%endrep
+
+%endif ; SAFE_DATA
+
+%endmacro
+
+;;; ===========================================================================
+;;; PERMUTE
+;;; ===========================================================================
+;;; A [in/out] - zmm register
+;;; B [in/out] - zmm register
+;;; NSHIFT [in] - constant to shift words by
+;;; MASK [in] - zmm or m512 with mask
+;;; T0 [clobbered] - temporary zmm register
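+;;; The macro implements a delta swap: T0 = ((A >> NSHIFT) ^ B) & MASK selects
+;;; the bits to exchange, then B ^= T0 and A ^= (T0 << NSHIFT) exchange bits of
+;;; A (at positions MASK << NSHIFT) with bits of B (at positions MASK).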
+%macro PERMUTE 5
+%define %%A %1
+%define %%B %2
+%define %%NSHIFT %3
+%define %%MASK %4
+%define %%T0 %5
+
+ vpsrld %%T0, %%A, %%NSHIFT
+ vpxord %%T0, %%T0, %%B
+ vpandd %%T0, %%T0, %%MASK
+ vpxord %%B, %%B, %%T0
+ vpslld %%T0, %%T0, %%NSHIFT
+ vpxord %%A, %%A, %%T0
+%endmacro
+
+;;; ===========================================================================
+;;; INITIAL PERMUTATION
+;;; ===========================================================================
+;;; L [in/out] - zmm register
+;;; R [in/out] - zmm register
+;;; T0 [clobbered] - temporary zmm register
+%macro IP_Z 3
+%define %%L %1
+%define %%R %2
+%define %%T0 %3
+ PERMUTE %%R, %%L, 4, [rel init_perm_consts + 0*64], %%T0
+ PERMUTE %%L, %%R, 16, [rel init_perm_consts + 1*64], %%T0
+ PERMUTE %%R, %%L, 2, [rel init_perm_consts + 2*64], %%T0
+ PERMUTE %%L, %%R, 8, [rel init_perm_consts + 3*64], %%T0
+ PERMUTE %%R, %%L, 1, [rel init_perm_consts + 4*64], %%T0
+%endmacro
+
+;;; ===========================================================================
+;;; FINAL PERMUTATION
+;;; ===========================================================================
+;;; L [in/out] - zmm register
+;;; R [in/out] - zmm register
+;;; T0 [clobbered] - temporary zmm register
+%macro FP_Z 3
+%define %%L %1
+%define %%R %2
+%define %%T0 %3
+ PERMUTE %%L, %%R, 1, [rel init_perm_consts + 4*64], %%T0
+ PERMUTE %%R, %%L, 8, [rel init_perm_consts + 3*64], %%T0
+ PERMUTE %%L, %%R, 2, [rel init_perm_consts + 2*64], %%T0
+ PERMUTE %%R, %%L, 16, [rel init_perm_consts + 1*64], %%T0
+ PERMUTE %%L, %%R, 4, [rel init_perm_consts + 0*64], %%T0
+%endmacro
+
+;;; ===========================================================================
+;;; P PHASE
+;;; ===========================================================================
+;;; W0 [in/out] - zmm register
+;;; in: vector of 16 x 32bits from S phase
+;;;       out: the permuted input vector
+;;; T0-T3 [clobbered] - temporary zmm register
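+;;; The permutation is built as an OR-accumulation of rotated and masked
+;;; copies of the input: each vprord/vpandd pair moves one group of bits
+;;; into its permuted position and the vpord instructions merge the groups
+;;; into the final 32-bit result.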
+%macro P_PHASE 5
+%define %%W0 %1
+%define %%T0 %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+
+ vprord %%T0, %%W0, 3
+ vpandd %%T0, %%T0, [rel mask_values + 0*64]
+ vprord %%T1, %%W0, 5
+ vpandd %%T1, %%T1, [rel mask_values + 1*64]
+ vpord %%T0, %%T0, %%T1
+
+ vprord %%T1, %%W0, 24
+ vpandd %%T1, %%T1, [rel mask_values + 2*64]
+ vprord %%T2, %%W0, 26
+ vpandd %%T2, %%T2, [rel mask_values + 3*64]
+ vpord %%T1, %%T1, %%T2
+ vpord %%T0, %%T0, %%T1
+
+ vprord %%T1, %%W0, 15
+ vpandd %%T1, %%T1, [rel mask_values + 4*64]
+ vprord %%T2, %%W0, 17
+ vpandd %%T2, %%T2, [rel mask_values + 5*64]
+ vpord %%T1, %%T1, %%T2
+
+ vprord %%T2, %%W0, 6
+ vpandd %%T2, %%T2, [rel mask_values + 6*64]
+ vprord %%T3, %%W0, 21
+ vpandd %%T3, %%T3, [rel mask_values + 7*64]
+ vpord %%T2, %%T2, %%T3
+ vpord %%T1, %%T1, %%T2
+ vpord %%T0, %%T0, %%T1
+
+ vprord %%T1, %%W0, 12
+ vpandd %%T1, %%T1, [rel mask_values + 8*64]
+ vprord %%T2, %%W0, 14
+ vpandd %%T2, %%T2, [rel mask_values + 9*64]
+ vpord %%T1, %%T1, %%T2
+
+ vprord %%T2, %%W0, 4
+ vpandd %%T2, %%T2, [rel mask_values + 10*64]
+ vprord %%T3, %%W0, 11
+ vpandd %%T3, %%T3, [rel mask_values + 11*64]
+ vpord %%T2, %%T2, %%T3
+ vpord %%T1, %%T1, %%T2
+ vpord %%T0, %%T0, %%T1
+
+ vprord %%T1, %%W0, 16
+ vpandd %%T1, %%T1, [rel mask_values + 12*64]
+ vprord %%T2, %%W0, 22
+ vpandd %%T2, %%T2, [rel mask_values + 13*64]
+ vpord %%T1, %%T1, %%T2
+
+ vprord %%T2, %%W0, 19
+ vpandd %%T2, %%T2, [rel mask_values + 14*64]
+ vprord %%T3, %%W0, 10
+ vpandd %%T3, %%T3, [rel mask_values + 15*64]
+ vpord %%T2, %%T2, %%T3
+ vpord %%T1, %%T1, %%T2
+ vpord %%T0, %%T0, %%T1
+
+ vprord %%T1, %%W0, 9
+ vpandd %%T1, %%T1, [rel mask_values + 16*64]
+ vprord %%T2, %%W0, 13
+ vpandd %%T2, %%T2, [rel mask_values + 17*64]
+ vpord %%T1, %%T1, %%T2
+
+ vprord %%T2, %%W0, 25
+ vpandd %%T2, %%T2, [rel mask_values + 18*64]
+ vpord %%T1, %%T1, %%T2
+ vpord %%W0, %%T0, %%T1
+%endmacro
+
+;;; ===========================================================================
+;;; E PHASE
+;;; ===========================================================================
+;;;
+;;; Expands 16 x 32-bit words into 16 x 48-bit words
+;;; and XORs the result with the key schedule.
+;;; The output layout is arranged to serve directly as S phase input.
+;;;
+;;; in [in] - zmm register
+;;; out0a [out] - zmm register
+;;; out0b [out] - zmm register
+;;; out1a [out] - zmm register
+;;; out1b [out] - zmm register
+;;; k0 [in] - key schedule; zmm or m512
+;;; k1 [in] - key schedule; zmm or m512
+;;; t0-t1 [clobbered] - temporary zmm register
+%macro E_PHASE 9
+%define %%IN %1
+%define %%OUT0A %2
+%define %%OUT0B %3
+%define %%OUT1A %4
+%define %%OUT1B %5
+%define %%K0 %6
+%define %%K1 %7
+%define %%T0 %8
+%define %%T1 %9
+
+ vprord %%T0, %%IN, 31
+ vprord %%T1, %%IN, 3
+ vpshufb %%T0, %%T0, [rel idx_e]
+ vpshufb %%T1, %%T1, [rel idx_e]
+ vpunpcklbw %%OUT0A, %%T0, %%T1
+ vpunpckhbw %%OUT1A, %%T0, %%T1
+ vpxord %%OUT0A, %%OUT0A, %%K0
+ vpxord %%OUT1A, %%OUT1A, %%K1
+ vpandd %%OUT0B, %%OUT0A, [rel and_eu]
+ vpsrlw %%OUT0B, %%OUT0B, 8
+ vpandd %%OUT0A, %%OUT0A, [rel and_ed]
+ vpandd %%OUT1B, %%OUT1A, [rel and_eu]
+ vpsrlw %%OUT1B, %%OUT1B, 8
+ vpandd %%OUT1A, %%OUT1A, [rel and_ed]
+%endmacro
+
+;;; ===========================================================================
+;;; S-BOX
+;;; ===========================================================================
+;;;
+;;; NOTE: clobbers k1-k6 OpMask registers
+;;;
+;;; IN0A [in] - zmm register; output from E-phase
+;;; IN0B [in] - zmm register; output from E-phase
+;;; IN1A [in] - zmm register; output from E-phase
+;;; IN1B [in] - zmm register; output from E-phase
+;;; OUT   [out] - zmm register; S phase output
+;;; T0-T5 [clobbered] - temporary zmm register
+%macro S_PHASE 11
+%define %%IN0A %1
+%define %%IN0B %2
+%define %%IN1A %3
+%define %%IN1B %4
+%define %%OUT %5
+%define %%T0 %6
+%define %%T1 %7
+%define %%T2 %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+
+ vmovdqa64 %%T0, [rel reg_values16bit_7]
+ vpcmpuw k3, %%IN0A, %%T0, 2 ; 2 -> LE
+ vpcmpuw k4, %%IN0B, %%T0, 2 ; 2 -> LE
+ vpcmpuw k5, %%IN1A, %%T0, 2 ; 2 -> LE
+ vpcmpuw k6, %%IN1B, %%T0, 2 ; 2 -> LE
+
+ mov DWORD(IA0), 0x55555555
+ kmovd k1, DWORD(IA0)
+ mov DWORD(IA0), 0xaaaaaaaa
+ kmovd k2, DWORD(IA0)
+
+ vpermw %%T0{k1}{z}, %%IN0A, [rel S_box_flipped + 0*64]
+ vpermw %%T1{k1}{z}, %%IN0A, [rel S_box_flipped + 1*64]
+ vpermw %%T2{k2}{z}, %%IN0A, [rel S_box_flipped + 4*64]
+ vpermw %%T3{k2}{z}, %%IN0A, [rel S_box_flipped + 5*64]
+ vpxord %%T0, %%T0, %%T2
+ vpxord %%OUT, %%T1, %%T3
+ vmovdqu16 %%OUT{k3}, %%T0
+
+ vpermw %%T0{k1}{z}, %%IN0B, [rel S_box_flipped + 2*64]
+ vpermw %%T1{k1}{z}, %%IN0B, [rel S_box_flipped + 3*64]
+ vpermw %%T2{k2}{z}, %%IN0B, [rel S_box_flipped + 6*64]
+ vpermw %%T3{k2}{z}, %%IN0B, [rel S_box_flipped + 7*64]
+ vpxord %%T0, %%T0, %%T2
+ vpxord %%T3, %%T1, %%T3
+ vmovdqu16 %%T3{k4}, %%T0
+ vpsllw %%T3, %%T3, 4
+ vpxord %%OUT, %%OUT, %%T3
+
+ vpermw %%T0{k1}{z}, %%IN1A, [rel S_box_flipped + 8*64]
+ vpermw %%T1{k1}{z}, %%IN1A, [rel S_box_flipped + 9*64]
+ vpermw %%T2{k2}{z}, %%IN1A, [rel S_box_flipped + 12*64]
+ vpermw %%T3{k2}{z}, %%IN1A, [rel S_box_flipped + 13*64]
+ vpxord %%T0, %%T0, %%T2
+ vpxord %%T4, %%T1, %%T3
+ vmovdqu16 %%T4{k5}, %%T0
+
+ vpermw %%T0{k1}{z}, %%IN1B, [rel S_box_flipped + 10*64]
+ vpermw %%T1{k1}{z}, %%IN1B, [rel S_box_flipped + 11*64]
+ vpermw %%T2{k2}{z}, %%IN1B, [rel S_box_flipped + 14*64]
+ vpermw %%T3{k2}{z}, %%IN1B, [rel S_box_flipped + 15*64]
+ vpxord %%T0, %%T0, %%T2
+ vpxord %%T5, %%T1, %%T3
+ vmovdqu16 %%T5{k6}, %%T0
+ vpsllw %%T5, %%T5, 4
+
+ vpxord %%T4, %%T4, %%T5
+ vpsllw %%T4, %%T4, 8
+ vpxord %%OUT, %%OUT, %%T4
+ vpshufb %%OUT, %%OUT, [rel shuffle_reg]
+%endmacro
+
+;;; ===========================================================================
+;;; DES encryption/decryption round
+;;; ===========================================================================
+;;;
+;;; Clobbers k1-k6 OpMask registers
+;;;
+;;; ENC_DEC [in] - ENC for encryption, DEC for decryption
+;;; R [in/out] - zmm register; plain text in & cipher text out
+;;; L [in/out] - zmm register; plain text in & cipher text out
+;;; KS [in] - pointer to the key schedule
+;;; T0-T11 [clobbered] - temporary zmm register
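+;;; The 16 DES rounds are executed as 8 loop iterations of 2 rounds each;
+;;; KSOFFSET steps through the transposed key schedule by 4*64 bytes per
+;;; iteration (2 x 2 ZMM key words), forward for ENC and backward for DEC.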
+%macro DES_ENC_DEC 16
+%define %%ENC_DEC %1
+%define %%R %2
+%define %%L %3
+%define %%KS %4
+%define %%T0 %5
+%define %%T1 %6
+%define %%T2 %7
+%define %%T3 %8
+%define %%T4 %9
+%define %%T5 %10
+%define %%T6 %11
+%define %%T7 %12
+%define %%T8 %13
+%define %%T9 %14
+%define %%T10 %15
+%define %%T11 %16
+
+ IP_Z %%R, %%L, %%T0
+
+%ifidn %%ENC_DEC, ENC
+ ;; ENCRYPTION
+ xor KSOFFSET, KSOFFSET
+%%_des_enc_loop:
+ E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (0*64)], [%%KS + KSOFFSET + (1*64)], %%T6, %%T7
+ S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
+ P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
+ vpxord %%L, %%L, %%T0
+
+ E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (2*64)], [%%KS + KSOFFSET + (3*64)], %%T6, %%T7
+ S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
+ P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
+ vpxord %%R, %%R, %%T0
+
+ add KSOFFSET, (4*64)
+ cmp KSOFFSET, (8*(4*64))
+ jb %%_des_enc_loop
+
+%else
+ ;; DECRYPTION
+ mov KSOFFSET, (8*(4*64))
+%%_des_dec_loop:
+ E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (2*64)], [%%KS + KSOFFSET - (1*64)], %%T6, %%T7
+ S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
+ P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
+ vpxord %%L, %%L, %%T0
+
+ E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (4*64)], [%%KS + KSOFFSET - (3*64)], %%T6, %%T7
+ S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
+ P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
+ vpxord %%R, %%R, %%T0
+ sub KSOFFSET, (4*64)
+ jnz %%_des_dec_loop
+%endif ; DECRYPTION
+
+ FP_Z %%R, %%L, %%T0
+%endmacro
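+
+;; Key schedule walk (for reference): each loop iteration above performs two
+;; Feistel rounds and consumes 4 x 64 bytes of the transposed key schedule,
+;; so KSOFFSET advances in 256-byte steps from 0 up to 8*(4*64) = 2048 bytes,
+;; i.e. 8 iterations covering all 16 rounds. Decryption walks the same
+;; 2048 bytes in reverse order.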
+
+;;; ===========================================================================
+;;; DATA TRANSPOSITION AT DATA INPUT
+;;; ===========================================================================
+;;;
+;;; IN00 - IN15 [in/out]:
+;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data
+;;; out: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15
+;;; T0-T3 [clobbered] - temporary zmm registers
+;;; K0-K5 [clobbered] - temporary zmm registers
+;;; H0-H3 [clobbered] - temporary zmm registers
+%macro TRANSPOSE_IN 30
+%define %%IN00 %1 ; R0
+%define %%IN01 %2 ; L0
+%define %%IN02 %3 ; R1
+%define %%IN03 %4 ; L1
+%define %%IN04 %5 ; R2
+%define %%IN05 %6 ; L2
+%define %%IN06 %7 ; R3
+%define %%IN07 %8 ; L3
+%define %%IN08 %9 ; R4
+%define %%IN09 %10 ; L4
+%define %%IN10 %11 ; R5
+%define %%IN11 %12 ; L5
+%define %%IN12 %13 ; R6
+%define %%IN13 %14 ; L6
+%define %%IN14 %15 ; R7
+%define %%IN15 %16 ; L7
+%define %%T0 %17
+%define %%T1 %18
+%define %%T2 %19
+%define %%T3 %20
+%define %%K0 %21
+%define %%K1 %22
+%define %%K2 %23
+%define %%K3 %24
+%define %%K4 %25
+%define %%K5 %26
+%define %%H0 %27
+%define %%H1 %28
+%define %%H2 %29
+%define %%H3 %30
+
+ vpunpckldq %%K0, %%IN00, %%IN01
+ vpunpckhdq %%K1, %%IN00, %%IN01
+ vpunpckldq %%T0, %%IN02, %%IN03
+ vpunpckhdq %%T1, %%IN02, %%IN03
+
+ vpunpckldq %%IN00, %%IN04, %%IN05
+ vpunpckhdq %%IN01, %%IN04, %%IN05
+ vpunpckldq %%IN02, %%IN06, %%IN07
+ vpunpckhdq %%IN03, %%IN06, %%IN07
+
+ vpunpcklqdq %%K2, %%K0, %%T0
+ vpunpckhqdq %%T2, %%K0, %%T0
+ vpunpcklqdq %%K3, %%K1, %%T1
+ vpunpckhqdq %%T3, %%K1, %%T1
+
+ vpunpcklqdq %%K0, %%IN00, %%IN02
+ vpunpckhqdq %%K1, %%IN00, %%IN02
+ vpunpcklqdq %%T0, %%IN01, %%IN03
+ vpunpckhqdq %%T1, %%IN01, %%IN03
+
+ vpunpckldq %%K4, %%IN08, %%IN09
+ vpunpckhdq %%K5, %%IN08, %%IN09
+ vpunpckldq %%IN04, %%IN10, %%IN11
+ vpunpckhdq %%IN05, %%IN10, %%IN11
+ vpunpckldq %%IN06, %%IN12, %%IN13
+ vpunpckhdq %%IN07, %%IN12, %%IN13
+ vpunpckldq %%IN10, %%IN14, %%IN15
+ vpunpckhdq %%IN11, %%IN14, %%IN15
+
+ vpunpcklqdq %%IN12, %%K4, %%IN04
+ vpunpckhqdq %%IN13, %%K4, %%IN04
+ vpunpcklqdq %%IN14, %%K5, %%IN05
+ vpunpckhqdq %%IN15, %%K5, %%IN05
+ vpunpcklqdq %%IN00, %%IN06, %%IN10
+ vpunpckhqdq %%IN01, %%IN06, %%IN10
+ vpunpcklqdq %%IN02, %%IN07, %%IN11
+ vpunpckhqdq %%IN03, %%IN07, %%IN11
+
+ vshufi64x2 %%H0, %%K2, %%K0, 0x44
+ vshufi64x2 %%H1, %%K2, %%K0, 0xee
+ vshufi64x2 %%H2, %%IN12, %%IN00, 0x44
+ vshufi64x2 %%H3, %%IN12, %%IN00, 0xee
+ vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0
+ vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2
+ vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4
+ vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6
+
+ vshufi64x2 %%H0, %%T2, %%K1, 0x44
+ vshufi64x2 %%H1, %%T2, %%K1, 0xee
+ vshufi64x2 %%H2, %%IN13, %%IN01, 0x44
+ vshufi64x2 %%H3, %%IN13, %%IN01, 0xee
+ vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0
+ vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2
+ vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4
+ vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6
+
+ vshufi64x2 %%H0, %%K3, %%T0, 0x44
+ vshufi64x2 %%H1, %%K3, %%T0, 0xee
+ vshufi64x2 %%H2, %%IN14, %%IN02, 0x44
+ vshufi64x2 %%H3, %%IN14, %%IN02, 0xee
+ vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1
+ vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3
+ vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5
+ vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7
+
+ vshufi64x2 %%H0, %%T3, %%T1, 0x44
+ vshufi64x2 %%H1, %%T3, %%T1, 0xee
+ vshufi64x2 %%H2, %%IN15, %%IN03, 0x44
+ vshufi64x2 %%H3, %%IN15, %%IN03, 0xee
+ vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1
+ vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3
+ vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5
+ vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7
+%endmacro
+
+;;; ===========================================================================
+;;; DATA TRANSPOSITION AT DATA OUTPUT
+;;; ===========================================================================
+;;;
+;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]:
+;;; in: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15
+;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data
+;;; T0-T3 [clobbered] - temporary zmm registers
+;;; K0-K5 [clobbered] - temporary zmm registers
+;;; H0-H3 [clobbered] - temporary zmm registers
+%macro TRANSPOSE_OUT 30
+%define %%IN00 %1 ; R0
+%define %%IN01 %2 ; L0
+%define %%IN02 %3 ; R1
+%define %%IN03 %4 ; L1
+%define %%IN04 %5 ; R2
+%define %%IN05 %6 ; L2
+%define %%IN06 %7 ; R3
+%define %%IN07 %8 ; L3
+%define %%IN08 %9 ; R4
+%define %%IN09 %10 ; L4
+%define %%IN10 %11 ; R5
+%define %%IN11 %12 ; L5
+%define %%IN12 %13 ; R6
+%define %%IN13 %14 ; L6
+%define %%IN14 %15 ; R7
+%define %%IN15 %16 ; L7
+%define %%T0 %17
+%define %%T1 %18
+%define %%T2 %19
+%define %%T3 %20
+%define %%K0 %21
+%define %%K1 %22
+%define %%K2 %23
+%define %%K3 %24
+%define %%K4 %25
+%define %%K5 %26
+%define %%H0 %27
+%define %%H1 %28
+%define %%H2 %29
+%define %%H3 %30
+
+ vpunpckldq %%K0, %%IN01, %%IN00
+ vpunpckhdq %%K1, %%IN01, %%IN00
+ vpunpckldq %%T0, %%IN03, %%IN02
+ vpunpckhdq %%T1, %%IN03, %%IN02
+
+ vpunpckldq %%IN00, %%IN05, %%IN04
+ vpunpckhdq %%IN01, %%IN05, %%IN04
+ vpunpckldq %%IN02, %%IN07, %%IN06
+ vpunpckhdq %%IN03, %%IN07, %%IN06
+
+ vpunpcklqdq %%K2, %%K0, %%T0
+ vpunpckhqdq %%T2, %%K0, %%T0
+ vpunpcklqdq %%K3, %%K1, %%T1
+ vpunpckhqdq %%T3, %%K1, %%T1
+
+ vpunpcklqdq %%K0, %%IN00, %%IN02
+ vpunpckhqdq %%K1, %%IN00, %%IN02
+ vpunpcklqdq %%T0, %%IN01, %%IN03
+ vpunpckhqdq %%T1, %%IN01, %%IN03
+
+ vpunpckldq %%K4, %%IN09, %%IN08
+ vpunpckhdq %%K5, %%IN09, %%IN08
+ vpunpckldq %%IN04, %%IN11, %%IN10
+ vpunpckhdq %%IN05, %%IN11, %%IN10
+ vpunpckldq %%IN06, %%IN13, %%IN12
+ vpunpckhdq %%IN07, %%IN13, %%IN12
+ vpunpckldq %%IN10, %%IN15, %%IN14
+ vpunpckhdq %%IN11, %%IN15, %%IN14
+
+ vpunpcklqdq %%IN12, %%K4, %%IN04
+ vpunpckhqdq %%IN13, %%K4, %%IN04
+ vpunpcklqdq %%IN14, %%K5, %%IN05
+ vpunpckhqdq %%IN15, %%K5, %%IN05
+ vpunpcklqdq %%IN00, %%IN06, %%IN10
+ vpunpckhqdq %%IN01, %%IN06, %%IN10
+ vpunpcklqdq %%IN02, %%IN07, %%IN11
+ vpunpckhqdq %%IN03, %%IN07, %%IN11
+
+ vshufi64x2 %%H0, %%K2, %%K0, 0x44
+ vshufi64x2 %%H1, %%K2, %%K0, 0xee
+ vshufi64x2 %%H2, %%IN12, %%IN00, 0x44
+ vshufi64x2 %%H3, %%IN12, %%IN00, 0xee
+ vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0
+ vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2
+ vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4
+ vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6
+
+ vshufi64x2 %%H0, %%T2, %%K1, 0x44
+ vshufi64x2 %%H1, %%T2, %%K1, 0xee
+ vshufi64x2 %%H2, %%IN13, %%IN01, 0x44
+ vshufi64x2 %%H3, %%IN13, %%IN01, 0xee
+ vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0
+ vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2
+ vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4
+ vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6
+
+ vshufi64x2 %%H0, %%K3, %%T0, 0x44
+ vshufi64x2 %%H1, %%K3, %%T0, 0xee
+ vshufi64x2 %%H2, %%IN14, %%IN02, 0x44
+ vshufi64x2 %%H3, %%IN14, %%IN02, 0xee
+ vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1
+ vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3
+ vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5
+ vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7
+
+ vshufi64x2 %%H0, %%T3, %%T1, 0x44
+ vshufi64x2 %%H1, %%T3, %%T1, 0xee
+ vshufi64x2 %%H2, %%IN15, %%IN03, 0x44
+ vshufi64x2 %%H3, %%IN15, %%IN03, 0xee
+ vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1
+ vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3
+ vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5
+ vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7
+%endmacro
+
+;;; ===========================================================================
+;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA INPUT
+;;; ===========================================================================
+;;;
+;;; IN00-IN15 / R0/L0-R7/L7 [in/out]:
+;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data
+;;; out: R0 - 16 x word0, L0 - 16 x word1
+;;; T0,T2 [clobbered] - temporary zmm registers
+;;; K0-K2,K4 [clobbered] - temporary zmm registers
+;;; H0,H2 [clobbered] - temporary zmm registers
+%macro TRANSPOSE_IN_ONE 24
+%define %%IN00 %1 ; R0
+%define %%IN01 %2 ; L0
+%define %%IN02 %3 ; R1
+%define %%IN03 %4 ; L1
+%define %%IN04 %5 ; R2
+%define %%IN05 %6 ; L2
+%define %%IN06 %7 ; R3
+%define %%IN07 %8 ; L3
+%define %%IN08 %9 ; R4
+%define %%IN09 %10 ; L4
+%define %%IN10 %11 ; R5
+%define %%IN11 %12 ; L5
+%define %%IN12 %13 ; R6
+%define %%IN13 %14 ; L6
+%define %%IN14 %15 ; R7
+%define %%IN15 %16 ; L7
+%define %%T0 %17
+%define %%T2 %18
+%define %%K0 %19
+%define %%K1 %20
+%define %%K2 %21
+%define %%K4 %22
+%define %%H0 %23
+%define %%H2 %24
+
+ vpunpckldq %%K0, %%IN00, %%IN01
+ vpunpckhdq %%K1, %%IN00, %%IN01
+ vpunpckldq %%T0, %%IN02, %%IN03
+
+ vpunpckldq %%IN00, %%IN04, %%IN05
+ vpunpckhdq %%IN01, %%IN04, %%IN05
+ vpunpckldq %%IN02, %%IN06, %%IN07
+
+ vpunpcklqdq %%K2, %%K0, %%T0
+ vpunpckhqdq %%T2, %%K0, %%T0
+
+ vpunpcklqdq %%K0, %%IN00, %%IN02
+ vpunpckhqdq %%K1, %%IN00, %%IN02
+
+ vpunpckldq %%K4, %%IN08, %%IN09
+ vpunpckldq %%IN04, %%IN10, %%IN11
+ vpunpckldq %%IN06, %%IN12, %%IN13
+ vpunpckldq %%IN10, %%IN14, %%IN15
+
+ vpunpcklqdq %%IN12, %%K4, %%IN04
+ vpunpckhqdq %%IN13, %%K4, %%IN04
+ vpunpcklqdq %%IN00, %%IN06, %%IN10
+ vpunpckhqdq %%IN01, %%IN06, %%IN10
+
+ vshufi64x2 %%H0, %%K2, %%K0, 0x44
+ vshufi64x2 %%H2, %%IN12, %%IN00, 0x44
+ vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0
+
+ vshufi64x2 %%H0, %%T2, %%K1, 0x44
+ vshufi64x2 %%H2, %%IN13, %%IN01, 0x44
+ vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0
+%endmacro
+
+;;; ===========================================================================
+;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA OUTPUT
+;;; ===========================================================================
+;;;
+;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]:
+;;; in: R0 - 16 x word0, L0 - 16 x word1
+;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data
+;;; T0,T2,T3 [clobbered] - temporary zmm registers
+;;; K0-K3 [clobbered] - temporary zmm registers
+;;; H0,H1 [clobbered] - temporary zmm registers
+%macro TRANSPOSE_OUT_ONE 25
+%define %%IN00 %1 ; R0
+%define %%IN01 %2 ; L0
+%define %%IN02 %3 ; R1
+%define %%IN03 %4 ; L1
+%define %%IN04 %5 ; R2
+%define %%IN05 %6 ; L2
+%define %%IN06 %7 ; R3
+%define %%IN07 %8 ; L3
+%define %%IN08 %9 ; R4
+%define %%IN09 %10 ; L4
+%define %%IN10 %11 ; R5
+%define %%IN11 %12 ; L5
+%define %%IN12 %13 ; R6
+%define %%IN13 %14 ; L6
+%define %%IN14 %15 ; R7
+%define %%IN15 %16 ; L7
+%define %%T0 %17
+%define %%T2 %18
+%define %%T3 %19
+%define %%K0 %20
+%define %%K1 %21
+%define %%K2 %22
+%define %%K3 %23
+%define %%H0 %24
+%define %%H1 %25
+
+ vpxord %%T0, %%T0, %%T0
+
+ vpunpckldq %%K0, %%IN01, %%IN00
+ vpunpckhdq %%K1, %%IN01, %%IN00
+
+ vpunpcklqdq %%K2, %%K0, %%T0
+ vpunpckhqdq %%T2, %%K0, %%T0
+ vpunpcklqdq %%K3, %%K1, %%T0
+ vpunpckhqdq %%T3, %%K1, %%T0
+
+ vshufi64x2 %%H0, %%K2, %%T0, 0x44
+ vshufi64x2 %%H1, %%K2, %%T0, 0xee
+ vshufi64x2 %%IN00, %%H0, %%T0, 0x88 ; R0
+ vshufi64x2 %%IN04, %%H0, %%T0, 0xdd ; R2
+ vshufi64x2 %%IN08, %%H1, %%T0, 0x88 ; R4
+ vshufi64x2 %%IN12, %%H1, %%T0, 0xdd ; R6
+
+ vshufi64x2 %%H0, %%T2, %%T0, 0x44
+ vshufi64x2 %%H1, %%T2, %%T0, 0xee
+ vshufi64x2 %%IN01, %%H0, %%T0, 0x88 ; L0
+ vshufi64x2 %%IN05, %%H0, %%T0, 0xdd ; L2
+ vshufi64x2 %%IN09, %%H1, %%T0, 0x88 ; L4
+ vshufi64x2 %%IN13, %%H1, %%T0, 0xdd ; L6
+
+ vshufi64x2 %%H0, %%K3, %%T0, 0x44
+ vshufi64x2 %%H1, %%K3, %%T0, 0xee
+ vshufi64x2 %%IN02, %%H0, %%T0, 0x88 ; R1
+ vshufi64x2 %%IN06, %%H0, %%T0, 0xdd ; R3
+ vshufi64x2 %%IN10, %%H1, %%T0, 0x88 ; R5
+ vshufi64x2 %%IN14, %%H1, %%T0, 0xdd ; R7
+
+ vshufi64x2 %%H0, %%T3, %%T0, 0x44
+ vshufi64x2 %%H1, %%T3, %%T0, 0xee
+ vshufi64x2 %%IN03, %%H0, %%T0, 0x88 ; L1
+ vshufi64x2 %%IN07, %%H0, %%T0, 0xdd ; L3
+ vshufi64x2 %%IN11, %%H1, %%T0, 0x88 ; L5
+ vshufi64x2 %%IN15, %%H1, %%T0, 0xdd ; L7
+%endmacro
+
+;;; ===========================================================================
+;;; DES INITIALIZATION
+;;; key schedule transposition and IV set up
+;;; ===========================================================================
+;;;
+;;; STATE_KEYS [in] - KEYS in DES OOO STATE
+;;; STATE_IV [in] - IV in DES OOO STATE
+;;; KS [out] - place to store transposed key schedule or NULL
+;;; IV0 [out] - r512; initialization vector
+;;; IV1 [out] - r512; initialization vector
+;;; T0-T27 [clobbered] - temporary r512 registers
+%macro DES_INIT 33
+%define %%STATE_KEYS %1
+%define %%STATE_IV %2
+%define %%KS %3
+%define %%IV0 %4
+%define %%IV1 %5
+%define %%T0 %6
+%define %%T1 %7
+%define %%T2 %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%T6 %12
+%define %%T7 %13
+%define %%T8 %14
+%define %%T9 %15
+%define %%T10 %16
+%define %%T11 %17
+%define %%T12 %18
+%define %%T13 %19
+%define %%T14 %20
+%define %%T15 %21
+%define %%T16 %22
+%define %%T17 %23
+%define %%T18 %24
+%define %%T19 %25
+%define %%T20 %26
+%define %%T21 %27
+%define %%T22 %28
+%define %%T23 %29
+%define %%T24 %30
+%define %%T25 %31
+%define %%T26 %32
+%define %%T27 %33
+
+ ;; set up the key schedule
+ ;; - load first half of the keys & transpose
+ ;; - transpose and store
+        ;; note: we can use IV registers as temporary ones here
+%assign IDX 0
+%rep 16
+ mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
+ vmovdqu64 %%T %+ IDX, [IA0]
+%assign IDX (IDX + 1)
+%endrep
+ TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
+%assign IDX 0
+%rep 16
+ vmovdqu64 [%%KS + (IDX * 64)], %%T %+ IDX
+%assign IDX (IDX + 1)
+%endrep
+ ;; - load second half of the keys & transpose
+ ;; - transpose and store
+        ;; note: we can use IV registers as temporary ones here
+%assign IDX 0
+%rep 16
+ mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
+ vmovdqu64 %%T %+ IDX, [IA0 + 64]
+%assign IDX (IDX + 1)
+%endrep
+ TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
+%assign IDX 0
+%rep 16
+ vmovdqu64 [%%KS + (16 * 64) + (IDX * 64)], %%T %+ IDX
+%assign IDX (IDX + 1)
+%endrep
+
+ ;; set up IV
+        ;; - the IVs are already kept transposed, so loading them directly is enough
+ vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)]
+ vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)]
+%endmacro
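+
+;; Layout note: the transposed schedule at KS occupies 32 x 64 bytes (2048 bytes);
+;; vectors 0-15 are the transposed first 64 bytes of all 16 lanes' key
+;; schedules and vectors 16-31 the transposed second 64 bytes, matching the
+;; 2048-byte KSOFFSET walk in DES_ENC_DEC.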
+
+;;; ===========================================================================
+;;; 3DES INITIALIZATION
+;;; key schedule transposition and IV set up
+;;; ===========================================================================
+;;;
+;;; STATE_KEYS [in] - KEYS in 3DES OOO STATE
+;;; STATE_IV [in] - IV in 3DES OOO STATE
+;;; KS1 [out] - place to store transposed key schedule or NULL
+;;; KS2 [out] - place to store transposed key schedule or NULL
+;;; KS3 [out] - place to store transposed key schedule or NULL
+;;; IV0 [out] - r512; initialization vector
+;;; IV1 [out] - r512; initialization vector
+;;; T0-T27 [clobbered] - temporary r512 registers
+;;; DIR [in] - ENC/DEC (keys arranged in different order for enc/dec)
+%macro DES3_INIT 36
+%define %%STATE_KEYS %1
+%define %%STATE_IV %2
+%define %%KS1 %3
+%define %%KS2 %4
+%define %%KS3 %5
+%define %%IV0 %6
+%define %%IV1 %7
+%define %%T0 %8
+%define %%T1 %9
+%define %%T2 %10
+%define %%T3 %11
+%define %%T4 %12
+%define %%T5 %13
+%define %%T6 %14
+%define %%T7 %15
+%define %%T8 %16
+%define %%T9 %17
+%define %%T10 %18
+%define %%T11 %19
+%define %%T12 %20
+%define %%T13 %21
+%define %%T14 %22
+%define %%T15 %23
+%define %%T16 %24
+%define %%T17 %25
+%define %%T18 %26
+%define %%T19 %27
+%define %%T20 %28
+%define %%T21 %29
+%define %%T22 %30
+%define %%T23 %31
+%define %%T24 %32
+%define %%T25 %33
+%define %%T26 %34
+%define %%T27 %35
+%define %%DIR %36
+
+%ifidn %%DIR, ENC
+%assign KEY_IDX 0
+%else
+%assign KEY_IDX 2
+%endif
+%assign KS_IDX 1
+
+%rep 3
+ ;; set up the key schedule
+ ;; - load first half of the keys & transpose
+ ;; - transpose and store
+        ;; note: we can use IV registers as temporary ones here
+
+%assign IDX 0
+%rep 16
+ mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
+ mov IA0, [IA0 + (KEY_IDX * PTR_SZ)]
+ vmovdqu64 %%T %+ IDX, [IA0]
+%assign IDX (IDX + 1)
+%endrep
+ TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
+%assign IDX 0
+%rep 16
+ vmovdqu64 [%%KS %+ KS_IDX + (IDX * 64)], %%T %+ IDX
+%assign IDX (IDX + 1)
+%endrep
+ ;; - load second half of the keys & transpose
+ ;; - transpose and store
+        ;; note: we can use IV registers as temporary ones here
+%assign IDX 0
+%rep 16
+ mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
+ mov IA0, [IA0 + (KEY_IDX * PTR_SZ)]
+ vmovdqu64 %%T %+ IDX, [IA0 + 64]
+%assign IDX (IDX + 1)
+%endrep
+ TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
+%assign IDX 0
+%rep 16
+ vmovdqu64 [%%KS %+ KS_IDX + (16 * 64) + (IDX * 64)], %%T %+ IDX
+%assign IDX (IDX + 1)
+%endrep
+
+%ifidn %%DIR, ENC
+%assign KEY_IDX (KEY_IDX + 1)
+%else
+%assign KEY_IDX (KEY_IDX - 1)
+%endif
+%assign KS_IDX (KS_IDX + 1)
+%endrep ; KEY_IDX / KS_IDX
+
+ ;; set up IV
+        ;; - the IVs are already kept transposed, so loading them directly is enough
+ vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)]
+ vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)]
+
+%endmacro
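+
+;; Key order note: KEY_IDX above selects per-lane key schedule pointers
+;; 0, 1, 2 for ENC and 2, 1, 0 for DEC, so KS1 always receives the schedule
+;; that the 3DES cipher macros apply first.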
+
+;;; ===========================================================================
+;;; DES FINISH
+;;; Update in/out pointers and store IV
+;;; ===========================================================================
+;;;
+;;; Needs: STATE & SIZE
+;;; IV0 [in] - r512; initialization vector
+;;; IV1 [in] - r512; initialization vector
+;;; T0-T4 [clobbered] - temporary r512 registers
+%macro DES_FINISH 7
+%define %%IV0 %1
+%define %%IV1 %2
+%define %%T0 %3
+%define %%T1 %4
+%define %%T2 %5
+%define %%T3 %6
+%define %%T4 %7
+
+ vpbroadcastq %%T4, SIZE
+ vmovdqu64 %%T0, [STATE + _des_args_in + (0 * PTR_SZ)]
+ vmovdqu64 %%T1, [STATE + _des_args_in + (8 * PTR_SZ)]
+ vmovdqu64 %%T2, [STATE + _des_args_out + (0 * PTR_SZ)]
+ vmovdqu64 %%T3, [STATE + _des_args_out + (8 * PTR_SZ)]
+ vpaddq %%T0, %%T0, %%T4
+ vpaddq %%T1, %%T1, %%T4
+ vpaddq %%T2, %%T2, %%T4
+ vpaddq %%T3, %%T3, %%T4
+ vmovdqu64 [STATE + _des_args_in + (0 * PTR_SZ)], %%T0
+ vmovdqu64 [STATE + _des_args_in + (8 * PTR_SZ)], %%T1
+ vmovdqu64 [STATE + _des_args_out + (0 * PTR_SZ)], %%T2
+ vmovdqu64 [STATE + _des_args_out + (8 * PTR_SZ)], %%T3
+
+ vmovdqu64 [STATE + _des_args_IV + (0 * 64)], %%IV0
+ vmovdqu64 [STATE + _des_args_IV + (1 * 64)], %%IV1
+%endmacro
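+
+;; Pointer update note: all 16 lanes' in/out pointers are advanced by SIZE
+;; bytes with a single broadcast add, and the transposed IV pair is written
+;; back to the OOO state.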
+
+;;; ===========================================================================
+;;; DES CFB ENCRYPT/DECRYPT - ONE BLOCK ONLY
+;;; ===========================================================================
+;;;
+;;; Needs: STATE, IA0-IA2
+;;; ENC_DEC [in] - encrypt (ENC) or decrypt (DEC) selection
+;;; KS [in] - key schedule
+;;; T0-T24 [clobbered] - temporary r512 registers
+;;; T_IN [in] - 16 * 8 byte storage
+;;; T_OUT [in] - 16 * 8 byte storage
+;;; T_IV [in] - 16 * 8 byte storage
+;;; T_MASK [in] - 16 * 4 byte storage
+;;;
+;;; NOTE: clobbers OpMask registers
+%macro DES_CFB_ONE 31
+%define %%ENC_DEC %1
+%define %%KS %2
+%define %%T0 %3
+%define %%T1 %4
+%define %%T2 %5
+%define %%T3 %6
+%define %%T4 %7
+%define %%T5 %8
+%define %%T6 %9
+%define %%T7 %10
+%define %%T8 %11
+%define %%T9 %12
+%define %%T10 %13
+%define %%T11 %14
+%define %%T12 %15
+%define %%T13 %16
+%define %%T14 %17
+%define %%T15 %18
+%define %%T16 %19
+%define %%T17 %20
+%define %%T18 %21
+%define %%T19 %22
+%define %%T20 %23
+%define %%T21 %24
+%define %%T22 %25
+%define %%T23 %26
+%define %%T24 %27
+%define %%T_IN %28
+%define %%T_OUT %29
+%define %%T_IV %30
+%define %%T_MASK %31
+
+ ;; - find mask for non-zero partial lengths
+ vpxord %%T10, %%T10, %%T10
+ vmovdqu64 %%T0, [STATE + _des_args_PLen]
+ vpcmpd k3, %%T0, %%T10, 4 ; NEQ
+ kmovw DWORD(IA0), k3
+ movzx DWORD(IA0), WORD(IA0)
+ or DWORD(IA0), DWORD(IA0)
+ jz %%_des_cfb_one_end ; no non-zero partial lengths
+
+%ifidn %%ENC_DEC, ENC
+        ;; For the encryption case we need to make sure that
+        ;; all full blocks are complete before proceeding
+        ;; with the CFB partial block.
+        ;; To do that, the current out position is compared
+        ;; against the calculated last full block position.
+ vmovdqu64 %%T1, [STATE + _des_args_out + (0*8)]
+ vmovdqu64 %%T2, [STATE + _des_args_LOut + (0*8)]
+ vmovdqu64 %%T3, [STATE + _des_args_out + (8*8)]
+ vmovdqu64 %%T4, [STATE + _des_args_LOut + (8*8)]
+ vpcmpq k4, %%T1, %%T2, 0 ; EQ
+ vpcmpq k5, %%T3, %%T4, 0 ; EQ
+ kmovw DWORD(IA1), k4
+ movzx DWORD(IA1), BYTE(IA1)
+ kmovw DWORD(IA2), k5
+ movzx DWORD(IA2), BYTE(IA2)
+ shl DWORD(IA2), 8
+ or DWORD(IA2), DWORD(IA1)
+ and DWORD(IA0), DWORD(IA2)
+ jz %%_des_cfb_one_end ; no non-zero lengths left
+ kmovw k3, DWORD(IA0)
+%endif
+ ;; Calculate ((1 << partial_bytes) - 1)
+ ;; in order to get the mask for loads and stores
+ ;; k3 & IA0 - hold valid mask
+ vmovdqa64 %%T1, [rel vec_ones_32b]
+ vpsllvd %%T2{k3}{z}, %%T1, %%T0
+ vpsubd %%T2{k3}{z}, %%T2, %%T1
+ vmovdqu64 [%%T_MASK], %%T2
+
+        ;; clear the selected partial lengths so they are not processed twice
+ vmovdqu32 [STATE + _des_args_PLen]{k3}, %%T10
+
+ ;; copy IV, in and out pointers
+ vmovdqu64 %%T1, [STATE + _des_args_in + (0*PTR_SZ)]
+ vmovdqu64 %%T2, [STATE + _des_args_in + (8*PTR_SZ)]
+ vmovdqu64 %%T3, [STATE + _des_args_out + (0*PTR_SZ)]
+ vmovdqu64 %%T4, [STATE + _des_args_out + (8*PTR_SZ)]
+ vmovdqu64 %%T5, [STATE + _des_args_IV + (0*64)]
+ vmovdqu64 %%T6, [STATE + _des_args_IV + (1*64)]
+ vmovdqu64 [%%T_IN + (0*PTR_SZ)], %%T1
+ vmovdqu64 [%%T_IN + (8*PTR_SZ)], %%T2
+ vmovdqu64 [%%T_OUT + (0*PTR_SZ)], %%T3
+ vmovdqu64 [%%T_OUT + (8*PTR_SZ)], %%T4
+ vmovdqu64 [%%T_IV + (0*64)], %%T5
+ vmovdqu64 [%%T_IV + (1*64)], %%T6
+
+ ;; calculate last block case mask
+ ;; - first block case requires no modifications to in/out/IV
+ vmovdqu64 %%T1, [STATE + _des_args_BLen]
+ vpcmpd k2, %%T1, %%T10, 4 ; NEQ
+ kmovw DWORD(IA1), k2
+ and DWORD(IA1), DWORD(IA0)
+ jz %%_des_cfb_one_no_last_blocks
+
+ ;; set up IV, in and out for the last block case
+ ;; - Last block needs in and out to be set differently (decryption only)
+ ;; - IA1 holds the last block mask
+%ifidn %%ENC_DEC, DEC
+ mov DWORD(IA0), DWORD(IA1)
+ mov DWORD(IA2), DWORD(IA1)
+ shr DWORD(IA1), 8
+ and DWORD(IA2), 0xff
+ kmovw k4, DWORD(IA2)
+ kmovw k5, DWORD(IA1)
+ vmovdqu64 %%T1, [STATE + _des_args_LOut + (0*PTR_SZ)]
+ vmovdqu64 %%T2, [STATE + _des_args_LOut + (8*PTR_SZ)]
+ vmovdqu64 %%T3, [STATE + _des_args_LIn + (0*PTR_SZ)]
+ vmovdqu64 %%T4, [STATE + _des_args_LIn + (8*PTR_SZ)]
+ vmovdqu64 [%%T_OUT + (0*PTR_SZ)]{k4}, %%T1
+ vmovdqu64 [%%T_OUT + (8*PTR_SZ)]{k5}, %%T2
+ vmovdqu64 [%%T_IN + (0*PTR_SZ)]{k4}, %%T3
+ vmovdqu64 [%%T_IN + (8*PTR_SZ)]{k5}, %%T4
+%endif ; decryption
+ ;; - IV has to be set differently for CFB as well
+ ;; - IA0 holds the last block mask
+%assign IDX 0
+%rep 16
+ test DWORD(IA0), (1 << IDX)
+ jz %%_des_cfb_one_copy_iv_next %+ IDX
+%ifidn %%ENC_DEC, ENC
+ mov IA2, [STATE + _des_args_LOut + (IDX*PTR_SZ)]
+%else
+ mov IA2, [STATE + _des_args_LIn + (IDX*PTR_SZ)]
+%endif
+ mov IA2, [IA2 - 8]
+ mov [%%T_IV + (0*4) + (IDX*4)], DWORD(IA2)
+ shr IA2, 32
+ mov [%%T_IV + (16*4) + (IDX*4)], DWORD(IA2)
+%%_des_cfb_one_copy_iv_next %+ IDX:
+%assign IDX (IDX + 1)
+%endrep
+
+%%_des_cfb_one_no_last_blocks:
+ ;; Uffff ... finally let's do some DES CFB
+ ;; - let's use T_IN, T_OUT, T_IV and T_MASK
+
+ ;; - load data with the corresponding masks & transpose
+ ;; - T0 to T15 will hold the data
+ xor IA0, IA0
+%assign IDX 0
+%assign K_IDX 1
+%rep 16
+ mov IA1, [%%T_IN + (IDX*PTR_SZ)]
+ mov DWORD(IA0), [%%T_MASK + (IDX*4)]
+ kmovq k %+ K_IDX, IA0
+ vmovdqu8 %%T %+ IDX{k %+ K_IDX}{z}, [IA1]
+%assign IDX (IDX + 1)
+%assign K_IDX (K_IDX + 1)
+%if K_IDX > 7
+%assign K_IDX 1 ; iterate through K1 to K7
+%endif
+%endrep
+ ;; - transpose the data in T0 to T15, T16 to T23 are clobbered
+ TRANSPOSE_IN_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23
+
+        ;; - set up IV in %%T16 & %%T17 (used as IV0 and IV1)
+ vmovdqu64 %%T16, [%%T_IV + (0 * 64)] ;IV0
+ vmovdqu64 %%T17, [%%T_IV + (1 * 64)] ;IV1
+ ;; DES encrypt
+ ;; - R0 - %%T0
+ ;; - L0 - %%T1
+ DES_ENC_DEC ENC, %%T16, %%T17, %%KS, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13
+ ;; CFB style xor with R0/L0 with IV
+ ;; - IV0 - %%T16
+ ;; - IV1 - %%T17
+ vpxord %%T2, %%T17, %%T0 ; R0 ^ IV1
+ vpxord %%T0, %%T16, %%T1 ; L0 ^ IV0
+ vmovdqa64 %%T1, %%T2
+ ;; - new R0 = L0 ^ IV0 (%%T0)
+ ;; - new L0 = R0 ^ IV1 (%%T1)
+
+ ;; Transpose the data out
+ ;; - %%T2 to %%T24 clobbered
+ TRANSPOSE_OUT_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24
+
+ ;; Store the transposed data
+        ;; - T0 to T15 hold the data to be stored
+ xor IA0, IA0
+%assign IDX 0
+%assign K_IDX 1
+%rep 16
+ mov IA1, [%%T_OUT + (IDX*PTR_SZ)]
+ mov DWORD(IA0), [%%T_MASK + (IDX*4)]
+ kmovq k %+ K_IDX, IA0
+ vmovdqu8 [IA1]{k %+ K_IDX}, %%T %+ IDX
+%assign IDX (IDX + 1)
+%assign K_IDX (K_IDX + 1)
+%if K_IDX > 7
+%assign K_IDX 1 ; iterate through K1 to K7
+%endif
+%endrep
+
+%ifdef SAFE_DATA
+        ;; Clear copied IVs
+ vpxorq %%T5, %%T5
+ vmovdqu64 [%%T_IV + (0*64)], %%T5
+ vmovdqu64 [%%T_IV + (1*64)], %%T5
+%endif
+
+%%_des_cfb_one_end:
+
+%endmacro
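+
+;; CFB note: for the single partial block both directions encrypt the IV and
+;; XOR it with the data (ciphertext = E(IV) ^ plaintext on encrypt,
+;; plaintext = E(IV) ^ ciphertext on decrypt), which is why DES_ENC_DEC is
+;; always invoked with ENC inside this macro.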
+
+;;; ===========================================================================
+;;; Converts length into mask of DES blocks
+;;; ===========================================================================
+;;;
+;;; MASK [out] - 8-bit block mask (r64) for masked 64-bit loads and stores
+;;; USES: IA0, IA1, IA2
+;;; ASSUMES: SIZE - OFFSET < 64
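+;;; Example: SIZE - OFFSET = 24 -> 24 >> 3 = 3 blocks -> MASK = (1 << 3) - 1 = 0x07,
+;;; so the masked 64-bit loads/stores touch qwords 0-2 of each lane's buffer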
+%macro GET_MASK8 1
+%define %%MASK %1
+
+%ifidn IA1, rcx
+%define myrcx IA1
+%else
+%define myrcx rcx
+ mov IA1, rcx
+%endif
+ mov myrcx, SIZE
+ sub myrcx, OFFSET
+ ;; - myrcx - remaining length
+ ;; - divide by 8 (DES block size)
+ ;; - create bit mask of the result
+ mov DWORD(%%MASK), 1
+ shr DWORD(myrcx), 3
+ shl DWORD(%%MASK), BYTE(myrcx)
+ sub DWORD(%%MASK), 1
+%ifnidn IA1, rcx
+ mov rcx, IA1
+%endif
+%endmacro
+
+;;; ===========================================================================
+;;; DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only)
+;;; ===========================================================================
+;;;
+;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
+;;; DES_KS [in] - pointer to transposed key schedule
+;;;
+;;; NOTE: clobbers OpMask registers
+;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
+%macro GEN_DES_ENC_CIPHER 2
+%define %%NUM_DES_BLOCKS %1
+%define %%DES_KS %2
+
+%assign RN 0
+%assign LN 1
+%assign RNN 2
+%assign LNN 3
+%rep %%NUM_DES_BLOCKS - 1
+ DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0
+ vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0
+%assign RN (RN + 2)
+%assign LN (LN + 2)
+%assign RNN (RNN + 2)
+%assign LNN (LNN + 2)
+%endrep
+ DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ vmovdqa64 ZIV0, ZW %+ LN ; IV0 = L7
+ vmovdqa64 ZIV1, ZW %+ RN ; IV1 = R7
+%endmacro
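+
+;; CBC chaining note: after each block is encrypted, the next block's words
+;; are XORed with the ciphertext just produced (next R with previous L, next
+;; L with previous R), and the last ciphertext pair is kept in ZIV0/ZIV1 as
+;; the IV for the following 64-byte chunk.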
+
+;;; ===========================================================================
+;;; DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only)
+;;; ===========================================================================
+;;;
+;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
+;;; DES_KS [in] - pointer to transposed key schedule
+;;;
+;;; NOTE: clobbers OpMask registers
+;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
+%macro GEN_DES_DEC_CIPHER 2
+%define %%NUM_DES_BLOCKS %1
+%define %%DES_KS %2
+
+%assign RN 0
+%assign LN 1
+%rep %%NUM_DES_BLOCKS
+ vmovdqa64 ZTMP12, ZW %+ RN ; keep R0 as IV for the next round
+ vmovdqa64 ZTMP13, ZW %+ LN ; keep L0 as IV for the next round
+ DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ vpxord ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1
+ vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0
+ vmovdqa64 ZIV0, ZTMP12
+ vmovdqa64 ZIV1, ZTMP13
+%assign RN (RN + 2)
+%assign LN (LN + 2)
+%endrep
+%endmacro
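+
+;; CBC chaining note: each ciphertext pair is saved in ZTMP12/ZTMP13 before
+;; decryption and becomes the IV for the next block, while the decrypted
+;; words are XORed with the previous IV pair (ZIV0/ZIV1) to recover the
+;; plain text.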
+
+;;; ===========================================================================
+;;; 3DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only)
+;;; ===========================================================================
+;;;
+;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
+;;; DES_KS1 [in] - pointer to transposed key schedule 1
+;;; DES_KS2 [in] - pointer to transposed key schedule 2
+;;; DES_KS3 [in] - pointer to transposed key schedule 3
+;;;
+;;; NOTE: clobbers OpMask registers
+;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
+%macro GEN_3DES_ENC_CIPHER 4
+%define %%NUM_DES_BLOCKS %1
+%define %%DES_KS1 %2
+%define %%DES_KS2 %3
+%define %%DES_KS3 %4
+
+%assign RN 0
+%assign LN 1
+%assign RNN 2
+%assign LNN 3
+%rep %%NUM_DES_BLOCKS
+ ;; ENC
+ DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ ;; DEC
+ DES_ENC_DEC DEC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ ;; ENC
+ DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+%if (RNN < (%%NUM_DES_BLOCKS * 2))
+ vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0
+ vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0
+%else
+ vmovdqa64 ZIV0, ZW %+ LN ; IV0 = L7
+ vmovdqa64 ZIV1, ZW %+ RN ; IV1 = R7
+%endif
+
+%assign RN (RN + 2)
+%assign LN (LN + 2)
+%assign RNN (RNN + 2)
+%assign LNN (LNN + 2)
+%endrep
+
+%endmacro
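+
+;; 3DES EDE note: every block passes through encrypt(KS1), decrypt(KS2) and
+;; encrypt(KS3); note the swapped R/L zmm arguments on the middle pass.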
+
+;;; ===========================================================================
+;;; 3DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only)
+;;; ===========================================================================
+;;;
+;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
+;;; DES_KS1 [in] - pointer to transposed key schedule 1
+;;; DES_KS2 [in] - pointer to transposed key schedule 2
+;;; DES_KS3 [in] - pointer to transposed key schedule 3
+;;;
+;;; NOTE: clobbers OpMask registers
+;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
+%macro GEN_3DES_DEC_CIPHER 4
+%define %%NUM_DES_BLOCKS %1
+%define %%DES_KS1 %2
+%define %%DES_KS2 %3
+%define %%DES_KS3 %4
+
+%assign RN 0
+%assign LN 1
+%rep %%NUM_DES_BLOCKS
+ vmovdqa64 ZTMP12, ZW %+ RN ; keep R0 as IV for the next round
+ vmovdqa64 ZTMP13, ZW %+ LN ; keep L0 as IV for the next round
+ ;; DEC
+ DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ ;; ENC
+ DES_ENC_DEC ENC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ ;; DEC
+ DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ vpxord ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1
+ vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0
+ vmovdqa64 ZIV0, ZTMP12
+ vmovdqa64 ZIV1, ZTMP13
+
+%assign RN (RN + 2)
+%assign LN (LN + 2)
+%endrep
+
+%endmacro
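+
+;; 3DES decrypt note: the EDE order is reversed (decrypt with KS1, encrypt
+;; with KS2, decrypt with KS3), with the same CBC IV handling as
+;; GEN_DES_DEC_CIPHER above.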
+
+;;; ===========================================================================
+;;; DES CBC / DOCSIS DES ENCRYPT
+;;; ===========================================================================
+;;;
+;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and
+;;; 3DES (3DES CBC)
+;;;
+;;; NOTE: clobbers OpMask registers
+%macro GENERIC_DES_ENC 1
+%define %%DES_DOCSIS %1
+
+        ;; save GP registers and allocate the stack frame
+ mov rax, rsp
+ sub rsp, STACKFRAME_size
+ and rsp, -64
+ mov [rsp + _rsp_save], rax ; original SP
+ mov [rsp + _gpr_save + 0*8], r12
+ mov [rsp + _gpr_save + 1*8], r13
+ mov [rsp + _gpr_save + 2*8], r14
+ mov [rsp + _gpr_save + 3*8], r15
+
+%ifnidn %%DES_DOCSIS, 3DES
+ ;; DES and DOCSIS DES
+ DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+%else
+ ;; 3DES
+ DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC
+%endif
+ mov [rsp + _size_save], SIZE
+ and SIZE, -64
+ xor OFFSET, OFFSET
+        ;; This loop processes the message in blocks of 64 bytes.
+ ;; Anything smaller than 64 bytes is handled separately after the loop.
+%%_gen_des_enc_loop:
+ cmp OFFSET, SIZE
+ jz %%_gen_des_enc_loop_end
+ ;; run loads
+ mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
+ vmovdqu64 ZW0, [IA0 + OFFSET]
+ vmovdqu64 ZW1, [IA1 + OFFSET]
+ vmovdqu64 ZW2, [IA2 + OFFSET]
+ vmovdqu64 ZW3, [INP0 + OFFSET]
+ vmovdqu64 ZW4, [INP1 + OFFSET]
+ vmovdqu64 ZW5, [INP2 + OFFSET]
+ vmovdqu64 ZW6, [INP3 + OFFSET]
+ vmovdqu64 ZW7, [INP4 + OFFSET]
+
+ mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
+ vmovdqu64 ZW8, [IA0 + OFFSET]
+ vmovdqu64 ZW9, [IA1 + OFFSET]
+ vmovdqu64 ZW10, [IA2 + OFFSET]
+ vmovdqu64 ZW11, [INP0 + OFFSET]
+ vmovdqu64 ZW12, [INP1 + OFFSET]
+ vmovdqu64 ZW13, [INP2 + OFFSET]
+ vmovdqu64 ZW14, [INP3 + OFFSET]
+ vmovdqu64 ZW15, [INP4 + OFFSET]
+
+ ;; Transpose input
+ TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+ ;; DES CBC ENC comes here
+ vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0
+ vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1
+
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 8, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+
+ ;; transpose data on output
+ TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+ ;; run stores
+ mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET], ZW0
+ vmovdqu64 [IA1 + OFFSET], ZW1
+ vmovdqu64 [IA2 + OFFSET], ZW2
+ vmovdqu64 [INP0 + OFFSET], ZW3
+ vmovdqu64 [INP1 + OFFSET], ZW4
+ vmovdqu64 [INP2 + OFFSET], ZW5
+ vmovdqu64 [INP3 + OFFSET], ZW6
+ vmovdqu64 [INP4 + OFFSET], ZW7
+
+ mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET], ZW8
+ vmovdqu64 [IA1 + OFFSET], ZW9
+ vmovdqu64 [IA2 + OFFSET], ZW10
+ vmovdqu64 [INP0 + OFFSET], ZW11
+ vmovdqu64 [INP1 + OFFSET], ZW12
+ vmovdqu64 [INP2 + OFFSET], ZW13
+ vmovdqu64 [INP3 + OFFSET], ZW14
+ vmovdqu64 [INP4 + OFFSET], ZW15
+
+ add OFFSET, 64
+ jmp %%_gen_des_enc_loop
+%%_gen_des_enc_loop_end:
+ ;; This is where we check if there is anything less than 64 bytes
+ ;; of message left for processing.
+ mov SIZE, [rsp + _size_save]
+ cmp OFFSET, SIZE
+ jz %%_gen_des_enc_part_end
+        ;; convert the remaining length (< 64 bytes here) into a qword block mask
+ GET_MASK8 IA0 ; IA0 = mask
+
+ kmovw k7, DWORD(IA0)
+ mov [rsp + _mask_save], IA0
+ ;; run masked loads
+ mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
+ vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET]
+ vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET]
+ vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET]
+ vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET]
+ vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET]
+ vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET]
+ vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET]
+ vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET]
+
+ mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
+ vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET]
+ vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET]
+ vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET]
+ vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET]
+ vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET]
+ vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET]
+ vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET]
+ vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET]
+
+ ;; Transpose input
+ TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+ ;; DES CBC ENC comes here
+ vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0
+ vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1
+
+ mov IA0, [rsp + _mask_save]
+ cmp BYTE(IA0), 0x0f
+ ja %%_gt_4
+ jz %%_blocks_4
+
+ cmp BYTE(IA0), 0x03
+ ja %%_blocks_3
+ jz %%_blocks_2
+
+ ;; process one block and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 1, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_2:
+ ;; process two blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 2, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_3:
+ ;; process three blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 3, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_4:
+ ;; process four blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 4, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_gt_4:
+ cmp BYTE(IA0), 0x3f
+ ja %%_blocks_7
+ jz %%_blocks_6
+%%_blocks_5:
+ ;; process five blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 5, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_6:
+ ;; process six blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 6, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_7:
+ ;; process seven blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 7, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+
+%%_transpose_out:
+ ;; transpose data on output
+ TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+ ;; run masked stores
+ mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET]{k7}, ZW0
+ vmovdqu64 [IA1 + OFFSET]{k7}, ZW1
+ vmovdqu64 [IA2 + OFFSET]{k7}, ZW2
+ vmovdqu64 [INP0 + OFFSET]{k7}, ZW3
+ vmovdqu64 [INP1 + OFFSET]{k7}, ZW4
+ vmovdqu64 [INP2 + OFFSET]{k7}, ZW5
+ vmovdqu64 [INP3 + OFFSET]{k7}, ZW6
+ vmovdqu64 [INP4 + OFFSET]{k7}, ZW7
+
+ mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET]{k7}, ZW8
+ vmovdqu64 [IA1 + OFFSET]{k7}, ZW9
+ vmovdqu64 [IA2 + OFFSET]{k7}, ZW10
+ vmovdqu64 [INP0 + OFFSET]{k7}, ZW11
+ vmovdqu64 [INP1 + OFFSET]{k7}, ZW12
+ vmovdqu64 [INP2 + OFFSET]{k7}, ZW13
+ vmovdqu64 [INP3 + OFFSET]{k7}, ZW14
+ vmovdqu64 [INP4 + OFFSET]{k7}, ZW15
+%%_gen_des_enc_part_end:
+
+ ;; store IV and update pointers
+ DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4
+
+ ;; CFB part for DOCSIS
+%ifidn %%DES_DOCSIS, DOCSIS
+ DES_CFB_ONE ENC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask
+%endif
+
+ CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0
+
+ ;; restore stack pointer and registers
+ mov r12, [rsp + _gpr_save + 0*8]
+ mov r13, [rsp + _gpr_save + 1*8]
+ mov r14, [rsp + _gpr_save + 2*8]
+ mov r15, [rsp + _gpr_save + 3*8]
+ mov rsp, [rsp + _rsp_save] ; original SP
+%endmacro
+
+;;; ===========================================================================
+;;; DES CBC / DOCSIS DES DECRYPT
+;;; ===========================================================================
+;;;
+;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and
+;;; 3DES (3DES CBC)
+;;;
+;;; NOTE: clobbers OpMask registers
+%macro GENERIC_DES_DEC 1
+%define %%DES_DOCSIS %1
+
+        ;; save GP registers and allocate the stack frame
+ mov rax, rsp
+ sub rsp, STACKFRAME_size
+ and rsp, -64
+ mov [rsp + _rsp_save], rax ; original SP
+ mov [rsp + _gpr_save + 0*8], r12
+ mov [rsp + _gpr_save + 1*8], r13
+ mov [rsp + _gpr_save + 2*8], r14
+ mov [rsp + _gpr_save + 3*8], r15
+
+%ifnidn %%DES_DOCSIS, 3DES
+ ;; DES and DOCSIS
+ DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+%else
+ ;; 3DES
+ DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, DEC
+%endif
+
+ ;; CFB part for DOCSIS
+%ifidn %%DES_DOCSIS, DOCSIS
+ DES_CFB_ONE DEC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask
+%endif
+
+ mov [rsp + _size_save], SIZE
+ and SIZE, -64
+ xor OFFSET, OFFSET
+        ;; This loop processes the message in blocks of 64 bytes.
+ ;; Anything smaller than 64 bytes is handled separately after the loop.
+%%_gen_des_dec_loop:
+ cmp OFFSET, SIZE
+ jz %%_gen_des_dec_loop_end
+ ;; run loads
+ mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
+ vmovdqu64 ZW0, [IA0 + OFFSET]
+ vmovdqu64 ZW1, [IA1 + OFFSET]
+ vmovdqu64 ZW2, [IA2 + OFFSET]
+ vmovdqu64 ZW3, [INP0 + OFFSET]
+ vmovdqu64 ZW4, [INP1 + OFFSET]
+ vmovdqu64 ZW5, [INP2 + OFFSET]
+ vmovdqu64 ZW6, [INP3 + OFFSET]
+ vmovdqu64 ZW7, [INP4 + OFFSET]
+
+ mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
+ vmovdqu64 ZW8, [IA0 + OFFSET]
+ vmovdqu64 ZW9, [IA1 + OFFSET]
+ vmovdqu64 ZW10, [IA2 + OFFSET]
+ vmovdqu64 ZW11, [INP0 + OFFSET]
+ vmovdqu64 ZW12, [INP1 + OFFSET]
+ vmovdqu64 ZW13, [INP2 + OFFSET]
+ vmovdqu64 ZW14, [INP3 + OFFSET]
+ vmovdqu64 ZW15, [INP4 + OFFSET]
+
+ ;; Transpose input
+ TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+%ifnidn %%DES_DOCSIS, 3DES
+ ;; DES CBC DEC comes here
+ GEN_DES_DEC_CIPHER 8, rsp + _key_sched
+%else
+ ;; 3DES CBC DEC comes here
+ GEN_3DES_DEC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+
+ ;; transpose data on output
+ TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+ ;; run stores
+ mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET], ZW0
+ vmovdqu64 [IA1 + OFFSET], ZW1
+ vmovdqu64 [IA2 + OFFSET], ZW2
+ vmovdqu64 [INP0 + OFFSET], ZW3
+ vmovdqu64 [INP1 + OFFSET], ZW4
+ vmovdqu64 [INP2 + OFFSET], ZW5
+ vmovdqu64 [INP3 + OFFSET], ZW6
+ vmovdqu64 [INP4 + OFFSET], ZW7
+
+ mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET], ZW8
+ vmovdqu64 [IA1 + OFFSET], ZW9
+ vmovdqu64 [IA2 + OFFSET], ZW10
+ vmovdqu64 [INP0 + OFFSET], ZW11
+ vmovdqu64 [INP1 + OFFSET], ZW12
+ vmovdqu64 [INP2 + OFFSET], ZW13
+ vmovdqu64 [INP3 + OFFSET], ZW14
+ vmovdqu64 [INP4 + OFFSET], ZW15
+
+ add OFFSET, 64
+ jmp %%_gen_des_dec_loop
+%%_gen_des_dec_loop_end:
+ ;; This is where we check if there is anything less than 64 bytes
+ ;; of message left for processing.
+ mov SIZE, [rsp + _size_save]
+ cmp OFFSET, SIZE
+ jz %%_gen_des_dec_part_end
+        ;; convert the remaining length (< 64 bytes here) into a qword block mask
+ GET_MASK8 IA0 ; IA0 = mask
+
+ kmovw k7, DWORD(IA0)
+ mov [rsp + _mask_save], IA0
+ ;; run masked loads
+ mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
+ vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET]
+ vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET]
+ vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET]
+ vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET]
+ vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET]
+ vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET]
+ vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET]
+ vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET]
+
+ mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
+ vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET]
+ vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET]
+ vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET]
+ vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET]
+ vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET]
+ vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET]
+ vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET]
+ vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET]
+
+ ;; Transpose input
+ TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+ ;; DES CBC DEC comes here
+ mov IA0, [rsp + _mask_save]
+ cmp BYTE(IA0), 0x0f
+ ja %%_gt_4
+ jz %%_blocks_4
+
+ cmp BYTE(IA0), 0x03
+ ja %%_blocks_3
+ jz %%_blocks_2
+ ;; process one block and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 1, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_2:
+ ;; process two blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 2, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_3:
+ ;; process three blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 3, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_4:
+ ;; process four blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 4, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_gt_4:
+ cmp BYTE(IA0), 0x3f
+ ja %%_blocks_7
+ jz %%_blocks_6
+%%_blocks_5:
+ ;; process five blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 5, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_6:
+ ;; process six blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 6, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_7:
+ ;; process seven blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 7, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+
+%%_transpose_out:
+ ;; transpose data on output
+ TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+ ;; run masked stores
+ mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET]{k7}, ZW0
+ vmovdqu64 [IA1 + OFFSET]{k7}, ZW1
+ vmovdqu64 [IA2 + OFFSET]{k7}, ZW2
+ vmovdqu64 [INP0 + OFFSET]{k7}, ZW3
+ vmovdqu64 [INP1 + OFFSET]{k7}, ZW4
+ vmovdqu64 [INP2 + OFFSET]{k7}, ZW5
+ vmovdqu64 [INP3 + OFFSET]{k7}, ZW6
+ vmovdqu64 [INP4 + OFFSET]{k7}, ZW7
+
+ mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET]{k7}, ZW8
+ vmovdqu64 [IA1 + OFFSET]{k7}, ZW9
+ vmovdqu64 [IA2 + OFFSET]{k7}, ZW10
+ vmovdqu64 [INP0 + OFFSET]{k7}, ZW11
+ vmovdqu64 [INP1 + OFFSET]{k7}, ZW12
+ vmovdqu64 [INP2 + OFFSET]{k7}, ZW13
+ vmovdqu64 [INP3 + OFFSET]{k7}, ZW14
+ vmovdqu64 [INP4 + OFFSET]{k7}, ZW15
+%%_gen_des_dec_part_end:
+
+ ;; store IV and update pointers
+ DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4
+
+ CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0
+
+ ;; restore stack pointer and registers
+ mov r12, [rsp + _gpr_save + 0*8]
+ mov r13, [rsp + _gpr_save + 1*8]
+ mov r14, [rsp + _gpr_save + 2*8]
+ mov r15, [rsp + _gpr_save + 3*8]
+ mov rsp, [rsp + _rsp_save] ; original SP
+%endmacro
+
+
+;;; ========================================================
+;;; DATA
+
+section .data
+default rel
+align 64
+mask_values:
+ dd 0x04000000, 0x04000000, 0x04000000, 0x04000000
+ dd 0x04000000, 0x04000000, 0x04000000, 0x04000000
+ dd 0x04000000, 0x04000000, 0x04000000, 0x04000000
+ dd 0x04000000, 0x04000000, 0x04000000, 0x04000000
+ dd 0x40240202, 0x40240202, 0x40240202, 0x40240202
+ dd 0x40240202, 0x40240202, 0x40240202, 0x40240202
+ dd 0x40240202, 0x40240202, 0x40240202, 0x40240202
+ dd 0x40240202, 0x40240202, 0x40240202, 0x40240202
+ dd 0x00001110, 0x00001110, 0x00001110, 0x00001110
+ dd 0x00001110, 0x00001110, 0x00001110, 0x00001110
+ dd 0x00001110, 0x00001110, 0x00001110, 0x00001110
+ dd 0x00001110, 0x00001110, 0x00001110, 0x00001110
+ dd 0x01088000, 0x01088000, 0x01088000, 0x01088000
+ dd 0x01088000, 0x01088000, 0x01088000, 0x01088000
+ dd 0x01088000, 0x01088000, 0x01088000, 0x01088000
+ dd 0x01088000, 0x01088000, 0x01088000, 0x01088000
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+ dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C
+ dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C
+ dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C
+ dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C
+ dd 0x00000020, 0x00000020, 0x00000020, 0x00000020
+ dd 0x00000020, 0x00000020, 0x00000020, 0x00000020
+ dd 0x00000020, 0x00000020, 0x00000020, 0x00000020
+ dd 0x00000020, 0x00000020, 0x00000020, 0x00000020
+ dd 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ dd 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ dd 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ dd 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ dd 0x00400400, 0x00400400, 0x00400400, 0x00400400
+ dd 0x00400400, 0x00400400, 0x00400400, 0x00400400
+ dd 0x00400400, 0x00400400, 0x00400400, 0x00400400
+ dd 0x00400400, 0x00400400, 0x00400400, 0x00400400
+ dd 0x00000800, 0x00000800, 0x00000800, 0x00000800
+ dd 0x00000800, 0x00000800, 0x00000800, 0x00000800
+ dd 0x00000800, 0x00000800, 0x00000800, 0x00000800
+ dd 0x00000800, 0x00000800, 0x00000800, 0x00000800
+ dd 0x00002000, 0x00002000, 0x00002000, 0x00002000
+ dd 0x00002000, 0x00002000, 0x00002000, 0x00002000
+ dd 0x00002000, 0x00002000, 0x00002000, 0x00002000
+ dd 0x00002000, 0x00002000, 0x00002000, 0x00002000
+ dd 0x00100000, 0x00100000, 0x00100000, 0x00100000
+ dd 0x00100000, 0x00100000, 0x00100000, 0x00100000
+ dd 0x00100000, 0x00100000, 0x00100000, 0x00100000
+ dd 0x00100000, 0x00100000, 0x00100000, 0x00100000
+ dd 0x00004000, 0x00004000, 0x00004000, 0x00004000
+ dd 0x00004000, 0x00004000, 0x00004000, 0x00004000
+ dd 0x00004000, 0x00004000, 0x00004000, 0x00004000
+ dd 0x00004000, 0x00004000, 0x00004000, 0x00004000
+ dd 0x00020000, 0x00020000, 0x00020000, 0x00020000
+ dd 0x00020000, 0x00020000, 0x00020000, 0x00020000
+ dd 0x00020000, 0x00020000, 0x00020000, 0x00020000
+ dd 0x00020000, 0x00020000, 0x00020000, 0x00020000
+ dd 0x02000000, 0x02000000, 0x02000000, 0x02000000
+ dd 0x02000000, 0x02000000, 0x02000000, 0x02000000
+ dd 0x02000000, 0x02000000, 0x02000000, 0x02000000
+ dd 0x02000000, 0x02000000, 0x02000000, 0x02000000
+ dd 0x08000000, 0x08000000, 0x08000000, 0x08000000
+ dd 0x08000000, 0x08000000, 0x08000000, 0x08000000
+ dd 0x08000000, 0x08000000, 0x08000000, 0x08000000
+ dd 0x08000000, 0x08000000, 0x08000000, 0x08000000
+ dd 0x00000080, 0x00000080, 0x00000080, 0x00000080
+ dd 0x00000080, 0x00000080, 0x00000080, 0x00000080
+ dd 0x00000080, 0x00000080, 0x00000080, 0x00000080
+ dd 0x00000080, 0x00000080, 0x00000080, 0x00000080
+ dd 0x20000000, 0x20000000, 0x20000000, 0x20000000
+ dd 0x20000000, 0x20000000, 0x20000000, 0x20000000
+ dd 0x20000000, 0x20000000, 0x20000000, 0x20000000
+ dd 0x20000000, 0x20000000, 0x20000000, 0x20000000
+ dd 0x90000000, 0x90000000, 0x90000000, 0x90000000
+ dd 0x90000000, 0x90000000, 0x90000000, 0x90000000
+ dd 0x90000000, 0x90000000, 0x90000000, 0x90000000
+ dd 0x90000000, 0x90000000, 0x90000000, 0x90000000
+
+align 64
+init_perm_consts:
+ dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
+ dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
+ dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
+ dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
+ dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
+ dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
+ dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
+ dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
+ dd 0x33333333, 0x33333333, 0x33333333, 0x33333333
+ dd 0x33333333, 0x33333333, 0x33333333, 0x33333333
+ dd 0x33333333, 0x33333333, 0x33333333, 0x33333333
+ dd 0x33333333, 0x33333333, 0x33333333, 0x33333333
+ dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff
+ dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff
+ dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff
+ dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff
+ dd 0x55555555, 0x55555555, 0x55555555, 0x55555555
+ dd 0x55555555, 0x55555555, 0x55555555, 0x55555555
+ dd 0x55555555, 0x55555555, 0x55555555, 0x55555555
+ dd 0x55555555, 0x55555555, 0x55555555, 0x55555555
+
+;;; S-Box table
+align 64
+S_box_flipped:
+ ;; SBOX0
+ dw 0x07, 0x02, 0x0c, 0x0f, 0x04, 0x0b, 0x0a, 0x0c
+ dw 0x0b, 0x07, 0x06, 0x09, 0x0d, 0x04, 0x00, 0x0a
+ dw 0x02, 0x08, 0x05, 0x03, 0x0f, 0x06, 0x09, 0x05
+ dw 0x08, 0x01, 0x03, 0x0e, 0x01, 0x0d, 0x0e, 0x00
+ dw 0x00, 0x0f, 0x05, 0x0a, 0x07, 0x02, 0x09, 0x05
+ dw 0x0e, 0x01, 0x03, 0x0c, 0x0b, 0x08, 0x0c, 0x06
+ dw 0x0f, 0x03, 0x06, 0x0d, 0x04, 0x09, 0x0a, 0x00
+ dw 0x02, 0x04, 0x0d, 0x07, 0x08, 0x0e, 0x01, 0x0b
+ ;; SBOX1
+ dw 0x0f, 0x00, 0x09, 0x0a, 0x06, 0x05, 0x03, 0x09
+ dw 0x01, 0x0e, 0x04, 0x03, 0x0c, 0x0b, 0x0a, 0x04
+ dw 0x08, 0x07, 0x0e, 0x01, 0x0d, 0x02, 0x00, 0x0c
+ dw 0x07, 0x0d, 0x0b, 0x06, 0x02, 0x08, 0x05, 0x0f
+ dw 0x0c, 0x0b, 0x03, 0x0d, 0x0f, 0x0c, 0x06, 0x00
+ dw 0x02, 0x05, 0x08, 0x0e, 0x01, 0x02, 0x0d, 0x07
+ dw 0x0b, 0x01, 0x00, 0x06, 0x04, 0x0f, 0x09, 0x0a
+ dw 0x0e, 0x08, 0x05, 0x03, 0x07, 0x04, 0x0a, 0x09
+ ;; SBOX2
+ dw 0x05, 0x0b, 0x08, 0x0d, 0x06, 0x01, 0x0d, 0x0a
+ dw 0x09, 0x02, 0x03, 0x04, 0x0f, 0x0c, 0x04, 0x07
+ dw 0x00, 0x06, 0x0b, 0x08, 0x0c, 0x0f, 0x02, 0x05
+ dw 0x07, 0x09, 0x0e, 0x03, 0x0a, 0x00, 0x01, 0x0e
+ dw 0x0b, 0x08, 0x04, 0x02, 0x0c, 0x06, 0x03, 0x0d
+ dw 0x00, 0x0b, 0x0a, 0x07, 0x06, 0x01, 0x0f, 0x04
+ dw 0x0e, 0x05, 0x01, 0x0f, 0x02, 0x09, 0x0d, 0x0a
+ dw 0x09, 0x00, 0x07, 0x0c, 0x05, 0x0e, 0x08, 0x03
+ ;; SBOX3
+ dw 0x0e, 0x05, 0x08, 0x0f, 0x00, 0x03, 0x0d, 0x0a
+ dw 0x07, 0x09, 0x01, 0x0c, 0x09, 0x0e, 0x02, 0x01
+ dw 0x0b, 0x06, 0x04, 0x08, 0x06, 0x0d, 0x03, 0x04
+ dw 0x0c, 0x00, 0x0a, 0x07, 0x05, 0x0b, 0x0f, 0x02
+ dw 0x0b, 0x0c, 0x02, 0x09, 0x06, 0x05, 0x08, 0x03
+ dw 0x0d, 0x00, 0x04, 0x0a, 0x00, 0x0b, 0x07, 0x04
+ dw 0x01, 0x0f, 0x0e, 0x02, 0x0f, 0x08, 0x05, 0x0e
+ dw 0x0a, 0x06, 0x03, 0x0d, 0x0c, 0x01, 0x09, 0x07
+ ;; SBOX4
+ dw 0x04, 0x02, 0x01, 0x0f, 0x0e, 0x05, 0x0b, 0x06
+ dw 0x02, 0x08, 0x0c, 0x03, 0x0d, 0x0e, 0x07, 0x00
+ dw 0x03, 0x04, 0x0a, 0x09, 0x05, 0x0b, 0x00, 0x0c
+ dw 0x08, 0x0d, 0x0f, 0x0a, 0x06, 0x01, 0x09, 0x07
+ dw 0x07, 0x0d, 0x0a, 0x06, 0x02, 0x08, 0x0c, 0x05
+ dw 0x04, 0x03, 0x0f, 0x00, 0x0b, 0x04, 0x01, 0x0a
+ dw 0x0d, 0x01, 0x00, 0x0f, 0x0e, 0x07, 0x09, 0x02
+ dw 0x03, 0x0e, 0x05, 0x09, 0x08, 0x0b, 0x06, 0x0c
+ ;; SBOX5
+ dw 0x03, 0x09, 0x00, 0x0e, 0x09, 0x04, 0x07, 0x08
+ dw 0x05, 0x0f, 0x0c, 0x02, 0x06, 0x03, 0x0a, 0x0d
+ dw 0x08, 0x07, 0x0b, 0x00, 0x04, 0x01, 0x0e, 0x0b
+ dw 0x0f, 0x0a, 0x02, 0x05, 0x01, 0x0c, 0x0d, 0x06
+ dw 0x05, 0x02, 0x06, 0x0d, 0x0e, 0x09, 0x00, 0x06
+ dw 0x02, 0x04, 0x0b, 0x08, 0x09, 0x0f, 0x0c, 0x01
+ dw 0x0f, 0x0c, 0x08, 0x07, 0x03, 0x0a, 0x0d, 0x00
+ dw 0x04, 0x03, 0x07, 0x0e, 0x0a, 0x05, 0x01, 0x0b
+ ;; SBOX6
+ dw 0x02, 0x08, 0x0c, 0x05, 0x0f, 0x03, 0x0a, 0x00
+ dw 0x04, 0x0d, 0x09, 0x06, 0x01, 0x0e, 0x06, 0x09
+ dw 0x0d, 0x02, 0x03, 0x0f, 0x00, 0x0c, 0x05, 0x0a
+ dw 0x07, 0x0b, 0x0e, 0x01, 0x0b, 0x07, 0x08, 0x04
+ dw 0x0b, 0x06, 0x07, 0x09, 0x02, 0x08, 0x04, 0x07
+ dw 0x0d, 0x0b, 0x0a, 0x00, 0x08, 0x05, 0x01, 0x0c
+ dw 0x00, 0x0d, 0x0c, 0x0a, 0x09, 0x02, 0x0f, 0x04
+ dw 0x0e, 0x01, 0x03, 0x0f, 0x05, 0x0e, 0x06, 0x03
+ ;; SBOX7
+ dw 0x0b, 0x0e, 0x05, 0x00, 0x06, 0x09, 0x0a, 0x0f
+ dw 0x01, 0x02, 0x0c, 0x05, 0x0d, 0x07, 0x03, 0x0a
+ dw 0x04, 0x0d, 0x09, 0x06, 0x0f, 0x03, 0x00, 0x0c
+ dw 0x02, 0x08, 0x07, 0x0b, 0x08, 0x04, 0x0e, 0x01
+ dw 0x08, 0x04, 0x03, 0x0f, 0x05, 0x02, 0x00, 0x0c
+ dw 0x0b, 0x07, 0x06, 0x09, 0x0e, 0x01, 0x09, 0x06
+ dw 0x0f, 0x08, 0x0a, 0x03, 0x0c, 0x05, 0x07, 0x0a
+ dw 0x01, 0x0e, 0x0d, 0x00, 0x02, 0x0b, 0x04, 0x0d
+
+;;; Used in DOCSIS DES partial block scheduling: 16 x 32-bit values of 1
+align 64
+vec_ones_32b:
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+
+align 64
+and_eu:
+ dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00
+ dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00
+ dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00
+ dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00
+
+align 64
+and_ed:
+ dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+ dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+ dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+ dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+
+align 64
+idx_e:
+ dq 0x0d0c090805040100, 0x0f0e0b0a07060302
+ dq 0x1d1c191815141110, 0x1f1e1b1a17161312
+ dq 0x2d2c292825242120, 0x2f2e2b2a27262322
+ dq 0x3d3c393835343130, 0x3f3e3b3a37363332
+
+align 64
+reg_values16bit_7:
+ dq 0x001f001f001f001f, 0x001f001f001f001f
+ dq 0x001f001f001f001f, 0x001f001f001f001f
+ dq 0x001f001f001f001f, 0x001f001f001f001f
+ dq 0x001f001f001f001f, 0x001f001f001f001f
+
+align 64
+shuffle_reg:
+ dq 0x0705060403010200, 0x0f0d0e0c0b090a08
+ dq 0x1715161413111210, 0x1f1d1e1c1b191a18
+ dq 0x2725262423212220, 0x2f2d2e2c2b292a28
+ dq 0x3735363433313230, 0x3f3d3e3c3b393a38
+
+;;; ========================================================
+;;; CODE
+section .text
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : size in bytes
+align 64
+MKGLOBAL(des_x16_cbc_enc_avx512,function,internal)
+des_x16_cbc_enc_avx512:
+ GENERIC_DES_ENC DES
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : size in bytes
+align 64
+MKGLOBAL(des_x16_cbc_dec_avx512,function,internal)
+des_x16_cbc_dec_avx512:
+ GENERIC_DES_DEC DES
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : size in bytes
+align 64
+MKGLOBAL(des3_x16_cbc_enc_avx512,function,internal)
+des3_x16_cbc_enc_avx512:
+ GENERIC_DES_ENC 3DES
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : size in bytes
+align 64
+MKGLOBAL(des3_x16_cbc_dec_avx512,function,internal)
+des3_x16_cbc_dec_avx512:
+ GENERIC_DES_DEC 3DES
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : size in bytes
+align 64
+MKGLOBAL(docsis_des_x16_enc_avx512,function,internal)
+docsis_des_x16_enc_avx512:
+ GENERIC_DES_ENC DOCSIS
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : size in bytes
+align 64
+MKGLOBAL(docsis_des_x16_dec_avx512,function,internal)
+docsis_des_x16_dec_avx512:
+ GENERIC_DES_DEC DOCSIS
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
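For reference, each of the six entry points above follows the same two-argument convention spelled out in its header comment: a pointer to the DES out-of-order (OOO) argument/state structure and a length in bytes. A hedged C-side sketch of the corresponding declarations is shown below; the opaque struct tag is an assumption for illustration, not the library's actual private definition.

    #include <stdint.h>

    /* Hypothetical declarations mirroring the asm entry points above.
     * The argument structure is internal to the multi-buffer manager
     * (the asm addresses it via the _des_args_* offsets), so it is
     * kept opaque here. */
    typedef struct DES_ARGS_x16 DES_ARGS_x16;

    void des_x16_cbc_enc_avx512(DES_ARGS_x16 *state, uint64_t size_in_bytes);
    void des_x16_cbc_dec_avx512(DES_ARGS_x16 *state, uint64_t size_in_bytes);
    void des3_x16_cbc_enc_avx512(DES_ARGS_x16 *state, uint64_t size_in_bytes);
    void des3_x16_cbc_dec_avx512(DES_ARGS_x16 *state, uint64_t size_in_bytes);
    void docsis_des_x16_enc_avx512(DES_ARGS_x16 *state, uint64_t size_in_bytes);
    void docsis_des_x16_dec_avx512(DES_ARGS_x16 *state, uint64_t size_in_bytes);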
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm128_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm128_avx512.asm
new file mode 100644
index 000000000..f9f643b40
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm128_avx512.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%include "avx512/gcm_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm128_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm128_vaes_avx512.asm
new file mode 100644
index 000000000..2465b22dd
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm128_vaes_avx512.asm
@@ -0,0 +1,32 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+;; single buffer implementation
+%include "avx512/gcm_vaes_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm192_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm192_avx512.asm
new file mode 100644
index 000000000..403ab2f7c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm192_avx512.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM192_MODE 1
+%include "avx512/gcm_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm192_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm192_vaes_avx512.asm
new file mode 100644
index 000000000..348190a2a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm192_vaes_avx512.asm
@@ -0,0 +1,32 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM192_MODE 1
+;; single buffer implementation
+%include "avx512/gcm_vaes_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm256_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm256_avx512.asm
new file mode 100644
index 000000000..141b4b9ca
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm256_avx512.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%include "avx512/gcm_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm256_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm256_vaes_avx512.asm
new file mode 100644
index 000000000..4daa1b361
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm256_vaes_avx512.asm
@@ -0,0 +1,32 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+;; single buffer implementation
+%include "avx512/gcm_vaes_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm_avx512.asm
new file mode 100644
index 000000000..db940ffe9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm_avx512.asm
@@ -0,0 +1,3536 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+; Tomasz Kantecki
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in the paper:
+; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010.
+; The details of the implementation are explained in:
+; Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16-byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; From the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports AAD of any length.
+;
+; TLen:
+; From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
+
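As a concrete illustration of the AAD rule above (zero-pad to the next 16-byte multiple before hashing), the following small C helper builds one padded 16-byte block from a short AAD such as the 8- and 12-byte IPsec layouts shown in the diagrams. It is a sketch for clarity only; the function name is invented and it is not part of this library.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Copy the AAD bytes that exist and zero the remainder of the block. */
    static void pad_aad_block(uint8_t out[16], const uint8_t *aad, size_t aad_len)
    {
        size_t n = aad_len < 16 ? aad_len : 16;

        memset(out, 0, 16);
        memcpy(out, aad, n);
    }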
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "include/clear_regs.asm"
+%include "include/gcm_defines.asm"
+%include "include/gcm_keys_avx2_avx512.asm"
+
+%include "mb_mgr_datastruct.asm"
+%include "job_aes_hmac.asm"
+%include "include/memcpy.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_avx512.asm!"
+%endif
+%endif
+%endif
+
+;; Decide on AES-GCM key size to compile for
+%ifdef GCM128_MODE
+%define NROUNDS 9
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx512
+%endif
+
+%ifdef GCM192_MODE
+%define NROUNDS 11
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx512
+%endif
+
+%ifdef GCM256_MODE
+%define NROUNDS 13
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx512
+%endif
+
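For example, with GCM128_MODE defined the FN_NAME macro above turns FN_NAME(enc,_) into the symbol aes_gcm_enc_128_avx512, and the 192/256-bit builds produce the analogous aes_gcm_enc_192_avx512 and aes_gcm_enc_256_avx512 names. NROUNDS counts only the middle AES rounds: the first round key is applied with vpxor and the last with vaesenclast, so 9/11/13 correspond to the 10/12/14 total rounds of AES-128/192/256.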
+section .text
+default rel
+
+; need to push 4 registers onto the stack; STACK_OFFSET below accounts for them
+%define STACK_OFFSET 8*4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+%define LOCAL_STORAGE 16*7
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly (i.e. >>1)
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
+ vpxor %%GH, %%GH, %%T3
+
+
+ vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
+ vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
+
+ vpxor %%T1, %%T1, %%T3
+ vpxor %%GH, %%GH, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%GH, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%GH, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%GH, %%T3, %%GH, 0x10
+ vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+%endmacro
+
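For readers who want to cross-check the carry-less multiply sequence above against a reference, the bit-serial GF(2^128) multiply below follows the textbook formulation (NIST SP 800-38D, Algorithm 1) in the byte-reflected bit order GCM uses; the field polynomial x^128 + x^7 + x^2 + x + 1 is what appears as x^128 + x^127 + x^126 + x^121 + 1 in this reflected view. This is a hedged reference sketch only, not the PCLMULQDQ path implemented here, and it does not include the HashKey<<1 pre-scaling noted in the macro header.

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } be128;   /* hi = first 8 bytes of the block */

    /* Bit-serial GHASH multiply: Z = X * Y in GF(2^128), reflected bit order. */
    static be128 ghash_mul(be128 x, be128 y)
    {
        be128 z = { 0, 0 };
        be128 v = y;

        for (int i = 0; i < 128; i++) {
            /* x_i = bit i of X, counting from the leftmost bit of the block */
            uint64_t xi = (i < 64) ? (x.hi >> (63 - i)) & 1
                                   : (x.lo >> (127 - i)) & 1;
            if (xi) {
                z.hi ^= v.hi;
                z.lo ^= v.lo;
            }
            /* V = V * x: one right shift; fold in R = 11100001 || 0^120
             * whenever a set bit falls off the right-hand end. */
            uint64_t lsb = v.lo & 1;
            v.lo = (v.lo >> 1) | (v.hi << 63);
            v.hi >>= 1;
            if (lsb)
                v.hi ^= 0xE100000000000000ULL;
        }
        return z;
    }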
+
+; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx512
+; functions, but are kept to allow users to switch cpu architectures between calls
+; of pre, init, update, and finalize.
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+        ; HashKey_i_k holds XORed values of the low and high parts of HashKey_i
+ vmovdqa %%T5, %%HK
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+%endmacro
+
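The reason PRECOMPUTE stores HashKey^2 through HashKey^8 is the usual Horner-style rearrangement of GHASH: with Y the accumulated hash and A1..A8 the next eight byte-reflected blocks, the bulk loops later in this file compute

    Y' = (Y xor A1)*H^8  xor  A2*H^7  xor  A3*H^6  xor ... xor  A8*H^1      (mod poly)

which equals applying Y' = (Y xor Ai)*H block by block, but lets the eight carry-less multiplications proceed independently before a single reduction.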
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 4
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%TMP1 %4
+
+ lea %%TMP1, [rel byte_len_to_mask_table]
+%ifidn __OUTPUT_FORMAT__, win64
+ add %%TMP1, %%LENGTH
+ add %%TMP1, %%LENGTH
+ kmovw k1, [%%TMP1]
+%else
+ kmovw k1, [%%TMP1 + %%LENGTH*2]
+%endif
+ vmovdqu8 XWORD(%%OUTPUT){k1}{z}, [%%INPUT]
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
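The macro above is the AVX512 masked-load idiom: byte_len_to_mask_table supplies a 16-bit mask with LENGTH low bits set (i.e. (1 << LENGTH) - 1), which the vmovdqu8 ...{k1}{z} form uses to read LENGTH bytes and zero the remaining lanes. A hedged C intrinsics equivalent, for illustration only, could look like this:

    #include <immintrin.h>
    #include <stddef.h>

    /* Load 'len' (< 16) bytes into an XMM register, zeroing the rest.
     * Requires AVX512BW + AVX512VL. */
    static __m128i read_small_data(const void *src, size_t len)
    {
        __mmask16 k = (__mmask16)((1u << len) - 1u);  /* mask the table would provide */

        return _mm_maskz_loadu_epi8(k, src);
    }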
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 13
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%GDATA_KEY %4
+%define %%XTMP0 %5              ; xmm temp reg 0
+%define %%XTMP1 %6              ; xmm temp reg 1
+%define %%XTMP2 %7
+%define %%XTMP3 %8
+%define %%XTMP4 %9
+%define %%XTMP5 %10 ; xmm temp reg 5
+%define %%T1 %11 ; temp reg 1
+%define %%T2 %12
+%define %%T3 %13
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+%%_get_AAD_loop128:
+ cmp %%T2, 128
+ jl %%_exit_AAD_loop128
+
+ vmovdqu %%XTMP0, [%%T1 + 16*0]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vpxor %%XTMP0, %%AAD_HASH
+
+ vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8]
+ vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1
+
+%assign i 1
+%assign j 7
+%rep 7
+ vmovdqu %%XTMP0, [%%T1 + 16*i]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
+ vpxor %%XTMP1, %%XTMP1, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+%assign i (i + 1)
+%assign j (j - 1)
+%endrep
+
+ vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
+ vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+ vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqa %%XTMP5, [rel POLY2]
+ vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
+ vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
+ vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
+ vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1
+
+ sub %%T2, 128
+ je %%_CALC_AAD_done
+
+ add %%T1, 128
+ jmp %%_get_AAD_loop128
+
+%%_exit_AAD_loop128:
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+ ;; calculate hash_key position to start with
+ mov %%T3, %%T2
+ and %%T3, -16 ; 1 to 7 blocks possible here
+ neg %%T3
+ add %%T3, HashKey_1 + 16
+ lea %%T3, [%%GDATA_KEY + %%T3]
+
+ vmovdqu %%XTMP0, [%%T1]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vpxor %%XTMP0, %%AAD_HASH
+
+ vmovdqu %%XTMP5, [%%T3]
+ vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%T4 = a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%T3 = a1*b0 + a0*b1
+
+ add %%T3, 16 ; move to next hashkey
+ add %%T1, 16 ; move to next data block
+ sub %%T2, 16
+ cmp %%T2, 16
+ jl %%_AAD_reduce
+
+%%_AAD_blocks:
+ vmovdqu %%XTMP0, [%%T1]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vmovdqu %%XTMP5, [%%T3]
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%T1 = T1 + a1*b1
+ vpxor %%XTMP1, %%XTMP1, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%T2 = T2 + a0*b0
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%T3 = T3 + a1*b0 + a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+
+ add %%T3, 16 ; move to next hashkey
+ add %%T1, 16
+ sub %%T2, 16
+ cmp %%T2, 16
+ jl %%_AAD_reduce
+ jmp %%_AAD_blocks
+
+%%_AAD_reduce:
+ vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
+ vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+ vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%T1(M):%%T2(L)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqa %%XTMP5, [rel POLY2]
+ vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
+ vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L xmm2 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
+ vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
+ vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%T1
+
+ or %%T2, %%T2
+ je %%_CALC_AAD_done
+
+%%_get_small_AAD_block:
+ vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey]
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [rel SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax
+
+%%_data_read: ;Finished reading in data
+
+ vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ lea r12, [rel SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1
+%endif
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+        sub r15, 16                             ;Set r15 to be the amount of data left in PLAIN_CYPH_IN after filling the block
+        jge %%_no_extra_mask                    ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask:
+
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+%ifidn %%ENC_DEC, DEC
+ vpand xmm3, xmm1
+ vpshufb xmm3, [rel SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+%else
+ vpshufb xmm9, [rel SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+%endif
+ cmp r15,0
+ jl %%_partial_incomplete
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_enc_dec_done
+%%_partial_incomplete:
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + PBlockLen], rax
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_enc_dec_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%ifidn %%ENC_DEC, ENC
+ vpshufb xmm9, [rel SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ lea rax, [rel byte_len_to_mask_table]
+ kmovw k1, [rax + r13*2]
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{k1}, xmm9
+ add %%DATA_OFFSET, r13
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
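Most of the pointer and length arithmetic above (r13/r15) answers one question: how many of the incoming bytes are needed to top up the block partially buffered in PBlockLen, and what PBlockLen becomes afterwards. A hedged C sketch of that bookkeeping, with invented names, is:

    #include <stddef.h>

    /* 'pblock_len' bytes are buffered from the previous update call and
     * 'len' new bytes arrive; return how many bytes this call emits for
     * the partial block and compute the new PBlockLen. */
    static size_t partial_block_bytes(size_t pblock_len, size_t len,
                                      size_t *new_pblock_len)
    {
        if (pblock_len + len < 16) {
            *new_pblock_len = pblock_len + len;  /* block still incomplete */
            return len;                          /* emit everything available */
        }
        *new_pblock_len = 0;                     /* block completed; it gets GHASHed */
        return 16 - pblock_len;                  /* emit only the topping-up bytes */
    }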
+
+%macro GHASH_SINGLE_MUL 9
+%define %%GDATA %1
+%define %%HASHKEY %2
+%define %%CIPHER %3
+%define %%STATE_11 %4
+%define %%STATE_00 %5
+%define %%STATE_MID %6
+%define %%T1 %7
+%define %%T2 %8
+%define %%FIRST %9
+
+ vmovdqu %%T1, [%%GDATA + %%HASHKEY]
+%ifidn %%FIRST, first
+ vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0
+ vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%else
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11
+ vpxor %%STATE_11, %%STATE_11, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00
+ vpxor %%STATE_00, %%STATE_00, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%endif
+
+%endmacro
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as pointers only, not modified.
+; Updated AAD_HASH is returned in %%T3
+
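As a worked example of the arithmetic above: for a 200-byte plaintext, b = floor(200/16) = 12 full blocks, so num_initial_blocks = 12 mod 8 = 4; handling those 4 blocks here leaves a multiple of eight full blocks (plus any sub-16-byte tail) for the eight-at-a-time processing that follows. In C terms (illustrative helper only):

    #include <stdint.h>

    /* num_initial_blocks = floor(len / 16) mod 8 */
    static unsigned num_initial_blocks(uint64_t plaintext_len)
    {
        return (unsigned)((plaintext_len / 16) % 8);
    }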
+%macro INITIAL_BLOCKS 23
+%define %%GDATA_KEY %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%T2 %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ ;; Move AAD_HASH to temp reg
+ vmovdqu %%T2, %%XMM8
+ ;; Start AES for %%num_initial_blocks blocks
+ ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+%if(%%num_initial_blocks>0)
+vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%endif ; %if(%%num_initial_blocks>0)
+
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; Write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+%assign i (i+1)
+%endrep
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%assign i (9-%%num_initial_blocks)
+%if(%%num_initial_blocks>0)
+ vmovdqa %%T3, reg(i)
+%assign i (i+1)
+%endif
+%if %%num_initial_blocks>1
+%rep %%num_initial_blocks-1
+ vmovdqu [rsp + TMP %+ i], reg(i)
+%assign i (i+1)
+%endrep
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Prepare 8 counter blocks and perform rounds of AES cipher on
+ ;; them, load plain/cipher text and store cipher/plain text.
+ ;; Stitch GHASH computation in between AES rounds.
+ vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR Y0
+ vpaddd %%XMM2, %%CTR, [rel TWO] ; INCR Y0
+ vpaddd %%XMM3, %%XMM1, [rel TWO] ; INCR Y0
+ vpaddd %%XMM4, %%XMM2, [rel TWO] ; INCR Y0
+ vpaddd %%XMM5, %%XMM3, [rel TWO] ; INCR Y0
+ vpaddd %%XMM6, %%XMM4, [rel TWO] ; INCR Y0
+ vpaddd %%XMM7, %%XMM5, [rel TWO] ; INCR Y0
+ vpaddd %%XMM8, %%XMM6, [rel TWO] ; INCR Y0
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+ vpxor %%XMM1, %%XMM1, %%T_key
+ vpxor %%XMM2, %%XMM2, %%T_key
+ vpxor %%XMM3, %%XMM3, %%T_key
+ vpxor %%XMM4, %%XMM4, %%T_key
+ vpxor %%XMM5, %%XMM5, %%T_key
+ vpxor %%XMM6, %%XMM6, %%T_key
+ vpxor %%XMM7, %%XMM7, %%T_key
+ vpxor %%XMM8, %%XMM8, %%T_key
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+
+%define %%T4_2 %%T4
+%if(%%num_initial_blocks>0)
+ ;; Hash in AES state
+ ;; T2 - incoming AAD hash
+ vpxor %%T2, %%T3
+
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*1]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*2]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>1)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*3]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*4]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>2)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>3)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*5]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*6]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>4)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*7]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*8]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>5)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*9]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%ifndef GCM128_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*10]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>6)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%ifdef GCM128_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*10]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif
+
+%ifdef GCM192_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*11]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*12]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif
+%ifdef GCM256_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*11]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*12]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>7)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%ifdef GCM256_MODE ; GCM256
+ vmovdqu %%T_key, [%%GDATA_KEY+16*13]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*14]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif ; GCM256 mode
+
+%if(%%num_initial_blocks>0)
+        vpsrldq %%T3, %%T6, 8                   ; shift-R %%T6 2 DWs
+        vpslldq %%T6, %%T6, 8                   ; shift-L %%T6 2 DWs
+ vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
+ vpxor %%T4, %%T6, %%T4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; First phase of the reduction
+ vmovdqu %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ ;; First phase of the reduction complete
+ vpxor %%T4, %%T4, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T4, 0x00
+ ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+ vpsrldq %%T2, %%T2, 4
+
+ vpclmulqdq %%T4, %%T3, %%T4, 0x10
+ ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+ vpslldq %%T4, %%T4, 4
+ ;; Second phase of the reduction complete
+ vpxor %%T4, %%T4, %%T2
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; The result is in %%T3
+ vpxor %%T3, %%T1, %%T4
+%else
+ ;; The hash should end up in T3
+ vmovdqa %%T3, %%T2
+%endif
+
+ ;; Final hash is now in T3
+%if %%num_initial_blocks > 0
+ ;; NOTE: obsolete in case %%num_initial_blocks = 0
+ sub %%LENGTH, 16*%%num_initial_blocks
+%endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+%if %%num_initial_blocks > 0
+ ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0
+        ;; This macro is executed for length 128 and up,
+ ;; zero length is checked in GCM_ENC_DEC.
+ ;; If the last block is partial then the xor will be done later
+ ;; in ENCRYPT_FINAL_PARTIAL_BLOCK.
+ ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128
+ cmp %%LENGTH, 128
+ jl %%_initial_skip_last_word_write
+%endif
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ ;; Update %%LENGTH with the number of blocks processed
+ sub %%LENGTH, 16
+ add %%DATA_OFFSET, 16
+%%_initial_skip_last_word_write:
+ sub %%LENGTH, 128-16
+ add %%DATA_OFFSET, 128-16
+
+ vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
+ ;; Combine GHASHed value with the corresponding ciphertext
+ vpxor %%XMM1, %%XMM1, %%T3
+ vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+;;; INITIAL_BLOCKS macro with support for a partial final block.
+;;; num_initial_blocks is expected to include the partial final block
+;;; in the count.
+%macro INITIAL_BLOCKS_PARTIAL 25
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%LENGTH %5
+%define %%DATA_OFFSET %6
+%define %%num_initial_blocks %7 ; can be 1 to 8 (not 0)
+%define %%T1 %8
+%define %%T2 %9
+%define %%T3 %10 ; [out] hash value
+%define %%T4 %11
+%define %%T5 %12
+%define %%CTR %13
+%define %%XMM1 %14
+%define %%XMM2 %15
+%define %%XMM3 %16
+%define %%XMM4 %17
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21 ; [in] hash value
+%define %%T6 %22
+%define %%T_key %23
+%define %%ENC_DEC %24
+%define %%INSTANCE_TYPE %25
+
+ ;; Move AAD_HASH to temp reg
+ vmovdqu %%T2, %%XMM8
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ ;; Compute AES counters
+ vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
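+ ;; The counter blocks live in reg(9-num_initial_blocks) .. reg(8), so the
+ ;; final (possibly partial) block always ends up in xmm8/reg(8), which the
+ ;; partial block hashing further down relies on.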
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ ; Start AES for %%num_initial_blocks blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Hash all but the last block of data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks-1
+ ;; Encrypt the message for all but the last block
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+%ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+%endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+%assign i (i+1)
+%endrep
+
+%if %%num_initial_blocks > 1
+ ;; The final block of data may be <16B
+ sub %%LENGTH, 16*(%%num_initial_blocks-1)
+%endif
+
+%if %%num_initial_blocks < 8
+ ;; NOTE: for num_initial_blocks = 8 the branch would always be taken,
+ ;; so the check is skipped and the partial block path is used directly.
+ ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128.
+ cmp %%LENGTH, 16
+ jl %%_small_initial_partial_block
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle a full length final block - encrypt and hash all blocks
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ sub %%LENGTH, 16
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+
+ ;; Encrypt the message
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+%ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+%endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+
+ ;; Hash all of the data
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+
+%if(%%num_initial_blocks>last_block_to_hash)
+ ;; Hash in AES state
+ vpxor %%T2, reg(j)
+
+ ;; T2 - incoming AAD hash
+ ;; reg(i) holds ciphertext
+ ;; T5 - hash key
+ ;; T6 - updated xor
+ ;; reg(1)/xmm1 should now be available for tmp use
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%assign rep_count (%%num_initial_blocks-1)
+%rep rep_count
+
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T3, reg(j), %%T5, 0x11
+ vpxor %%T1, %%T1, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x00
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%endrep
+
+ ;; Record that a reduction is needed
+ mov r12, 1
+
+ jmp %%_small_initial_compute_hash
+
+
+%endif ; %if %%num_initial_blocks < 8
+
+%%_small_initial_partial_block:
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle ghash for a <16B final block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; If this is a single call to encrypt we can hash all of the
+ ;; data, but for an init / update / finalize series of calls we
+ ;; must leave the last block out of the hash when it is less
+ ;; than a full block of data.
+
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i)
+ ;; Handle a partial final block
+ ;; GDATA, KEY, T1, T2
+ ;; r13 - length
+ ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long
+ ;; NOTE: could be replaced with %%LENGTH but at this point
+ ;; %%LENGTH is always less than 16.
+ ;; No PLAIN_CYPH_LEN argument available in this macro.
+ ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET
+ vpshufb reg(i), [rel SHUF_MASK]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks-1)
+%assign last_block_to_hash 1
+%else
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+%endif
+
+%if(%%num_initial_blocks>last_block_to_hash)
+ ;; Record that a reduction is needed
+ mov r12, 1
+ ;; Hash in AES state
+ vpxor %%T2, reg(j)
+
+ ;; T2 - incoming AAD hash
+ ;; reg(i) holds ciphertext
+ ;; T5 - hash key
+ ;; T6 - updated xor
+ ;; reg(1)/xmm1 should now be available for tmp use
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+%else
+ ;; Record that a reduction is not needed -
+ ;; In this case no hashes are computed because there
+ ;; is only one initial block and it is < 16B in length.
+ xor r12, r12
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign rep_count (%%num_initial_blocks-2)
+%%_multi_call_hash:
+%else
+%assign rep_count (%%num_initial_blocks-1)
+%endif
+
+%if rep_count < 0
+ ;; fix for negative rep_count
+%assign rep_count 0
+%endif
+
+%rep rep_count
+
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T3, reg(j), %%T5, 0x11
+ vpxor %%T1, %%T1, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x00
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%endrep
+
+%%_small_initial_compute_hash:
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Ghash reduction
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if(%%num_initial_blocks=1)
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; We only need to check if a reduction is needed if
+ ;; initial_blocks == 1 and init/update/final is being used.
+ ;; In this case we may just have a partial block, and that
+ ;; gets hashed in finalize.
+ ;; cmp r12, 0
+ or r12, r12
+ je %%_no_reduction_needed
+%endif
+%endif
+
+ vpsrldq %%T3, %%T6, 8 ; shift-R %%T6 2 DWs (into %%T3)
+ vpslldq %%T6, %%T6, 8 ; shift-L %%T6 2 DWs
+ vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
+ vpxor %%T4, %%T6, %%T4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; First phase of the reduction
+ vmovdqu %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x01
+ ;; shift-L xmm2 2 DWs
+ vpslldq %%T2, %%T2, 8
+ vpxor %%T4, %%T4, %%T2
+
+ ;; First phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Second phase of the reduction
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x00
+ ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+ vpsrldq %%T2, %%T2, 4
+
+ vpclmulqdq %%T4, %%T3, %%T4, 0x10
+ ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+ vpslldq %%T4, %%T4, 4
+
+ vpxor %%T4, %%T4, %%T2
+ ;; Second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T3, %%T1, %%T4
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; If using init/update/finalize, we need to xor any partial block data
+ ;; into the hash.
+%if %%num_initial_blocks > 1
+ ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place
+%if %%num_initial_blocks != 8
+ ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero
+ cmp qword [%%GDATA_CTX + PBlockLen], 0
+ je %%_no_partial_block_xor
+%endif ; %%num_initial_blocks != 8
+ vpxor %%T3, %%T3, reg(8)
+%%_no_partial_block_xor:
+%endif ; %%num_initial_blocks > 1
+%endif ; %%INSTANCE_TYPE, multi_call
+
+%if(%%num_initial_blocks=1)
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%_no_reduction_needed case only valid for
+ ;; multi_call with initial_blocks = 1.
+ ;; Look for comment above around '_no_reduction_needed'
+ ;; The jmp below is obsolete as the code will fall through.
+
+ ;; The result is in %%T3
+ jmp %%_after_reduction
+
+%%_no_reduction_needed:
+ ;; The hash should end up in T3. The only way we should get here is if
+ ;; there is a partial block of data, so xor that into the hash.
+ vpxor %%T3, %%T2, reg(8)
+%endif ; %%INSTANCE_TYPE = multi_call
+%endif ; %%num_initial_blocks=1
+
+%%_after_reduction:
+ ;; Final hash is now in T3
+
+%endmacro ; INITIAL_BLOCKS_PARTIAL
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
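+; The macro interleaves two independent streams of work to hide latency:
+; the 8 counter blocks of the current iteration are run through the AES
+; rounds (vaesenc) while the 8 ciphertext blocks produced by the previous
+; iteration (saved in %%T2 and on the stack in TMP2..TMP8) are multiplied
+; by HashKey_8..HashKey and accumulated for GHASH (vpclmulqdq).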
+%macro GHASH_8_ENCRYPT_8_PARALLEL 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+%define %%FULL_PARTIAL %23
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR CNT
+ vmovdqu %%T5, [rel TWO]
+ vpaddd %%XMM2, %%CTR, %%T5
+ vpaddd %%XMM3, %%XMM1, %%T5
+ vpaddd %%XMM4, %%XMM2, %%T5
+ vpaddd %%XMM5, %%XMM3, %%T5
+ vpaddd %%XMM6, %%XMM4, %%T5
+ vpaddd %%XMM7, %%XMM5, %%T5
+ vpaddd %%XMM8, %%XMM6, %%T5
+ vmovdqa %%CTR, %%XMM8
+
+ vmovdqu %%T5, [rel SHUF_MASK]
+ vpshufb %%XMM1, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM2, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM3, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM4, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM5, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM6, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM7, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM8, %%T5 ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [rel ONEf] ; INCR CNT
+ vmovdqu %%T5, [rel TWOf]
+ vpaddd %%XMM2, %%CTR, %%T5
+ vpaddd %%XMM3, %%XMM1, %%T5
+ vpaddd %%XMM4, %%XMM2, %%T5
+ vpaddd %%XMM5, %%XMM3, %%T5
+ vpaddd %%XMM6, %%XMM4, %%T5
+ vpaddd %%XMM7, %%XMM5, %%T5
+ vpaddd %%XMM8, %%XMM6, %%T5
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ vpxor %%XMM2, %%XMM2, %%T1
+ vpxor %%XMM3, %%XMM3, %%T1
+ vpxor %%XMM4, %%XMM4, %%T1
+ vpxor %%XMM5, %%XMM5, %%T1
+ vpxor %%XMM6, %%XMM6, %%T1
+ vpxor %%XMM7, %%XMM7, %%T1
+ vpxor %%XMM8, %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T1, %%T4, %%T3
+
+
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ %ifndef GCM128_MODE ; GCM192 or GCM256
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+%endif
+%ifdef GCM256_MODE
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14]
+%endif ; GCM256
+
+%assign i 0
+%assign j 1
+%rep 8
+
+ ;; SNP TBD: This is pretty ugly - consider whether just XORing the
+ ;; data in after vaesenclast is simpler and equally performant. It
+ ;; would also have to be rippled through partial block and ghash_mul_8.
+%ifidn %%FULL_PARTIAL, full
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2
+ %else
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %endif
+
+%else
+ ; Don't read the final data during partial block processing
+ %ifdef NT_LD
+ %if (i<7)
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ ;; Stage the key directly in T2 rather than hash it with plaintext
+ vmovdqu %%T2, %%T5
+ %endif
+ %else
+ %if (i<7)
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %else
+ ;; Stage the key directly in T2 rather than hash it with plaintext
+ vmovdqu %%T2, %%T5
+ %endif
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2
+ %else
+ %if (i<7)
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ ;; Do not read the data since it could fault
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %else
+ vaesenclast reg(j), reg(j), %%T2
+ %endif
+ %endif
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T6 2 DWs (into %%T3)
+ vpsrldq %%T6, %%T6, 8 ; shift-R %%T6 2 DWs
+ vpxor %%T7, %%T7, %%T3
+ vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ %ifidn %%ENC_DEC, ENC
+ ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7
+ %ifidn %%FULL_PARTIAL, full
+ ;; Avoid writing past the buffer if handling a partial block
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8
+ %endif
+ %endif
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T1, %%T1, %%T4 ; the result is in %%T1
+
+ vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
+
+
+ vpxor %%XMM1, %%T1
+
+
+%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL
+
+
+; GHASH the last 8 ciphertext blocks.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+
+ ;; Karatsuba Method
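+ ;; Karatsuba needs only 3 carry-less multiplies per 128-bit block instead
+ ;; of 4: with a = a1:a0 and b = b1:b0,
+ ;; hi = a1*b1, lo = a0*b0, mid = (a1^a0)*(b1^b0),
+ ;; and the middle term of the product is mid ^ hi ^ lo.
+ ;; %%T6 accumulates the hi products, %%T7 the lo products and %%XMM1 the
+ ;; mid products; the final xors below recover the middle term before the
+ ;; 256-bit result is reduced.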
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM1
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM8
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
+
+
+; GHASH the last 7 ciphertext blocks.
+%macro GHASH_LAST_7 15
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+
+ ;; Karatsuba Method
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM1
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_1]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
+
+
+
+;;; Handle encryption of the final partial block
+;;; IN:
+;;; r13 - Number of bytes to read
+;;; MODIFIES:
+;;; KEY - Key for encrypting the partial block
+;;; SMASHES:
+;;; rax, T1
+;;; Note:
+;;; PLAIN_CYPH_LEN is unused at this stage; previously it was
+;;; used to determine whether the buffer was big enough to do
+;;; a 16 byte read & shift.
+;;; 'LT16' is passed here only if the buffer is known to be
+;;; smaller than 16 bytes.
+;;; Any other value passed here results in the 16 byte read
+;;; code path.
+%macro ENCRYPT_FINAL_PARTIAL_BLOCK 7
+%define %%KEY %1
+%define %%T1 %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET %7
+
+ ;; %%PLAIN_CYPH_IN + %%DATA_OFFSET
+ ;; - input data address
+ ;; r13 - input data length
+ ;; rax - temp registers
+ ;; out:
+ ;; T1 - packed output
+ ;; k1 - valid byte mask
+ READ_SMALL_DATA_INPUT %%T1, %%PLAIN_CYPH_IN+%%DATA_OFFSET, r13, rax
+
+ ;; At this point T1 contains the partial block data
+ ;; Plaintext XOR E(K, Yn)
+ vpxorq %%KEY, %%KEY, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Output r13 Bytes
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{k1}, %%KEY
+
+%ifidn %%ENC_DEC, DEC
+ ;; If decrypt, restore the ciphertext into %%KEY
+ vmovdqa64 %%KEY, %%T1
+%else
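+ ;; For encrypt, zero the bytes beyond the valid length (k1 mask) so that
+ ;; only real ciphertext bytes are fed into the GHASH of this partial block.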
+ vmovdqu8 %%KEY{k1}{z}, %%KEY
+%endif
+%endmacro ; ENCRYPT_FINAL_PARTIAL_BLOCK
+
+
+
+; Encryption of a single block
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i]
+%endmacro
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
+; Additional Authentication data (A_IN), Additional Data length (A_LEN).
+; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 8
+%define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer
+%define %%GDATA_CTX %2 ; [in] GCM context pointer
+%define %%IV %3 ; [in] IV pointer
+%define %%A_IN %4 ; [in] AAD pointer
+%define %%A_LEN %5 ; [in] AAD length in bytes
+%define %%GPR1 %6 ; temp GPR
+%define %%GPR2 %7 ; temp GPR
+%define %%GPR3 %8 ; temp GPR
+
+%define %%AAD_HASH xmm14
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3
+
+ mov %%GPR1, %%A_LEN
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx_data.aad_length = aad_length
+
+ xor %%GPR1, %%GPR1
+ mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx_data.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx_data.partial_block_length = 0
+
+ ;; read 12 IV bytes and pad with 0x00000001
+ mov %%GPR2, %%IV
+ vmovd xmm3, [%%GPR2 + 8]
+ vpslldq xmm3, 8
+ vmovq xmm2, [%%GPR2]
+ vmovdqa xmm4, [rel ONEf]
+ vpternlogq xmm2, xmm3, xmm4, 0xfe ; xmm2 = xmm2 or xmm3 or xmm4
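+ ;; xmm2 = J0 = IV || 0^31 || 1: IV bytes 0-7 from the vmovq, IV bytes
+ ;; 8-11 shifted into place via xmm3 and the trailing 0x00000001 supplied
+ ;; by ONEf. The imm8 0xfe is the truth table of a three-input OR, so
+ ;; vpternlogq merges all three sources in a single instruction.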
+
+ vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
+
+ ;; store IV as counter in LE format
+ vpshufb xmm2, [rel SHUF_MASK]
+ vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
+%endmacro
+
+%macro GCM_ENC_DEC_SMALL 12
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET %7
+%define %%LENGTH %8 ; assumed r13
+%define %%NUM_BLOCKS %9
+%define %%CTR %10 ; assumed xmm9
+%define %%HASH_OUT %11 ; assumed xmm14
+%define %%INSTANCE_TYPE %12
+
+ ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC.
+ ;; cmp %%NUM_BLOCKS, 0
+ ;; je %%_small_initial_blocks_encrypted
+ cmp %%NUM_BLOCKS, 8
+ je %%_small_initial_num_blocks_is_8
+ cmp %%NUM_BLOCKS, 7
+ je %%_small_initial_num_blocks_is_7
+ cmp %%NUM_BLOCKS, 6
+ je %%_small_initial_num_blocks_is_6
+ cmp %%NUM_BLOCKS, 5
+ je %%_small_initial_num_blocks_is_5
+ cmp %%NUM_BLOCKS, 4
+ je %%_small_initial_num_blocks_is_4
+ cmp %%NUM_BLOCKS, 3
+ je %%_small_initial_num_blocks_is_3
+ cmp %%NUM_BLOCKS, 2
+ je %%_small_initial_num_blocks_is_2
+
+ jmp %%_small_initial_num_blocks_is_1
+
+
+%%_small_initial_num_blocks_is_8:
+ ;; r13 - %%LENGTH
+ ;; xmm12 - T1
+ ;; xmm13 - T2
+ ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
+ ;; xmm15 - T4
+ ;; xmm11 - T5
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8 - AAD HASH IN
+ ;; xmm10 - T6
+ ;; xmm0 - T_key
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 8, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_7:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 7, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_6:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 6, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_5:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 5, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_4:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 4, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_3:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 3, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_2:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 2, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_1:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 1, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+%%_small_initial_blocks_encrypted:
+
+%endmacro ; GCM_ENC_DEC_SMALL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data to be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
+; Output: The ciphertext of the given plaintext (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 7
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%INSTANCE_TYPE %7
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16-byte blocks in the message
+; process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_encrypt_by_8_parallel_done'
+; if there is a block of less than 16 bytes process it '%%_encrypt_final_partial .. %%_encrypt_done'
+
+%ifidn __OUTPUT_FORMAT__, win64
+ cmp %%PLAIN_CYPH_LEN, 0
+%else
+ or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN
+%endif
+ je %%_enc_dec_done
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ ;; Update length of data processed
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + InLen], rax
+%else
+ add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
+%endif
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ vmovdqu xmm8, [%%GDATA_CTX + AadHash]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: partial block processing only makes sense for multi_call here.
+ ;; Used for the update flow - if there was a previous partial
+ ;; block fill the remaining bytes here.
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+%endif
+
+ ;; lift CTR set from initial_blocks to here
+%ifidn %%INSTANCE_TYPE, single_call
+ vmovdqu xmm9, xmm2
+%else
+ vmovdqu xmm9, [%%GDATA_CTX + CurCount]
+%endif
+
+ ;; Save the amount of data left to process in r13
+ mov r13, %%PLAIN_CYPH_LEN
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%DATA_OFFSET is zero in single_call case.
+ ;; Consequently PLAIN_CYPH_LEN will never be zero after
+ ;; %%DATA_OFFSET subtraction below.
+ sub r13, %%DATA_OFFSET
+
+ ;; There may be no more data if it was consumed in the partial block.
+ cmp r13, 0
+ je %%_enc_dec_done
+%endif ; %%INSTANCE_TYPE, multi_call
+ mov r10, r13
+
+ ;; Determine how many blocks to process in INITIAL
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+
+ ;; Process one additional block in INITIAL if there is a partial block
+ and r10, 0xf
+ blsmsk r10, r10 ; Set CF if zero
+ cmc ; Flip CF
+ adc r12, 0x0 ; Process an additional INITIAL block if CF set
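+ ;; Example: r13 = 40 bytes -> r12 = (40 >> 4) & 7 = 2 full blocks and
+ ;; r10 = 40 & 0xf = 8, a non-zero tail. blsmsk clears CF for a non-zero
+ ;; source, cmc sets it and adc bumps r12 to 3, so the partial block is
+ ;; counted as one more INITIAL block.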
+
+ ;; Messages shorter than 128 bytes are handled by the small message code,
+ ;; which can process up to 7 full 16B blocks plus a partial block.
+ cmp r13, 128
+ jge %%_large_message_path
+
+ GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
+ jmp %%_ghash_done
+
+%%_large_message_path:
+ and r12, 0x7 ; Still, don't allow 8 INITIAL blocks since this
+ ; can be handled by the x8 partial loop.
+
+ cmp r12, 0
+ je %%_initial_num_blocks_is_0
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ ;; r13 - %%LENGTH
+ ;; xmm12 - T1
+ ;; xmm13 - T2
+ ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
+ ;; xmm15 - T4
+ ;; xmm11 - T5
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8 - AAD HASH IN
+ ;; xmm10 - T6
+ ;; xmm0 - T_key
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ ;; If the entire message was processed in INITIAL, only the hash remains to be computed
+ cmp r13, 0
+ je %%_encrypt_done
+
+ ;; Encrypt the final <16 byte (partial) block, then hash
+ cmp r13, 16
+ jl %%_encrypt_final_partial
+
+ ;; Process 7 full blocks plus a partial block
+ cmp r13, 128
+ jl %%_encrypt_by_8_partial
+
+
+%%_encrypt_by_8_parallel:
+ ;; in_order vs. out_order is an optimization to increment the counter without shuffling
+ ;; it back into little endian. r15d keeps track of when we need to increment in order so
+ ;; that the carry is handled correctly.
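+ ;; The counter is normally kept byte-swapped (AES byte order) so it can be
+ ;; incremented with the ONEf/TWOf constants and fed straight to the cipher
+ ;; (out_order). That addition cannot carry across bytes, so when the low
+ ;; counter byte tracked in r15d would cross 255 within the next 8 blocks
+ ;; the in_order path shuffles the counter back to little endian,
+ ;; increments it there and re-shuffles it afterwards.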
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [rel SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ ;; xmm0 - T1
+ ;; xmm10 - T2
+ ;; xmm11 - T3
+ ;; xmm12 - T4
+ ;; xmm13 - T5
+ ;; xmm14 - T6
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8
+ ;; xmm15 - T7
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ cmp r13, 128
+ jge %%_encrypt_by_8_new
+
+ vpshufb xmm9, [rel SHUF_MASK]
+ jmp %%_encrypt_by_8_parallel_done
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [rel SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
+ vpshufb xmm9, [rel SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ cmp r13, 128
+ jge %%_encrypt_by_8_new
+ vpshufb xmm9, [rel SHUF_MASK]
+
+
+%%_encrypt_by_8_parallel_done:
+ ;; Test to see if we need a by 8 with partial block. At this point
+ ;; bytes remaining should be either zero or between 113-127.
+ cmp r13, 0
+ je %%_encrypt_done
+
+%%_encrypt_by_8_partial:
+ ;; Shuffle needed to align key for partial block xor. out_order
+ ;; is a little faster because it avoids extra shuffles.
+ ;; TBD: Might need to account for when we don't have room to increment the counter.
+
+
+ ;; Process parallel buffers with a final partial block.
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial
+
+
+ add %%DATA_OFFSET, 128-16
+ sub r13, 128-16
+
+%%_encrypt_final_partial:
+
+ vpshufb xmm8, [rel SHUF_MASK]
+ mov [%%GDATA_CTX + PBlockLen], r13
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8
+
+ ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext
+ ;; GDATA, KEY, T1, T2
+ ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET
+
+ vpshufb xmm8, [rel SHUF_MASK]
+
+
+%%_encrypt_done:
+
+ ;; Mapping to macro parameters
+ ;; IN:
+ ;; xmm9 contains the counter
+ ;; xmm1-xmm8 contain the xor'd ciphertext
+ ;; OUT:
+ ;; xmm14 contains the final hash
+ ;; GDATA, T1, T2, T3, T4, T5, T6, T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+%ifidn %%INSTANCE_TYPE, multi_call
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ jz %%_hash_last_8
+ GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ ;; XOR the partial block into the hash
+ vpxor xmm14, xmm14, xmm8
+ jmp %%_ghash_done
+%endif
+%%_hash_last_8:
+ GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+%%_ghash_done:
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
+
+%%_enc_dec_done:
+
+
+%endmacro ; GCM_ENC_DEC
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of the last partial block after GCM_UPDATE finishes.
+; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%INSTANCE_TYPE %6
+%define %%PLAIN_CYPH_LEN rax
+
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ ;; Start AES as early as possible
+ vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; In the single_call case (init, update and finalize fused into one
+ ;; call) the hash is already live in xmm14, so loading AadHash - and
+ ;; its write-to-read dependency - is only needed for multi_call.
+ vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+
+ ;; Hash the final partial block. If this was a single call then the
+ ;; partial block was already handled in the main GCM_ENC_DEC macro.
+ mov r12, [%%GDATA_CTX + PBlockLen]
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+%endif
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
+ vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap
+
+ vpxor xmm9, xmm9, xmm14
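+ ;; xmm9 = E(K, Y0) XOR GHASH(A, C, len(A)||len(C)), i.e. the full 16-byte
+ ;; authentication tag; it is truncated to AUTH_TAG_LEN below.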
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+ cmp r11, 8
+ je %%_T_8
+
+ simd_store_avx r10, xmm9, r11, r12, rax
+ jmp %%_return_T_done
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+
+%ifdef SAFE_DATA
+ ;; Clear sensitive data from context structure
+ vpxor xmm0, xmm0
+ vmovdqu [%%GDATA_CTX + AadHash], xmm0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0
+%endif
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_avx512 /
+; aes_gcm_precomp_192_avx512 /
+; aes_gcm_precomp_256_avx512
+; (struct gcm_key_data *key_data)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(precomp,_),function,)
+FN_NAME(precomp,_):
+;; Parameter is passed through register
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_precomp
+%endif
+
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [rel SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
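+ ;; xmm6 = H = E(K, 0^128) (computed above, byte-swapped). The sequence
+ ;; below multiplies H by x in GF(2^128): a 128-bit left shift by one bit
+ ;; (vpsllq plus the carry bits moved across the qword boundary), followed
+ ;; by a branchless conditional xor with POLY when the bit shifted out of
+ ;; the top was set (the vpshufd/vpcmpeqd/vpand sequence builds that mask).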
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, xmm6, 1
+ vpsrlq xmm2, xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [rel TWOONE]
+ vpand xmm2, xmm2, [rel POLY]
+ vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+
+exit_precomp:
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_avx512 / aes_gcm_init_192_avx512 / aes_gcm_init_256_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(init,_),function,)
+FN_NAME(init,_):
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ push r14
+ push r15
+ mov r14, rsp
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 1*16
+ movdqu [rsp + 0*16], xmm6
+%endif
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_init
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_init
+
+ ;; Check IV != NULL
+ cmp arg3, 0
+ jz exit_init
+
+ ;; Check if aad_len == 0
+ cmp arg5, 0
+ jz skip_aad_check_init
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg4, 0
+ jz exit_init
+
+skip_aad_check_init:
+%endif
+ GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%endif
+exit_init:
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6 , [rsp + 0*16]
+ mov rsp, r14
+ pop r15
+ pop r14
+%endif
+ pop r13
+ pop r12
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_avx512 / aes_gcm_enc_192_update_avx512 /
+; aes_gcm_enc_256_update_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_update_),function,)
+FN_NAME(enc,_update_):
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_enc
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_update_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_enc
+
+skip_in_out_check_update_enc:
+%endif
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
+
+exit_update_enc:
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_avx512 / aes_gcm_dec_192_update_avx512 /
+; aes_gcm_dec_256_update_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_update_),function,)
+FN_NAME(dec,_update_):
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_dec
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_update_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_dec
+
+skip_in_out_check_update_dec:
+%endif
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
+
+exit_update_dec:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_avx512 / aes_gcm_enc_192_finalize_avx512 /
+; aes_gcm_enc_256_finalize_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_finalize_),function,)
+FN_NAME(enc,_finalize_):
+
+;; All parameters are passed through registers
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_enc_fin
+
+ cmp arg4, 16
+ ja exit_enc_fin
+%endif
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm9
+ vmovdqu [rsp + 2*16], xmm11
+ vmovdqu [rsp + 3*16], xmm14
+ vmovdqu [rsp + 4*16], xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + 4*16]
+ vmovdqu xmm14, [rsp + 3*16]
+ vmovdqu xmm11, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+
+exit_enc_fin:
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_avx512 / aes_gcm_dec_192_finalize_avx512
+; aes_gcm_dec_256_finalize_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_finalize_),function,)
+FN_NAME(dec,_finalize_):
+
+;; All parameters are passed through registers
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_dec_fin
+
+ cmp arg4, 16
+ ja exit_dec_fin
+%endif
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm9
+ vmovdqu [rsp + 2*16], xmm11
+ vmovdqu [rsp + 3*16], xmm14
+ vmovdqu [rsp + 4*16], xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + 4*16]
+ vmovdqu xmm14, [rsp + 3*16]
+ vmovdqu xmm11, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+exit_dec_fin:
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_avx512 / aes_gcm_enc_192_avx512 / aes_gcm_enc_256_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_),function,)
+FN_NAME(enc,_):
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_enc
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_enc
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_enc
+
+ cmp arg10, 16
+ ja exit_enc
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_enc
+
+skip_in_out_check_enc:
+ ;; Check if aad_len == 0
+ cmp arg8, 0
+ jz skip_aad_check_enc
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_enc
+
+skip_aad_check_enc:
+%endif
+ GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call
+
+exit_enc:
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_avx512 / aes_gcm_dec_192_avx512 / aes_gcm_dec_256_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_),function,)
+FN_NAME(dec,_):
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_dec
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_dec
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_dec
+
+ cmp arg10, 16
+ ja exit_dec
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_dec
+
+skip_in_out_check_dec:
+ ;; Check if aad_len == 0
+ cmp arg8, 0
+ jz skip_aad_check_dec
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_dec
+
+skip_aad_check_dec:
+%endif
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call
+
+exit_dec:
+ FUNC_RESTORE
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm
new file mode 100644
index 000000000..4ef183d31
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm
@@ -0,0 +1,4272 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+; Tomasz Kantecki
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+; The details of the implementation are explained in:
+; Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16-byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; from the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports aadLen of any length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one-tab and two-tab indentations are used. One tab is for the GHASH part, two tabs are for the AES part.
+;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "include/clear_regs.asm"
+%include "include/gcm_defines.asm"
+%include "include/gcm_keys_vaes_avx512.asm"
+%include "include/memcpy.asm"
+%include "include/aes_common.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_vaes_avx512.asm!"
+%endif
+%endif
+%endif
+
+;; Decide on AES-GCM key size to compile for
+%ifdef GCM128_MODE
+%define NROUNDS 9
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ vaes_avx512
+%endif
+
+%ifdef GCM192_MODE
+%define NROUNDS 11
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ vaes_avx512
+%endif
+
+%ifdef GCM256_MODE
+%define NROUNDS 13
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ vaes_avx512
+%endif
+
+section .text
+default rel
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Stack frame definition
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE (10*16) ; space for 10 XMM registers
+ %define GP_STORAGE ((9*8) + 24) ; space for 9 GP registers + 24 bytes for 64 byte alignment
+%else
+ %define XMM_STORAGE 0
+ %define GP_STORAGE (8*8) ; space for 7 GP registers + 1 for alignment
+%endif
+%ifdef GCM_BIG_DATA
+%define LOCAL_STORAGE (128*16) ; space for up to 128 AES blocks
+%else
+%define LOCAL_STORAGE (48*16) ; space for up to 48 AES blocks
+%endif
+
+;;; sequence is (bottom-up): GP, XMM, local
+%define STACK_GP_OFFSET 0
+%define STACK_XMM_OFFSET (STACK_GP_OFFSET + GP_STORAGE)
+%define STACK_LOCAL_OFFSET (STACK_XMM_OFFSET + XMM_STORAGE)
+%define STACK_FRAME_SIZE (STACK_LOCAL_OFFSET + LOCAL_STORAGE)
+
+;; for compatibility with stack argument definitions in gcm_defines.asm
+%define STACK_OFFSET 0
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Horizontal XOR - 4 x 128bits xored together
+%macro VHPXORI4x128 2
+%define %%REG %1 ; [in/out] ZMM with 4x128bits to xor; 128bit output
+%define %%TMP %2 ; [clobbered] ZMM temporary register
+ vextracti64x4 YWORD(%%TMP), %%REG, 1
+ vpxorq YWORD(%%REG), YWORD(%%REG), YWORD(%%TMP)
+ vextracti32x4 XWORD(%%TMP), YWORD(%%REG), 1
+ vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP)
+%endmacro ; VHPXORI4x128
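+;;; Added note (illustrative, not part of the original source): VHPXORI4x128
+;;; folds the four 128-bit lanes of a ZMM register into lane 0, i.e.
+;;;     result[127:0] = lane0 xor lane1 xor lane2 xor lane3
+;;; A minimal usage sketch (register choice is arbitrary):
+;;;     VHPXORI4x128 zmm1, zmm2   ; xmm1 = xor of the four 128-bit lanes of zmm1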
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Horizontal XOR - 2 x 128bits xored together
+%macro VHPXORI2x128 2
+%define %%REG %1 ; [in/out] YMM/ZMM with 2x128bits to xor; 128bit output
+%define %%TMP %2 ; [clobbered] XMM/YMM/ZMM temporary register
+ vextracti32x4 XWORD(%%TMP), %%REG, 1
+ vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP)
+%endmacro ; VHPXORI2x128
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply - 1st step
+%macro VCLMUL_STEP1 6-7
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [in] previous blocks 4 to 7
+%define %%TMP %3 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TH %4 ; [out] high product
+%define %%TM %5 ; [out] medium product
+%define %%TL %6 ; [out] low product
+%define %%HKEY %7 ; [in/optional] hash key for multiplication
+
+%if %0 == 6
+ vmovdqu64 %%TMP, [%%KP + HashKey_4]
+%else
+ vmovdqa64 %%TMP, %%HKEY
+%endif
+        vpclmulqdq      %%TH, %%HI, %%TMP, 0x11     ; %%TH = a1*b1
+        vpclmulqdq      %%TL, %%HI, %%TMP, 0x00     ; %%TL = a0*b0
+        vpclmulqdq      %%TM, %%HI, %%TMP, 0x01     ; %%TM = a1*b0
+        vpclmulqdq      %%TMP, %%HI, %%TMP, 0x10    ; %%TMP = a0*b1
+ vpxorq %%TM, %%TM, %%TMP ; [%%TH : %%TM : %%TL]
+%endmacro ; VCLMUL_STEP1
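+;;; Added note (not part of the original source): VCLMUL_STEP1/VCLMUL_STEP2
+;;; implement a carry-less schoolbook multiply split in two halves. For each
+;;; 128-bit block a = a1:a0 and hash key b = b1:b0 (64-bit halves):
+;;;     a * b = (a1*b1)<<128 xor ((a1*b0) xor (a0*b1))<<64 xor (a0*b0)
+;;; STEP1 produces the high (TH), medium (TM) and low (TL) partial products
+;;; for the four blocks in %%HI, multiplied lane-wise against the hash key
+;;; powers loaded (by default) from the HashKey_4 offset.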
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply - 2nd step
+%macro VCLMUL_STEP2 9-11
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [out] ghash high 128 bits
+%define %%LO %3 ; [in/out] cipher text blocks 0-3 (in); ghash low 128 bits (out)
+%define %%TMP0 %4 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TMP1 %5 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TMP2 %6 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TH %7 ; [in] high product
+%define %%TM %8 ; [in] medium product
+%define %%TL %9 ; [in] low product
+%define %%HKEY %10 ; [in/optional] hash key for multiplication
+%define %%HXOR %11 ; [in/optional] type of horizontal xor (4 - 4x128; 2 - 2x128; 1 - none)
+
+%if %0 == 9
+ vmovdqu64 %%TMP0, [%%KP + HashKey_8]
+%else
+ vmovdqa64 %%TMP0, %%HKEY
+%endif
+ vpclmulqdq %%TMP1, %%LO, %%TMP0, 0x10 ; %%TMP1 = a0*b1
+ vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x11 ; %%TMP2 = a1*b1
+ vpxorq %%TH, %%TH, %%TMP2
+ vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x00 ; %%TMP2 = a0*b0
+ vpxorq %%TL, %%TL, %%TMP2
+ vpclmulqdq %%TMP0, %%LO, %%TMP0, 0x01 ; %%TMP0 = a1*b0
+ vpternlogq %%TM, %%TMP1, %%TMP0, 0x96 ; %%TM = TM xor TMP1 xor TMP0
+
+ ;; finish multiplications
+ vpsrldq %%TMP2, %%TM, 8
+ vpxorq %%HI, %%TH, %%TMP2
+ vpslldq %%TMP2, %%TM, 8
+ vpxorq %%LO, %%TL, %%TMP2
+
+        ;; xor 128-bit words horizontally and compute [(X8*H1) + (X7*H2) + ... + ((X1+Y0)*H8)]
+ ;; note: (X1+Y0) handled elsewhere
+%if %0 < 11
+ VHPXORI4x128 %%HI, %%TMP2
+ VHPXORI4x128 %%LO, %%TMP1
+%else
+%if %%HXOR == 4
+ VHPXORI4x128 %%HI, %%TMP2
+ VHPXORI4x128 %%LO, %%TMP1
+%elif %%HXOR == 2
+ VHPXORI2x128 %%HI, %%TMP2
+ VHPXORI2x128 %%LO, %%TMP1
+%endif ; HXOR
+ ;; for HXOR == 1 there is nothing to be done
+%endif ; !(%0 < 11)
+ ;; HIx holds top 128 bits
+ ;; LOx holds low 128 bits
+ ;; - further reductions to follow
+%endmacro ; VCLMUL_STEP2
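+;;; Added note (not part of the original source): STEP2 accumulates the
+;;; partial products of the four blocks in %%LO, then folds the medium term
+;;; into the high/low halves with an 8-byte shift in each direction (the
+;;; x^64 term straddles the two 128-bit halves), and finally xors the
+;;; per-lane results horizontally so a single HI:LO pair is left for the
+;;; reduction step.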
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; AVX512 reduction macro
+%macro VCLMUL_REDUCE 6
+%define %%OUT %1 ; [out] zmm/ymm/xmm: result (must not be %%TMP1 or %%HI128)
+%define %%POLY %2 ; [in] zmm/ymm/xmm: polynomial
+%define %%HI128 %3 ; [in] zmm/ymm/xmm: high 128b of hash to reduce
+%define %%LO128 %4 ; [in] zmm/ymm/xmm: low 128b of hash to reduce
+%define %%TMP0 %5 ; [in] zmm/ymm/xmm: temporary register
+%define %%TMP1 %6 ; [in] zmm/ymm/xmm: temporary register
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; first phase of the reduction
+ vpclmulqdq %%TMP0, %%POLY, %%LO128, 0x01
+ vpslldq %%TMP0, %%TMP0, 8 ; shift-L 2 DWs
+ vpxorq %%TMP0, %%LO128, %%TMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; second phase of the reduction
+ vpclmulqdq %%TMP1, %%POLY, %%TMP0, 0x00
+ vpsrldq %%TMP1, %%TMP1, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq %%OUT, %%POLY, %%TMP0, 0x10
+ vpslldq %%OUT, %%OUT, 4 ; shift-L 1-DW to obtain result with no shifts
+
+ vpternlogq %%OUT, %%TMP1, %%HI128, 0x96 ; OUT/GHASH = OUT xor TMP1 xor HI128
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endmacro
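+;;; Added note (not part of the original source): the reduction folds the
+;;; 256-bit product HI128:LO128 back to 128 bits modulo the GHASH polynomial.
+;;; The callers pass the pre-shifted POLY2 constant; the two vpclmulqdq/shift
+;;; pairs perform the two folding phases and the final vpternlogq merges both
+;;; partial results with HI128 in a single 3-way xor.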
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply (1 to 8 blocks) - 1st step
+%macro VCLMUL_1_TO_8_STEP1 8
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [in] ZMM ciphered blocks 4 to 7
+%define %%TMP1 %3 ; [clobbered] ZMM temporary
+%define %%TMP2 %4 ; [clobbered] ZMM temporary
+%define %%TH %5 ; [out] ZMM high product
+%define %%TM %6 ; [out] ZMM medium product
+%define %%TL %7 ; [out] ZMM low product
+%define %%NBLOCKS %8 ; [in] number of blocks to ghash (0 to 8)
+
+%if %%NBLOCKS == 8
+ VCLMUL_STEP1 %%KP, %%HI, %%TMP1, %%TH, %%TM, %%TL
+%elif %%NBLOCKS == 7
+ vmovdqu64 %%TMP2, [%%KP + HashKey_3]
+ vmovdqa64 %%TMP1, [rel mask_out_top_block]
+ vpandq %%TMP2, %%TMP1
+ vpandq %%HI, %%TMP1
+ VCLMUL_STEP1 NULL, %%HI, %%TMP1, %%TH, %%TM, %%TL, %%TMP2
+%elif %%NBLOCKS == 6
+ vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2]
+ VCLMUL_STEP1 NULL, YWORD(%%HI), YWORD(%%TMP1), \
+ YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2)
+%elif %%NBLOCKS == 5
+ vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1]
+ VCLMUL_STEP1 NULL, XWORD(%%HI), XWORD(%%TMP1), \
+ XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2)
+%else
+ vpxorq %%TH, %%TH
+ vpxorq %%TM, %%TM
+ vpxorq %%TL, %%TL
+%endif
+%endmacro ; VCLMUL_1_TO_8_STEP1
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply (1 to 8 blocks) - 2nd step
+%macro VCLMUL_1_TO_8_STEP2 10
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [out] ZMM ghash high 128bits
+%define %%LO %3 ; [in/out] ZMM ciphered blocks 0 to 3 (in); ghash low 128bits (out)
+%define %%TMP0 %4 ; [clobbered] ZMM temporary
+%define %%TMP1 %5 ; [clobbered] ZMM temporary
+%define %%TMP2 %6 ; [clobbered] ZMM temporary
+%define %%TH %7 ; [in/clobbered] ZMM high sum
+%define %%TM %8 ; [in/clobbered] ZMM medium sum
+%define %%TL %9 ; [in/clobbered] ZMM low sum
+%define %%NBLOCKS %10 ; [in] number of blocks to ghash (0 to 8)
+
+%if %%NBLOCKS == 8
+ VCLMUL_STEP2 %%KP, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL
+%elif %%NBLOCKS == 7
+ vmovdqu64 %%TMP2, [%%KP + HashKey_7]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 6
+ vmovdqu64 %%TMP2, [%%KP + HashKey_6]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 5
+ vmovdqu64 %%TMP2, [%%KP + HashKey_5]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 4
+ vmovdqu64 %%TMP2, [%%KP + HashKey_4]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 3
+ vmovdqu64 %%TMP2, [%%KP + HashKey_3]
+ vmovdqa64 %%TMP1, [rel mask_out_top_block]
+ vpandq %%TMP2, %%TMP1
+ vpandq %%LO, %%TMP1
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 2
+ vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2]
+ VCLMUL_STEP2 NULL, YWORD(%%HI), YWORD(%%LO), \
+ YWORD(%%TMP0), YWORD(%%TMP1), YWORD(%%TMP2), \
+ YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2), 2
+%elif %%NBLOCKS == 1
+ vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1]
+ VCLMUL_STEP2 NULL, XWORD(%%HI), XWORD(%%LO), \
+ XWORD(%%TMP0), XWORD(%%TMP1), XWORD(%%TMP2), \
+ XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2), 1
+%else
+ vpxorq %%HI, %%HI
+ vpxorq %%LO, %%LO
+%endif
+%endmacro ; VCLMUL_1_TO_8_STEP2
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; GHASH 1 to 16 blocks of cipher text
+;;; - performs reduction at the end
+;;; - can take intermediate GHASH sums as input
+%macro GHASH_1_TO_16 20
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%GHASH %2 ; [out] ghash output
+%define %%T1 %3 ; [clobbered] temporary ZMM
+%define %%T2 %4 ; [clobbered] temporary ZMM
+%define %%T3 %5 ; [clobbered] temporary ZMM
+%define %%T4 %6 ; [clobbered] temporary ZMM
+%define %%T5 %7 ; [clobbered] temporary ZMM
+%define %%T6 %8 ; [clobbered] temporary ZMM
+%define %%T7 %9 ; [clobbered] temporary ZMM
+%define %%T8 %10 ; [clobbered] temporary ZMM
+%define %%T9 %11 ; [clobbered] temporary ZMM
+%define %%GH            %12 ; [in/clobbered] ghash sum (high) or "no_zmm"
+%define %%GL            %13 ; [in/clobbered] ghash sum (low) or "no_zmm"
+%define %%GM            %14 ; [in/clobbered] ghash sum (medium) or "no_zmm"
+%define %%AAD_HASH_IN %15 ; [in] input hash value
+%define %%CIPHER_IN0 %16 ; [in] ZMM with cipher text blocks 0-3
+%define %%CIPHER_IN1 %17 ; [in] ZMM with cipher text blocks 4-7
+%define %%CIPHER_IN2 %18 ; [in] ZMM with cipher text blocks 8-11
+%define %%CIPHER_IN3 %19 ; [in] ZMM with cipher text blocks 12-15
+%define %%NUM_BLOCKS %20 ; [in] numerical value, number of blocks
+
+%define %%T0H %%T1
+%define %%T0L %%T2
+%define %%T0M1 %%T3
+%define %%T0M2 %%T4
+
+%define %%T1H %%T5
+%define %%T1L %%T6
+%define %%T1M1 %%T7
+%define %%T1M2 %%T8
+
+%define %%HK %%T9
+
+%assign hashk HashKey_ %+ %%NUM_BLOCKS
+%assign reg_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+ vpxorq %%CIPHER_IN0, %%CIPHER_IN0, %%AAD_HASH_IN
+
+%assign first_result 1
+
+%ifnidn %%GH, no_zmm
+%ifnidn %%GM, no_zmm
+%ifnidn %%GL, no_zmm
+ ;; GHASH sums passed in to be updated and
+ ;; reduced at the end
+ vmovdqa64 %%T0H, %%GH
+ vmovdqa64 %%T0L, %%GL
+ vmovdqa64 %%T0M1, %%GM
+ vpxorq %%T0M2, %%T0M2
+%assign first_result 0
+%endif
+%endif
+%endif
+
+%rep (blocks_left / 4)
+%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx
+ vmovdqu64 %%HK, [%%KP + hashk]
+%if first_result == 1
+ vpclmulqdq %%T0H, %%REG_IN, %%HK, 0x11 ; H = a1*b1
+ vpclmulqdq %%T0L, %%REG_IN, %%HK, 0x00 ; L = a0*b0
+ vpclmulqdq %%T0M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0
+ vpclmulqdq %%T0M2, %%REG_IN, %%HK, 0x10 ; TM2 = a0*b1
+%assign first_result 0
+%else
+ vpclmulqdq %%T1H, %%REG_IN, %%HK, 0x11 ; H = a1*b1
+ vpclmulqdq %%T1L, %%REG_IN, %%HK, 0x00 ; L = a0*b0
+ vpclmulqdq %%T1M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0
+ vpclmulqdq %%T1M2, %%REG_IN, %%HK, 0x10 ; M2 = a0*b1
+ vpxorq %%T0H, %%T0H, %%T1H
+ vpxorq %%T0L, %%T0L, %%T1L
+ vpxorq %%T0M1, %%T0M1, %%T1M1
+ vpxorq %%T0M2, %%T0M2, %%T1M2
+%endif
+%undef %%REG_IN
+%assign reg_idx (reg_idx + 1)
+%assign hashk (hashk + 64)
+%assign blocks_left (blocks_left - 4)
+%endrep
+
+%if blocks_left > 0
+;; There are 1, 2 or 3 blocks left to process.
+;; It may also be that they are the only blocks to process.
+
+%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx
+
+%if first_result == 1
+;; Case where %%NUM_BLOCKS = 1, 2 or 3
+%xdefine %%OUT_H %%T0H
+%xdefine %%OUT_L %%T0L
+%xdefine %%OUT_M1 %%T0M1
+%xdefine %%OUT_M2 %%T0M2
+%else
+%xdefine %%OUT_H %%T1H
+%xdefine %%OUT_L %%T1L
+%xdefine %%OUT_M1 %%T1M1
+%xdefine %%OUT_M2 %%T1M2
+%endif
+
+%if blocks_left == 1
+ vmovdqu64 XWORD(%%HK), [%%KP + hashk]
+ vpclmulqdq XWORD(%%OUT_H), XWORD(%%REG_IN), XWORD(%%HK), 0x11 ; %%TH = a1*b1
+ vpclmulqdq XWORD(%%OUT_L), XWORD(%%REG_IN), XWORD(%%HK), 0x00 ; %%TL = a0*b0
+ vpclmulqdq XWORD(%%OUT_M1), XWORD(%%REG_IN), XWORD(%%HK), 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq XWORD(%%OUT_M2), XWORD(%%REG_IN), XWORD(%%HK), 0x10 ; %%TM2 = a0*b1
+%elif blocks_left == 2
+ vmovdqu64 YWORD(%%HK), [%%KP + hashk]
+ vpclmulqdq YWORD(%%OUT_H), YWORD(%%REG_IN), YWORD(%%HK), 0x11 ; %%TH = a1*b1
+ vpclmulqdq YWORD(%%OUT_L), YWORD(%%REG_IN), YWORD(%%HK), 0x00 ; %%TL = a0*b0
+ vpclmulqdq YWORD(%%OUT_M1), YWORD(%%REG_IN), YWORD(%%HK), 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq YWORD(%%OUT_M2), YWORD(%%REG_IN), YWORD(%%HK), 0x10 ; %%TM2 = a0*b1
+%else ; blocks_left == 3
+ vmovdqu64 YWORD(%%HK), [%%KP + hashk]
+ vinserti64x2 %%HK, [%%KP + hashk + 32], 2
+ vpclmulqdq %%OUT_H, %%REG_IN, %%HK, 0x11 ; %%TH = a1*b1
+ vpclmulqdq %%OUT_L, %%REG_IN, %%HK, 0x00 ; %%TL = a0*b0
+ vpclmulqdq %%OUT_M1, %%REG_IN, %%HK, 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq %%OUT_M2, %%REG_IN, %%HK, 0x10 ; %%TM2 = a0*b1
+%endif ; blocks_left
+
+%undef %%REG_IN
+%undef %%OUT_H
+%undef %%OUT_L
+%undef %%OUT_M1
+%undef %%OUT_M2
+
+%if first_result != 1
+ vpxorq %%T0H, %%T0H, %%T1H
+ vpxorq %%T0L, %%T0L, %%T1L
+ vpxorq %%T0M1, %%T0M1, %%T1M1
+ vpxorq %%T0M2, %%T0M2, %%T1M2
+%endif
+
+%endif ; blocks_left > 0
+
+ ;; integrate TM into TH and TL
+ vpxorq %%T0M1, %%T0M1, %%T0M2
+ vpsrldq %%T1M1, %%T0M1, 8
+ vpslldq %%T1M2, %%T0M1, 8
+ vpxorq %%T0H, %%T0H, %%T1M1
+ vpxorq %%T0L, %%T0L, %%T1M2
+
+ ;; add TH and TL 128-bit words horizontally
+ VHPXORI4x128 %%T0H, %%T1M1
+ VHPXORI4x128 %%T0L, %%T1M2
+
+ ;; reduction
+ vmovdqa64 XWORD(%%HK), [rel POLY2]
+ VCLMUL_REDUCE XWORD(%%GHASH), XWORD(%%HK), \
+ XWORD(%%T0H), XWORD(%%T0L), XWORD(%%T0M1), XWORD(%%T0M2)
+%endmacro
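+;;; Added note (not part of the original source): GHASH_1_TO_16 walks the
+;;; input four blocks at a time, accumulating separate high/low/medium
+;;; partial products (T0H/T0L/T0M1/T0M2). A 1 to 3 block tail is handled
+;;; with narrower xmm/ymm multiplies (or a partially filled zmm for 3
+;;; blocks), after which a single horizontal xor and VCLMUL_REDUCE produce
+;;; the updated hash value.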
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+;;; Input: A and B (128-bits each, bit-reflected)
+;;; Output: C = A*B*x mod poly, (i.e. >>1 )
+;;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+;;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
+ vpxorq %%GH, %%GH, %%T3
+
+
+ vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
+ vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
+
+ vpxorq %%T1, %%T1, %%T3
+ vpxorq %%GH, %%GH, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu64 %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%GH, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxorq %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%GH, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq %%GH, %%T3, %%GH, 0x10
+ vpslldq %%GH, %%GH, 4 ; Shift-L 1-DW to obtain result with no shifts
+
+ ; second phase of the reduction complete, the result is in %%GH
+ vpternlogq %%GH, %%T1, %%T2, 0x96 ; GH = GH xor T1 xor T2
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endmacro
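+;;; Added usage sketch (illustrative only, not part of the original source);
+;;; register choice is arbitrary and key_ptr stands for the gcm_key_data
+;;; pointer (e.g. arg1). xmm1 holds the current bit-reflected GHASH state:
+;;;     vmovdqu         xmm2, [key_ptr + HashKey]
+;;;     GHASH_MUL       xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+;;; On return xmm1 = old_state * HashKey mod poly; xmm3-xmm7 are clobbered.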
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx512
+;;; functions, but are kept to allow users to switch cpu architectures between calls
+;;; of pre, init, update, and finalize.
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+ vmovdqa %%T5, %%HK
+
+ ;; GHASH keys 2 to 48 or 128
+%ifdef GCM_BIG_DATA
+%assign max_hkey_idx 128
+%else
+%assign max_hkey_idx 48
+%endif
+
+%assign i 2
+%rep (max_hkey_idx - 1)
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^i<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_ %+ i], %%T5 ; [HashKey_i] = %%T5
+%assign i (i + 1)
+%endrep
+
+%endmacro
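+;;; Added note (not part of the original source): once HashKey has been
+;;; stored and PRECOMPUTE has run, the key structure holds HashKey_i =
+;;; HashKey^i<<1 mod poly for i = 1..48 (i = 1..128 when GCM_BIG_DATA is
+;;; defined). The multi-block GHASH code in this file multiplies each cipher
+;;; text block by the matching key power so only one reduction is needed per
+;;; batch.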
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; READ_SMALL_DATA_INPUT
+;;; Packs xmm register with data when data input is less or equal to 16 bytes
+;;; Returns 0 if data has length 0
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 5
+%define %%OUTPUT %1 ; [out] xmm register
+%define %%INPUT %2 ; [in] buffer pointer to read from
+%define %%LENGTH %3 ; [in] number of bytes to read
+%define %%TMP1 %4 ; [clobbered]
+%define %%MASK %5 ; [out] k1 to k7 register to store the partial block mask
+
+ cmp %%LENGTH, 16
+ jge %%_read_small_data_ge16
+ lea %%TMP1, [rel byte_len_to_mask_table]
+%ifidn __OUTPUT_FORMAT__, win64
+ add %%TMP1, %%LENGTH
+ add %%TMP1, %%LENGTH
+ kmovw %%MASK, [%%TMP1]
+%else
+ kmovw %%MASK, [%%TMP1 + %%LENGTH*2]
+%endif
+ vmovdqu8 %%OUTPUT{%%MASK}{z}, [%%INPUT]
+ jmp %%_read_small_data_end
+%%_read_small_data_ge16:
+ VX512LDR %%OUTPUT, [%%INPUT]
+ mov %%TMP1, 0xffff
+ kmovq %%MASK, %%TMP1
+%%_read_small_data_end:
+%endmacro ; READ_SMALL_DATA_INPUT
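+;;; Added note (not part of the original source): for lengths below 16 the
+;;; macro indexes byte_len_to_mask_table, which is expected to map a length N
+;;; to a 16-bit mask with the N lowest bits set, so the masked vmovdqu8 load
+;;; reads exactly %%LENGTH bytes and never touches memory past the end of the
+;;; input buffer.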
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 18
+%define %%A_IN %1 ; [in] AAD text pointer
+%define %%A_LEN %2 ; [in] AAD length
+%define %%AAD_HASH %3 ; [out] xmm ghash value
+%define %%GDATA_KEY %4 ; [in] pointer to keys
+%define %%ZT0 %5 ; [clobbered] ZMM register
+%define %%ZT1 %6 ; [clobbered] ZMM register
+%define %%ZT2 %7 ; [clobbered] ZMM register
+%define %%ZT3 %8 ; [clobbered] ZMM register
+%define %%ZT4 %9 ; [clobbered] ZMM register
+%define %%ZT5 %10 ; [clobbered] ZMM register
+%define %%ZT6 %11 ; [clobbered] ZMM register
+%define %%ZT7 %12 ; [clobbered] ZMM register
+%define %%ZT8 %13 ; [clobbered] ZMM register
+%define %%ZT9 %14 ; [clobbered] ZMM register
+%define %%T1 %15 ; [clobbered] GP register
+%define %%T2 %16 ; [clobbered] GP register
+%define %%T3 %17 ; [clobbered] GP register
+%define %%MASKREG %18 ; [clobbered] mask register
+
+%define %%SHFMSK %%ZT9
+%define %%POLY %%ZT8
+%define %%TH %%ZT7
+%define %%TM %%ZT6
+%define %%TL %%ZT5
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxorq %%AAD_HASH, %%AAD_HASH
+
+ vmovdqa64 %%SHFMSK, [rel SHUF_MASK]
+ vmovdqa64 %%POLY, [rel POLY2]
+
+%%_get_AAD_loop128:
+ cmp %%T2, 128
+ jl %%_exit_AAD_loop128
+
+ vmovdqu64 %%ZT2, [%%T1 + 64*0] ; LO blocks (0-3)
+ vmovdqu64 %%ZT1, [%%T1 + 64*1] ; HI blocks (4-7)
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb %%ZT1, %%SHFMSK
+
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+
+ VCLMUL_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%TH, %%TM, %%TL
+ VCLMUL_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, %%ZT0, %%ZT3, %%ZT4, %%TH, %%TM, %%TL
+
+ ;; result in %%ZT1(H):%%ZT2(L)
+ ;; reduce and put the result in AAD_HASH
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \
+ XWORD(%%ZT0), XWORD(%%ZT3)
+
+ sub %%T2, 128
+ je %%_CALC_AAD_done
+
+ add %%T1, 128
+ jmp %%_get_AAD_loop128
+
+%%_exit_AAD_loop128:
+ or %%T2, %%T2
+ jz %%_CALC_AAD_done
+
+ ;; prep mask source address
+ lea %%T3, [rel byte64_len_to_mask_table]
+ lea %%T3, [%%T3 + %%T2*8]
+
+ ;; calculate number of blocks to ghash (including partial bytes)
+ add %%T2, 15
+ and %%T2, -16 ; 1 to 8 blocks possible here
+ shr %%T2, 4
+ cmp %%T2, 7
+ je %%_AAD_blocks_7
+ cmp %%T2, 6
+ je %%_AAD_blocks_6
+ cmp %%T2, 5
+ je %%_AAD_blocks_5
+ cmp %%T2, 4
+ je %%_AAD_blocks_4
+ cmp %%T2, 3
+ je %%_AAD_blocks_3
+ cmp %%T2, 2
+ je %%_AAD_blocks_2
+ cmp %%T2, 1
+ je %%_AAD_blocks_1
+ ;; fall through for 8 blocks
+
+ ;; The flow of each of these cases is identical:
+        ;; - load the plain text blocks
+        ;; - shuffle the loaded blocks
+        ;; - xor the current hash value into block 0
+        ;; - perform the multiplications with the ghash keys
+ ;; - jump to reduction code
+%%_AAD_blocks_8:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb %%ZT1, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 8
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 8
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_7:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb %%ZT1, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 7
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 7
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_6:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 YWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb YWORD(%%ZT1), YWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 6
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 6
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_5:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 XWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb XWORD(%%ZT1), XWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 5
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 5
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_4:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb %%ZT2, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 4
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 4
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_3:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb %%ZT2, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 3
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 3
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_2:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 YWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb YWORD(%%ZT2), YWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 2
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 2
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_1:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 XWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb XWORD(%%ZT2), XWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 1
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 1
+
+%%_AAD_blocks_done:
+ ;; Multiplications have been done. Do the reduction now
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \
+ XWORD(%%ZT0), XWORD(%%ZT3)
+%%_CALC_AAD_done:
+ ;; result in AAD_HASH
+
+%endmacro ; CALC_AAD_HASH
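+;;; Added note (not part of the original source): CALC_AAD_HASH consumes the
+;;; AAD in full 128-byte (8-block) chunks through VCLMUL_STEP1/STEP2 and then
+;;; dispatches on the number of remaining blocks (1 to 8). The masks taken
+;;; from byte64_len_to_mask_table zero-fill a trailing partial block, which
+;;; matches the zero padding of the AAD described in the file header.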
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; PARTIAL_BLOCK
+;;; Handles the encryption/decryption and GHASH of partial blocks carried
+;;; over between update calls.
+;;; Requires the input data to be at least 1 byte long.
+;;; Output:
+;;; cipher/plain text of the first partial block (CYPH_PLAIN_OUT),
+;;; updated AAD_HASH and updated GDATA_CTX
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 22
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer
+%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length
+%define %%DATA_OFFSET %6 ; [in/out] data offset (gets updated)
+%define %%AAD_HASH %7 ; [out] updated GHASH value
+%define %%ENC_DEC %8 ; [in] cipher direction
+%define %%GPTMP0 %9 ; [clobbered] GP temporary register
+%define %%GPTMP1 %10 ; [clobbered] GP temporary register
+%define %%GPTMP2 %11 ; [clobbered] GP temporary register
+%define %%ZTMP0 %12 ; [clobbered] ZMM temporary register
+%define %%ZTMP1 %13 ; [clobbered] ZMM temporary register
+%define %%ZTMP2 %14 ; [clobbered] ZMM temporary register
+%define %%ZTMP3 %15 ; [clobbered] ZMM temporary register
+%define %%ZTMP4 %16 ; [clobbered] ZMM temporary register
+%define %%ZTMP5 %17 ; [clobbered] ZMM temporary register
+%define %%ZTMP6 %18 ; [clobbered] ZMM temporary register
+%define %%ZTMP7 %19 ; [clobbered] ZMM temporary register
+%define %%ZTMP8 %20 ; [clobbered] ZMM temporary register
+%define %%ZTMP9 %21 ; [clobbered] ZMM temporary register
+%define %%MASKREG %22 ; [clobbered] mask temporary register
+
+%define %%XTMP0 XWORD(%%ZTMP0)
+%define %%XTMP1 XWORD(%%ZTMP1)
+%define %%XTMP2 XWORD(%%ZTMP2)
+%define %%XTMP3 XWORD(%%ZTMP3)
+%define %%XTMP4 XWORD(%%ZTMP4)
+%define %%XTMP5 XWORD(%%ZTMP5)
+%define %%XTMP6 XWORD(%%ZTMP6)
+%define %%XTMP7 XWORD(%%ZTMP7)
+%define %%XTMP8 XWORD(%%ZTMP8)
+%define %%XTMP9 XWORD(%%ZTMP9)
+
+%define %%LENGTH %%GPTMP0
+%define %%IA0 %%GPTMP1
+%define %%IA1 %%GPTMP2
+
+ mov %%LENGTH, [%%GDATA_CTX + PBlockLen]
+ or %%LENGTH, %%LENGTH
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ READ_SMALL_DATA_INPUT %%XTMP0, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%IA0, %%MASKREG
+
+ ;; XTMP1 = my_ctx_data.partial_block_enc_key
+ vmovdqu64 %%XTMP1, [%%GDATA_CTX + PBlockEncKey]
+ vmovdqu64 %%XTMP2, [%%GDATA_KEY + HashKey]
+
+ ;; adjust the shuffle mask pointer to be able to shift right %%LENGTH bytes
+        ;; ((16 - %%LENGTH) is the number of bytes in plaintext mod 16)
+ lea %%IA0, [rel SHIFT_MASK]
+ add %%IA0, %%LENGTH
+ vmovdqu64 %%XTMP3, [%%IA0] ; shift right shuffle mask
+ vpshufb %%XTMP1, %%XTMP3
+
+%ifidn %%ENC_DEC, DEC
+ ;; keep copy of cipher text in %%XTMP4
+ vmovdqa64 %%XTMP4, %%XTMP0
+%endif
+ vpxorq %%XTMP1, %%XTMP0 ; Cyphertext XOR E(K, Yn)
+
+ ;; Set %%IA1 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ ;; Determine if partial block is not being filled and shift mask accordingly
+ mov %%IA1, %%PLAIN_CYPH_LEN
+ add %%IA1, %%LENGTH
+ sub %%IA1, 16
+ jge %%_no_extra_mask
+ sub %%IA0, %%IA1
+%%_no_extra_mask:
+ ;; get the appropriate mask to mask out bottom %%LENGTH bytes of %%XTMP1
+ ;; - mask out bottom %%LENGTH bytes of %%XTMP1
+ vmovdqu64 %%XTMP0, [%%IA0 + ALL_F - SHIFT_MASK]
+ vpand %%XTMP1, %%XTMP0
+
+%ifidn %%ENC_DEC, DEC
+ vpand %%XTMP4, %%XTMP0
+ vpshufb %%XTMP4, [rel SHUF_MASK]
+ vpshufb %%XTMP4, %%XTMP3
+ vpxorq %%AAD_HASH, %%XTMP4
+%else
+ vpshufb %%XTMP1, [rel SHUF_MASK]
+ vpshufb %%XTMP1, %%XTMP3
+ vpxorq %%AAD_HASH, %%XTMP1
+%endif
+ cmp %%IA1, 0
+ jl %%_partial_incomplete
+
+ ;; GHASH computation for the last <16 Byte block
+ GHASH_MUL %%AAD_HASH, %%XTMP2, %%XTMP5, %%XTMP6, %%XTMP7, %%XTMP8, %%XTMP9
+
+ mov qword [%%GDATA_CTX + PBlockLen], 0
+
+        ;; Set %%LENGTH to be the number of bytes to write out
+ mov %%IA0, %%LENGTH
+ mov %%LENGTH, 16
+ sub %%LENGTH, %%IA0
+ jmp %%_enc_dec_done
+
+%%_partial_incomplete:
+%ifidn __OUTPUT_FORMAT__, win64
+ mov %%IA0, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + PBlockLen], %%IA0
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+ mov %%LENGTH, %%PLAIN_CYPH_LEN
+
+%%_enc_dec_done:
+ ;; output encrypted Bytes
+
+ lea %%IA0, [rel byte_len_to_mask_table]
+ kmovw %%MASKREG, [%%IA0 + %%LENGTH*2]
+ vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%ifidn %%ENC_DEC, ENC
+ ;; shuffle XTMP1 back to output as ciphertext
+ vpshufb %%XTMP1, [rel SHUF_MASK]
+ vpshufb %%XTMP1, %%XTMP3
+%endif
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{%%MASKREG}, %%XTMP1
+ add %%DATA_OFFSET, %%LENGTH
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
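+;;; Added note (not part of the original source): PARTIAL_BLOCK only does work
+;;; when a previous update call left PBlockLen non-zero. It reloads the saved
+;;; encrypted counter block (PBlockEncKey), aligns it against the new input
+;;; with the SHIFT_MASK shuffle, ciphers the bytes that complete the block,
+;;; and either GHASHes the now-complete block or simply adds the new bytes to
+;;; PBlockLen when the block is still not full.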
+
+
+%macro GHASH_SINGLE_MUL 9
+%define %%GDATA %1
+%define %%HASHKEY %2
+%define %%CIPHER %3
+%define %%STATE_11 %4
+%define %%STATE_00 %5
+%define %%STATE_MID %6
+%define %%T1 %7
+%define %%T2 %8
+%define %%FIRST %9
+
+ vmovdqu %%T1, [%%GDATA + %%HASHKEY]
+%ifidn %%FIRST, first
+        vpclmulqdq      %%STATE_11, %%CIPHER, %%T1, 0x11    ; %%STATE_11 = a1*b1
+        vpclmulqdq      %%STATE_00, %%CIPHER, %%T1, 0x00    ; %%STATE_00 = a0*b0
+        vpclmulqdq      %%STATE_MID, %%CIPHER, %%T1, 0x01   ; %%STATE_MID = a1*b0
+        vpclmulqdq      %%T2, %%CIPHER, %%T1, 0x10          ; %%T2 = a0*b1
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%else
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11
+ vpxor %%STATE_11, %%STATE_11, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00
+ vpxor %%STATE_00, %%STATE_00, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%endif
+
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro is used to "warm up" the pipeline for the GHASH_8_ENCRYPT_8_PARALLEL
+;;; macro code. It is called only for data lengths of 128 bytes and above.
+;;; The flow is as follows:
+;;; - encrypt the initial %%num_initial_blocks blocks (can be 0)
+;;; - encrypt the next 8 blocks and stitch with
+;;; GHASH for the first %%num_initial_blocks
+;;; - the last (8th) block can be partial (lengths between 129 and 239)
+;;; - partial block ciphering is handled within this macro
+;;; - top bytes of such block are cleared for
+;;; the subsequent GHASH calculations
+;;; - PBlockEncKey needs to be setup in case of multi-call
+;;; - the top bytes of the block need to contain the encrypted counter block, so
+;;;   that when the partial block case is handled later the text can be read
+;;;   and XOR'ed against it. This needs to be in un-shuffled format.
+
+%macro INITIAL_BLOCKS 26-27
+%define %%GDATA_KEY %1 ; [in] pointer to GCM keys
+%define %%GDATA_CTX %2 ; [in] pointer to GCM context
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer
+%define %%LENGTH %5 ; [in/out] number of bytes to process
+%define %%DATA_OFFSET %6 ; [in/out] data offset
+%define %%num_initial_blocks %7 ; [in] can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%CTR %8 ; [in/out] XMM counter block
+%define %%AAD_HASH %9 ; [in/out] ZMM with AAD hash
+%define %%ZT1 %10 ; [out] ZMM cipher blocks 0-3 for GHASH
+%define %%ZT2 %11 ; [out] ZMM cipher blocks 4-7 for GHASH
+%define %%ZT3 %12 ; [clobbered] ZMM temporary
+%define %%ZT4 %13 ; [clobbered] ZMM temporary
+%define %%ZT5 %14 ; [clobbered] ZMM temporary
+%define %%ZT6 %15 ; [clobbered] ZMM temporary
+%define %%ZT7 %16 ; [clobbered] ZMM temporary
+%define %%ZT8 %17 ; [clobbered] ZMM temporary
+%define %%ZT9 %18 ; [clobbered] ZMM temporary
+%define %%ZT10 %19 ; [clobbered] ZMM temporary
+%define %%ZT11 %20 ; [clobbered] ZMM temporary
+%define %%ZT12 %21 ; [clobbered] ZMM temporary
+%define %%IA0 %22 ; [clobbered] GP temporary
+%define %%IA1 %23 ; [clobbered] GP temporary
+%define %%ENC_DEC %24 ; [in] ENC/DEC selector
+%define %%MASKREG %25 ; [clobbered] mask register
+%define %%SHUFMASK %26 ; [in] ZMM with BE/LE shuffle mask
+%define %%PARTIAL_PRESENT %27 ; [in] "no_partial_block" option can be passed here (if length is guaranteed to be > 15*16 bytes)
+
+%define %%T1 XWORD(%%ZT1)
+%define %%T2 XWORD(%%ZT2)
+%define %%T3 XWORD(%%ZT3)
+%define %%T4 XWORD(%%ZT4)
+%define %%T5 XWORD(%%ZT5)
+%define %%T6 XWORD(%%ZT6)
+%define %%T7 XWORD(%%ZT7)
+%define %%T8 XWORD(%%ZT8)
+%define %%T9 XWORD(%%ZT9)
+
+%define %%TH %%ZT10
+%define %%TM %%ZT11
+%define %%TL %%ZT12
+
+;; determine if partial block code needs to be added
+%assign partial_block_possible 1
+%if %0 > 26
+%ifidn %%PARTIAL_PRESENT, no_partial_block
+%assign partial_block_possible 0
+%endif
+%endif
+
+%if %%num_initial_blocks > 0
+ ;; prepare AES counter blocks
+%if %%num_initial_blocks == 1
+ vpaddd %%T3, %%CTR, [rel ONE]
+%elif %%num_initial_blocks == 2
+ vshufi64x2 YWORD(%%ZT3), YWORD(%%CTR), YWORD(%%CTR), 0
+ vpaddd YWORD(%%ZT3), YWORD(%%ZT3), [rel ddq_add_1234]
+%else
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234]
+ vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678]
+%endif
+
+ ;; extract new counter value (%%T3)
+ ;; shuffle the counters for AES rounds
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%CTR, %%ZT3, (%%num_initial_blocks - 1)
+%else
+ vextracti32x4 %%CTR, %%ZT4, (%%num_initial_blocks - 5)
+%endif
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+
+ ;; load plain/cipher text
+ ZMM_LOAD_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm
+
+ ;; AES rounds and XOR with plain/cipher text
+%assign j 0
+%rep (NROUNDS + 2)
+ vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT1, j, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%num_initial_blocks, NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; write cipher/plain text back to output and
+ ;; zero bytes outside the mask before hashing
+ ZMM_STORE_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm
+
+ ;; Shuffle the cipher text blocks for hashing part
+ ;; ZT5 and ZT6 are expected outputs with blocks for hashing
+%ifidn %%ENC_DEC, DEC
+ ;; Decrypt case
+ ;; - cipher blocks are in ZT5 & ZT6
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%else
+ ;; Encrypt case
+ ;; - cipher blocks are in ZT3 & ZT4
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%endif ; Encrypt
+
+ ;; adjust data offset and length
+ sub %%LENGTH, (%%num_initial_blocks * 16)
+ add %%DATA_OFFSET, (%%num_initial_blocks * 16)
+
+ ;; At this stage
+ ;; - ZT5:ZT6 include cipher blocks to be GHASH'ed
+
+%endif ; %%num_initial_blocks > 0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; - cipher of %%num_initial_blocks is done
+ ;; - prepare counter blocks for the next 8 blocks (ZT3 & ZT4)
+ ;; - save the last block in %%CTR
+ ;; - shuffle the blocks for AES
+ ;; - stitch encryption of the new blocks with
+ ;; GHASHING the previous blocks
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234]
+ vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678]
+ vextracti32x4 %%CTR, %%ZT4, 3
+
+ vpshufb %%ZT3, %%SHUFMASK
+ vpshufb %%ZT4, %%SHUFMASK
+
+%if partial_block_possible != 0
+ ;; get text load/store mask (assume full mask by default)
+ mov %%IA0, 0xffff_ffff_ffff_ffff
+%if %%num_initial_blocks > 0
+        ;; NOTE: 'jge' would always be taken for %%num_initial_blocks = 0,
+        ;; which is why the check below is generated only when initial blocks exist.
+        ;; This macro is executed for lengths of 128 and up;
+        ;; zero length is checked in GCM_ENC_DEC.
+ ;; We know there is partial block if:
+ ;; LENGTH - 16*num_initial_blocks < 128
+ cmp %%LENGTH, 128
+ jge %%_initial_partial_block_continue
+ mov %%IA1, rcx
+ mov rcx, 128
+ sub rcx, %%LENGTH
+ shr %%IA0, cl
+ mov rcx, %%IA1
+%%_initial_partial_block_continue:
+%endif
+ kmovq %%MASKREG, %%IA0
+ ;; load plain or cipher text (masked)
+ ZMM_LOAD_MASKED_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, %%MASKREG
+%else
+ ;; load plain or cipher text
+ ZMM_LOAD_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm
+%endif ;; partial_block_possible
+
+ ;; === AES ROUND 0
+%assign aes_round 0
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+
+ ;; === GHASH blocks 4-7
+%if (%%num_initial_blocks > 0)
+ ;; Hash in AES state
+ vpxorq %%ZT5, %%ZT5, %%AAD_HASH
+
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT6, %%ZT8, %%ZT9, \
+ %%TH, %%TM, %%TL, %%num_initial_blocks
+%endif
+
+ ;; === [1/3] of AES rounds
+
+%rep ((NROUNDS + 1) / 3)
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep                         ; %rep ((NROUNDS + 1) / 3)
+
+ ;; === GHASH blocks 0-3 and gather
+%if (%%num_initial_blocks > 0)
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT6, %%ZT5, \
+ %%ZT7, %%ZT8, %%ZT9, \
+ %%TH, %%TM, %%TL, %%num_initial_blocks
+%endif
+
+ ;; === [2/3] of AES rounds
+
+%rep ((NROUNDS + 1) / 3)
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep                         ; %rep ((NROUNDS + 1) / 3)
+
+ ;; === GHASH reduction
+
+%if (%%num_initial_blocks > 0)
+ ;; [out] AAD_HASH - hash output
+ ;; [in] T8 - polynomial
+ ;; [in] T6 - high, T5 - low
+ ;; [clobbered] T9, T7 - temporary
+ vmovdqu64 %%T8, [rel POLY2]
+ VCLMUL_REDUCE XWORD(%%AAD_HASH), %%T8, %%T6, %%T5, %%T7, %%T9
+%endif
+
+ ;; === [3/3] of AES rounds
+
+%rep (((NROUNDS + 1) / 3) + 2)
+%if aes_round < (NROUNDS + 2)
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif
+%endrep                         ; %rep (((NROUNDS + 1) / 3) + 2)
+
+%if partial_block_possible != 0
+ ;; write cipher/plain text back to output and
+ ;; zero bytes outside the mask before hashing
+ ZMM_STORE_MASKED_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, %%MASKREG
+ ;; check if there is partial block
+ cmp %%LENGTH, 128
+ jl %%_initial_save_partial
+ ;; adjust offset and length
+ add %%DATA_OFFSET, 128
+ sub %%LENGTH, 128
+ jmp %%_initial_blocks_done
+%%_initial_save_partial:
+ ;; partial block case
+ ;; - save the partial block in unshuffled format
+ ;; - ZT4 is partially XOR'ed with data and top bytes contain
+ ;; encrypted counter block only
+        ;; - save the number of bytes processed in the partial block
+ ;; - adjust offset and zero the length
+ ;; - clear top bytes of the partial block for subsequent GHASH calculations
+ vextracti32x4 [%%GDATA_CTX + PBlockEncKey], %%ZT4, 3
+ add %%DATA_OFFSET, %%LENGTH
+ sub %%LENGTH, (128 - 16)
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ xor %%LENGTH, %%LENGTH
+ vmovdqu8 %%ZT4{%%MASKREG}{z}, %%ZT4
+%%_initial_blocks_done:
+%else
+ ZMM_STORE_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm
+ add %%DATA_OFFSET, 128
+ sub %%LENGTH, 128
+%endif ;; partial_block_possible
+
+ ;; Shuffle AES result for GHASH.
+%ifidn %%ENC_DEC, DEC
+ ;; Decrypt case
+ ;; - cipher blocks are in ZT1 & ZT2
+ vpshufb %%ZT1, %%SHUFMASK
+ vpshufb %%ZT2, %%SHUFMASK
+%else
+ ;; Encrypt case
+ ;; - cipher blocks are in ZT3 & ZT4
+ vpshufb %%ZT1, %%ZT3, %%SHUFMASK
+ vpshufb %%ZT2, %%ZT4, %%SHUFMASK
+%endif ; Encrypt
+
+ ;; Current hash value is in AAD_HASH
+
+ ;; Combine GHASHed value with the corresponding ciphertext
+ vpxorq %%ZT1, %%ZT1, %%AAD_HASH
+
+%endmacro ; INITIAL_BLOCKS
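+;;; Added note (not part of the original source): the macro above interleaves
+;;; the two workloads to keep the AES and PCLMULQDQ units busy: round 0 of the
+;;; next 8 counter blocks is issued first, then the remaining AES rounds are
+;;; split into roughly three chunks with the GHASH STEP1, STEP2 and reduction
+;;; of the initial blocks scheduled in the gaps between them.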
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
+;;; It may look similar to INITIAL_BLOCKS but its usage is different:
+;;; - first encrypts/decrypts required number of blocks and then
+;;; ghashes these blocks
+;;; - Small packets or left over data chunks (<256 bytes)
+;;; - single or multi call
+;;; - Remaining data chunks below 256 bytes (multi buffer code)
+;;;
+;;; num_initial_blocks is expected to include the partial final block
+;;; in the count.
+%macro INITIAL_BLOCKS_PARTIAL 41
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] text out pointer
+%define %%PLAIN_CYPH_IN         %4  ; [in] text in pointer
+%define %%LENGTH %5 ; [in/clobbered] length in bytes
+%define %%DATA_OFFSET %6 ; [in/out] current data offset (updated)
+%define %%num_initial_blocks %7 ; [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
+%define %%CTR %8 ; [in/out] current counter value
+%define %%HASH_IN_OUT %9 ; [in/out] XMM ghash in/out value
+%define %%ENC_DEC %10 ; [in] cipher direction (ENC/DEC)
+%define %%INSTANCE_TYPE %11 ; [in] multi_call or single_call
+%define %%ZT0 %12 ; [clobbered] ZMM temporary
+%define %%ZT1 %13 ; [clobbered] ZMM temporary
+%define %%ZT2 %14 ; [clobbered] ZMM temporary
+%define %%ZT3 %15 ; [clobbered] ZMM temporary
+%define %%ZT4 %16 ; [clobbered] ZMM temporary
+%define %%ZT5 %17 ; [clobbered] ZMM temporary
+%define %%ZT6 %18 ; [clobbered] ZMM temporary
+%define %%ZT7 %19 ; [clobbered] ZMM temporary
+%define %%ZT8 %20 ; [clobbered] ZMM temporary
+%define %%ZT9 %21 ; [clobbered] ZMM temporary
+%define %%ZT10 %22 ; [clobbered] ZMM temporary
+%define %%ZT11 %23 ; [clobbered] ZMM temporary
+%define %%ZT12 %24 ; [clobbered] ZMM temporary
+%define %%ZT13 %25 ; [clobbered] ZMM temporary
+%define %%ZT14 %26 ; [clobbered] ZMM temporary
+%define %%ZT15 %27 ; [clobbered] ZMM temporary
+%define %%ZT16 %28 ; [clobbered] ZMM temporary
+%define %%ZT17 %29 ; [clobbered] ZMM temporary
+%define %%ZT18 %30 ; [clobbered] ZMM temporary
+%define %%ZT19 %31 ; [clobbered] ZMM temporary
+%define %%ZT20 %32 ; [clobbered] ZMM temporary
+%define %%ZT21 %33 ; [clobbered] ZMM temporary
+%define %%ZT22 %34 ; [clobbered] ZMM temporary
+%define %%GH %35 ; [in] ZMM ghash sum (high)
+%define %%GL %36 ; [in] ZMM ghash sum (low)
+%define %%GM %37 ; [in] ZMM ghash sum (middle)
+%define %%IA0 %38 ; [clobbered] GP temporary
+%define %%IA1 %39 ; [clobbered] GP temporary
+%define %%MASKREG %40 ; [clobbered] mask register
+%define %%SHUFMASK %41 ; [in] ZMM with BE/LE shuffle mask
+
+%define %%T1 XWORD(%%ZT1)
+%define %%T2 XWORD(%%ZT2)
+%define %%T7 XWORD(%%ZT7)
+
+%define %%CTR0 %%ZT3
+%define %%CTR1 %%ZT4
+%define %%CTR2 %%ZT8
+%define %%CTR3 %%ZT9
+
+%define %%DAT0 %%ZT5
+%define %%DAT1 %%ZT6
+%define %%DAT2 %%ZT10
+%define %%DAT3 %%ZT11
+
+%ifnidn %%GH, no_zmm
+%ifnidn %%GL, no_zmm
+%ifnidn %%GM, no_zmm
+        ;; when temporary sums are passed in, zero the HASH IN value
+        ;; - whatever it holds is invalid in this case
+ vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT
+%endif
+%endif
+%endif
+ ;; Copy ghash to temp reg
+ vmovdqa64 %%T2, %%HASH_IN_OUT
+
+ ;; prepare AES counter blocks
+%if %%num_initial_blocks == 1
+ vpaddd XWORD(%%CTR0), %%CTR, [rel ONE]
+%elif %%num_initial_blocks == 2
+ vshufi64x2 YWORD(%%CTR0), YWORD(%%CTR), YWORD(%%CTR), 0
+ vpaddd YWORD(%%CTR0), YWORD(%%CTR0), [rel ddq_add_1234]
+%else
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%CTR0, ZWORD(%%CTR), [rel ddq_add_1234]
+%if %%num_initial_blocks > 4
+ vpaddd %%CTR1, ZWORD(%%CTR), [rel ddq_add_5678]
+%endif
+%if %%num_initial_blocks > 8
+ vpaddd %%CTR2, %%CTR0, [rel ddq_add_8888]
+%endif
+%if %%num_initial_blocks > 12
+ vpaddd %%CTR3, %%CTR1, [rel ddq_add_8888]
+%endif
+%endif
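+        ;; Note: the counter blocks above are produced in LE (shuffled) form;
+        ;; ddq_add_1234/ddq_add_5678 create counters +1..+8 relative to %%CTR
+        ;; and ddq_add_8888 adds +8 per lane to extend the range up to +16.
+        ;; They are byte-swapped with %%SHUFMASK further below into the
+        ;; big-endian layout expected by AES-CTR.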
+
+ ;; get load/store mask
+ lea %%IA0, [rel byte64_len_to_mask_table]
+ mov %%IA1, %%LENGTH
+%if %%num_initial_blocks > 12
+ sub %%IA1, 3 * 64
+%elif %%num_initial_blocks > 8
+ sub %%IA1, 2 * 64
+%elif %%num_initial_blocks > 4
+ sub %%IA1, 64
+%endif
+ kmovq %%MASKREG, [%%IA0 + %%IA1*8]
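+        ;; Note: byte64_len_to_mask_table is expected to hold one 64-bit mask
+        ;; per possible byte count (0..64), with that many low bits set; the
+        ;; index is the number of valid bytes in the last (up to) 64-byte
+        ;; chunk, so the masked loads/stores below only touch message bytes.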
+
+ ;; extract new counter value
+ ;; shuffle the counters for AES rounds
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%CTR, %%CTR0, (%%num_initial_blocks - 1)
+%elif %%num_initial_blocks <= 8
+ vextracti32x4 %%CTR, %%CTR1, (%%num_initial_blocks - 5)
+%elif %%num_initial_blocks <= 12
+ vextracti32x4 %%CTR, %%CTR2, (%%num_initial_blocks - 9)
+%else
+ vextracti32x4 %%CTR, %%CTR3, (%%num_initial_blocks - 13)
+%endif
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+
+ ;; load plain/cipher text
+ ZMM_LOAD_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, %%MASKREG
+
+ ;; AES rounds and XOR with plain/cipher text
+%assign j 0
+%rep (NROUNDS + 2)
+ vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%ZT1, j, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%num_initial_blocks, NROUNDS
+%assign j (j + 1)
+%endrep
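+        ;; Note: NROUNDS is 9, 11 or 13 for AES-128/192/256 here (see the
+        ;; NROUNDS checks in the by16 code below), so the loop above applies
+        ;; all (NROUNDS + 2) round keys: the initial key add, NROUNDS middle
+        ;; rounds and the final round.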
+
+ ;; retrieve the last cipher counter block (partially XOR'ed with text)
+ ;; - this is needed for partial block cases
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%T1, %%CTR0, (%%num_initial_blocks - 1)
+%elif %%num_initial_blocks <= 8
+ vextracti32x4 %%T1, %%CTR1, (%%num_initial_blocks - 5)
+%elif %%num_initial_blocks <= 12
+ vextracti32x4 %%T1, %%CTR2, (%%num_initial_blocks - 9)
+%else
+ vextracti32x4 %%T1, %%CTR3, (%%num_initial_blocks - 13)
+%endif
+
+        ;; write cipher/plain text back to output
+ ZMM_STORE_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASKREG
+
+ ;; zero bytes outside the mask before hashing
+%if %%num_initial_blocks <= 4
+ vmovdqu8 %%CTR0{%%MASKREG}{z}, %%CTR0
+%elif %%num_initial_blocks <= 8
+ vmovdqu8 %%CTR1{%%MASKREG}{z}, %%CTR1
+%elif %%num_initial_blocks <= 12
+ vmovdqu8 %%CTR2{%%MASKREG}{z}, %%CTR2
+%else
+ vmovdqu8 %%CTR3{%%MASKREG}{z}, %%CTR3
+%endif
+
+        ;; Shuffle the cipher text blocks for the hashing part
+        ;; DAT0-DAT3 are the expected outputs with blocks for hashing
+%ifidn  %%ENC_DEC, DEC
+        ;; Decrypt case
+        ;; - cipher blocks are in DAT0-DAT3
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%else
+ ;; Encrypt case
+ ;; - cipher blocks are in CTR0-CTR3
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%endif ; Encrypt
+
+ ;; Extract the last block for partials and multi_call cases
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%T7, %%DAT0, %%num_initial_blocks - 1
+%elif %%num_initial_blocks <= 8
+ vextracti32x4 %%T7, %%DAT1, %%num_initial_blocks - 5
+%elif %%num_initial_blocks <= 12
+ vextracti32x4 %%T7, %%DAT2, %%num_initial_blocks - 9
+%else
+ vextracti32x4 %%T7, %%DAT3, %%num_initial_blocks - 13
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Hash all but the last block of data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; update data offset
+%if %%num_initial_blocks > 1
+ ;; The final block of data may be <16B
+ add %%DATA_OFFSET, 16 * (%%num_initial_blocks - 1)
+ sub %%LENGTH, 16 * (%%num_initial_blocks - 1)
+%endif
+
+%if %%num_initial_blocks < 16
+        ;; NOTE: for num_initial_blocks = 16 the 'jl' below would always be
+        ;;       taken (this macro runs in the context of GCM_ENC_DEC_SMALL
+        ;;       for lengths < 256), so the check is compiled out in that case.
+ cmp %%LENGTH, 16
+ jl %%_small_initial_partial_block
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle a full length final block - encrypt and hash all blocks
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ sub %%LENGTH, 16
+ add %%DATA_OFFSET, 16
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+
+ ;; Hash all of the data
+
+ ;; ZT2 - incoming AAD hash (low 128bits)
+ ;; ZT12-ZT20 - temporary registers
+ GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \
+ %%ZT17, %%ZT18, %%ZT19, %%ZT20, \
+ %%GH, %%GL, %%GM, \
+ %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%num_initial_blocks
+
+ jmp %%_small_initial_compute_done
+%endif ; %if %%num_initial_blocks < 16
+
+%%_small_initial_partial_block:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;; Handle ghash for a <16B final block
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+        ;; In this case, for a single call to encrypt we can hash all of
+        ;; the data, but for an init / update / finalize series of calls
+        ;; we need to leave out the last block if it is less than a full
+        ;; block of data.
+
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ ;; %%T1 is ciphered counter block
+ vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%T1
+
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign k (%%num_initial_blocks - 1)
+%assign last_block_to_hash 1
+%else
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+%endif
+
+%if (%%num_initial_blocks > last_block_to_hash)
+
+ ;; ZT12-ZT20 - temporary registers
+ GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \
+ %%ZT17, %%ZT18, %%ZT19, %%ZT20, \
+ %%GH, %%GL, %%GM, \
+ %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, k
+
+ ;; just fall through no jmp needed
+%else
+        ;; Record that a reduction is not needed:
+        ;; - no hashes are computed here because there is only one
+        ;;   initial block and it is < 16B in length
+        ;; - checking whether a reduction is needed only matters when
+        ;;   initial_blocks == 1 and init/update/finalize is being used;
+        ;;   in that case there may just be a partial block, and that
+        ;;   gets hashed in finalize
+
+%assign need_for_reduction 1
+%ifidn %%GH, no_zmm
+%ifidn %%GL, no_zmm
+%ifidn %%GM, no_zmm
+;; if %%GH, %%GL & %%GM not passed then reduction is not required
+%assign need_for_reduction 0
+%endif
+%endif
+%endif
+
+%if need_for_reduction == 0
+ ;; The hash should end up in HASH_IN_OUT.
+ ;; The only way we should get here is if there is
+ ;; a partial block of data, so xor that into the hash.
+ vpxorq %%HASH_IN_OUT, %%T2, %%T7
+%else
+        ;; here there is nothing to GHASH in the small data, but
+        ;; GHASH sums were passed through and need to be gathered and reduced
+
+ ;; integrate TM into TH and TL
+ vpsrldq %%ZT12, %%GM, 8
+ vpslldq %%ZT13, %%GM, 8
+ vpxorq %%GH, %%GH, %%ZT12
+ vpxorq %%GL, %%GL, %%ZT13
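+        ;; (the 128-bit middle product straddles the low and high halves of
+        ;; the 256-bit carry-less product, hence the 8-byte shifts before
+        ;; folding it into the high and low accumulators)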
+
+ ;; add TH and TL 128-bit words horizontally
+ VHPXORI4x128 %%GH, %%ZT12
+ VHPXORI4x128 %%GL, %%ZT13
+
+ ;; reduction
+ vmovdqa64 XWORD(%%ZT12), [rel POLY2]
+ VCLMUL_REDUCE %%HASH_IN_OUT, XWORD(%%ZT12), \
+ XWORD(%%GH), XWORD(%%GL), XWORD(%%ZT13), XWORD(%%ZT14)
+
+ vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7
+%endif
+ ;; The result is in %%HASH_IN_OUT
+ jmp %%_after_reduction
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; After GHASH reduction
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_small_initial_compute_done:
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; If using init/update/finalize, we need to xor any partial block data
+ ;; into the hash.
+%if %%num_initial_blocks > 1
+        ;; NOTE: for %%num_initial_blocks = 1 the xor never takes place
+        ;;       (a lone partial block is hashed in finalize instead)
+%if %%num_initial_blocks != 16
+        ;; NOTE: for %%num_initial_blocks = 16, %%LENGTH (stored in [PBlockLen]) is never zero
+ or %%LENGTH, %%LENGTH
+ je %%_after_reduction
+%endif ; %%num_initial_blocks != 16
+ vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7
+%endif ; %%num_initial_blocks > 1
+%endif ; %%INSTANCE_TYPE, multi_call
+
+%%_after_reduction:
+ ;; Final hash is now in HASH_IN_OUT
+
+%endmacro ; INITIAL_BLOCKS_PARTIAL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main GCM macro stitching cipher with GHASH
+;;; - operates on single stream
+;;; - encrypts 8 blocks at a time
+;;; - ghash the 8 previously encrypted ciphertext blocks
+;;; For the partial block case with multi_call, AES_PARTIAL_BLOCK on output
+;;; contains the encrypted counter block.
+%macro GHASH_8_ENCRYPT_8_PARALLEL 34-37
+%define %%GDATA %1 ; [in] key pointer
+%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%CTR1 %5 ; [in/out] ZMM counter blocks 0 to 3
+%define %%CTR2 %6 ; [in/out] ZMM counter blocks 4 to 7
+%define %%GHASHIN_AESOUT_B03 %7 ; [in/out] ZMM ghash in / aes out blocks 0 to 3
+%define %%GHASHIN_AESOUT_B47 %8 ; [in/out] ZMM ghash in / aes out blocks 4 to 7
+%define %%AES_PARTIAL_BLOCK %9 ; [out] XMM partial block (AES)
+%define %%loop_idx %10 ; [in] counter block prep selection "add+shuffle" or "add"
+%define %%ENC_DEC %11 ; [in] cipher direction
+%define %%FULL_PARTIAL %12 ; [in] last block type selection "full" or "partial"
+%define %%IA0 %13 ; [clobbered] temporary GP register
+%define %%IA1 %14 ; [clobbered] temporary GP register
+%define %%LENGTH %15 ; [in] length
+%define %%INSTANCE_TYPE %16 ; [in] 'single_call' or 'multi_call' selection
+%define %%GH4KEY %17 ; [in] ZMM with GHASH keys 4 to 1
+%define %%GH8KEY %18 ; [in] ZMM with GHASH keys 8 to 5
+%define %%SHFMSK %19 ; [in] ZMM with byte swap mask for pshufb
+%define %%ZT1 %20 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT2 %21 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT3 %22 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT4 %23 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT5 %24 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT10 %25 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT11 %26 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT12 %27 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT13 %28 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT14 %29 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT15 %30 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT16 %31 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT17 %32 ; [clobbered] temporary ZMM (ghash)
+%define %%MASKREG %33 ; [clobbered] mask register for partial loads/stores
+%define %%DO_REDUCTION          %34 ; [in] "do_reduction", "no_reduction" or "final_reduction"
+%define %%TO_REDUCE_L %35 ; [in/out] ZMM for low 4x128-bit in case of "no_reduction"
+%define %%TO_REDUCE_H %36 ; [in/out] ZMM for hi 4x128-bit in case of "no_reduction"
+%define %%TO_REDUCE_M %37 ; [in/out] ZMM for medium 4x128-bit in case of "no_reduction"
+
+%define %%GH1H %%ZT10
+%define %%GH1L %%ZT11
+%define %%GH1M1 %%ZT12
+%define %%GH1M2 %%ZT13
+
+%define %%GH2H %%ZT14
+%define %%GH2L %%ZT15
+%define %%GH2M1 %%ZT16
+%define %%GH2M2 %%ZT17
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; populate counter blocks for cipher part
+%ifidn %%loop_idx, in_order
+        ;; %%CTR1 & %%CTR2 are shuffled outside the scope of this macro,
+        ;; so they have to be kept in unshuffled format here
+ vpshufb %%ZT1, %%CTR1, %%SHFMSK
+ vpshufb %%ZT2, %%CTR2, %%SHFMSK
+%else
+ vmovdqa64 %%ZT1, %%CTR1
+ vmovdqa64 %%ZT2, %%CTR2
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; stitch AES rounds with GHASH
+
+%assign aes_round 0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 0 - ARK
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+
+ ;;==================================================
+ ;; GHASH 4 blocks
+ vpclmulqdq %%GH1H, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x11 ; a1*b1
+ vpclmulqdq %%GH1L, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x00 ; a0*b0
+ vpclmulqdq %%GH1M1, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x01 ; a1*b0
+ vpclmulqdq %%GH1M2, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x10 ; a0*b1
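+        ;; Note: the four carry-less multiplications above produce, per
+        ;; 128-bit lane, the high (a1*b1), low (a0*b0) and two middle
+        ;; (a1*b0, a0*b1) partial products; the middle terms are combined
+        ;; and folded into high/low before the polynomial reduction.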
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 3 AES rounds
+%rep 3
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; 3 x AES ROUND
+
+ ;; =================================================
+ ;; GHASH 4 blocks
+ vpclmulqdq %%GH2M1, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x10 ; a0*b1
+ vpclmulqdq %%GH2M2, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x01 ; a1*b0
+ vpclmulqdq %%GH2H, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x11 ; a1*b1
+ vpclmulqdq %%GH2L, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x00 ; a0*b0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 3 AES rounds
+%rep 3
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; 3 x AES ROUND
+
+ ;; =================================================
+ ;; gather GHASH in GH1L (low) and GH1H (high)
+%ifidn %%DO_REDUCTION, no_reduction
+ vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1
+ vpternlogq %%TO_REDUCE_M, %%GH1M1, %%GH2M2, 0x96 ; TM: TO_REDUCE_M ^= GH1M1 ^ GH2M2
+ vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH: TO_REDUCE_H ^= GH1H ^ GH2H
+ vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; TL: TO_REDUCE_L ^= GH1L ^ GH2L
+%endif
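+        ;; Note: vpternlogq with imm8 0x96 implements a three-input XOR
+        ;; (0x96 is the parity truth table), so three partial products can
+        ;; be folded with a single instruction.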
+%ifidn %%DO_REDUCTION, do_reduction
+ ;; phase 1: add mid products together
+ vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1
+ vpxorq %%GH1M1, %%GH1M1, %%GH2M2
+
+ vpsrldq %%GH2M1, %%GH1M1, 8
+ vpslldq %%GH1M1, %%GH1M1, 8
+%endif
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; phase 1: add mid products together
+ vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1
+ vpternlogq %%GH1M1, %%TO_REDUCE_M, %%GH2M2, 0x96 ; TM: GH1M1 ^= TO_REDUCE_M ^ GH2M2
+
+ vpsrldq %%GH2M1, %%GH1M1, 8
+ vpslldq %%GH1M1, %%GH1M1, 8
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2 AES rounds
+%rep 2
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; 2 x AES ROUND
+
+ ;; =================================================
+ ;; Add mid product to high and low then
+ ;; horizontal xor of low and high 4x128
+%ifidn %%DO_REDUCTION, final_reduction
+ vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64
+ vpxorq %%GH1H, %%TO_REDUCE_H
+ vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64
+ vpxorq %%GH1L, %%TO_REDUCE_L
+%endif
+%ifidn %%DO_REDUCTION, do_reduction
+ vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64
+ vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64
+%endif
+%ifnidn %%DO_REDUCTION, no_reduction
+ VHPXORI4x128 %%GH1H, %%GH2H
+ VHPXORI4x128 %%GH1L, %%GH2L
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2 AES rounds
+%rep 2
+%if (aes_round < (NROUNDS + 1))
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif ; aes_round < (NROUNDS + 1)
+%endrep
+
+ ;; =================================================
+ ;; first phase of reduction
+%ifnidn %%DO_REDUCTION, no_reduction
+ vmovdqu64 XWORD(%%GH2M2), [rel POLY2]
+ vpclmulqdq XWORD(%%ZT15), XWORD(%%GH2M2), XWORD(%%GH1L), 0x01
+ vpslldq XWORD(%%ZT15), XWORD(%%ZT15), 8 ; shift-L 2 DWs
+        vpxorq          XWORD(%%ZT15), XWORD(%%GH1L), XWORD(%%ZT15)    ; first phase of the reduction
+%endif
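+        ;; Note: the reduction is performed modulo the GHASH field polynomial
+        ;; x^128 + x^7 + x^2 + x + 1; POLY2 appears to hold a pre-computed
+        ;; form of this polynomial used by the two-phase clmul reduction
+        ;; (the same constant is passed to VCLMUL_REDUCE elsewhere).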
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2 AES rounds
+%rep 2
+%if (aes_round < (NROUNDS + 1))
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif ; aes_round < (NROUNDS + 1)
+%endrep
+
+ ;; =================================================
+ ;; second phase of the reduction
+%ifnidn %%DO_REDUCTION, no_reduction
+ vpclmulqdq XWORD(%%ZT16), XWORD(%%GH2M2), XWORD(%%ZT15), 0x00
+ vpsrldq XWORD(%%ZT16), XWORD(%%ZT16), 4 ; shift-R 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq XWORD(%%ZT13), XWORD(%%GH2M2), XWORD(%%ZT15), 0x10
+ vpslldq XWORD(%%ZT13), XWORD(%%ZT13), 4 ; shift-L 1-DW for result without shifts
+ ;; ZT13 = ZT13 xor ZT16 xor GH1H
+ vpternlogq XWORD(%%ZT13), XWORD(%%ZT16), XWORD(%%GH1H), 0x96
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; all remaining AES rounds but the last
+%rep (NROUNDS + 2)
+%if (aes_round < (NROUNDS + 1))
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif ; aes_round < (NROUNDS + 1)
+%endrep
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; load/store mask (partial case) and load the text data
+%ifidn %%FULL_PARTIAL, full
+ VX512LDR %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ VX512LDR %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+%else
+ lea %%IA0, [rel byte64_len_to_mask_table]
+ mov %%IA1, %%LENGTH
+ sub %%IA1, 64
+ kmovq %%MASKREG, [%%IA0 + 8*%%IA1]
+ VX512LDR %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vmovdqu8 %%ZT5{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; the last AES round (NROUNDS + 1) and XOR against plain/cipher text
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; store the cipher/plain text data
+%ifidn %%FULL_PARTIAL, full
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2
+%else
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64]{%%MASKREG}, %%ZT2
+%endif
+
+ ;; =================================================
+ ;; prep cipher text blocks for the next ghash round
+
+%ifnidn %%FULL_PARTIAL, full
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; for partial block & multi_call we need encrypted counter block
+ vpxorq %%ZT3, %%ZT2, %%ZT5
+ vextracti32x4 %%AES_PARTIAL_BLOCK, %%ZT3, 3
+%endif
+ ;; for GHASH computation purpose clear the top bytes of the partial block
+%ifidn %%ENC_DEC, ENC
+ vmovdqu8 %%ZT2{%%MASKREG}{z}, %%ZT2
+%else
+ vmovdqu8 %%ZT5{%%MASKREG}{z}, %%ZT5
+%endif
+%endif ; %ifnidn %%FULL_PARTIAL, full
+
+ ;; =================================================
+ ;; shuffle cipher text blocks for GHASH computation
+%ifidn %%ENC_DEC, ENC
+ vpshufb %%GHASHIN_AESOUT_B03, %%ZT1, %%SHFMSK
+ vpshufb %%GHASHIN_AESOUT_B47, %%ZT2, %%SHFMSK
+%else
+ vpshufb %%GHASHIN_AESOUT_B03, %%ZT4, %%SHFMSK
+ vpshufb %%GHASHIN_AESOUT_B47, %%ZT5, %%SHFMSK
+%endif
+
+%ifidn %%DO_REDUCTION, do_reduction
+ ;; =================================================
+ ;; XOR current GHASH value (ZT13) into block 0
+ vpxorq %%GHASHIN_AESOUT_B03, %%ZT13
+%endif
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; =================================================
+ ;; Return GHASH value (ZT13) in TO_REDUCE_L
+ vmovdqa64 %%TO_REDUCE_L, %%ZT13
+%endif
+
+%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main GCM macro stitching cipher with GHASH
+;;; - operates on single stream
+;;; - encrypts 16 blocks at a time
+;;; - ghash the 16 previously encrypted ciphertext blocks
+;;; - no partial block or multi_call handling here
+%macro GHASH_16_ENCRYPT_16_PARALLEL 42
+%define %%GDATA %1 ; [in] key pointer
+%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%CTR_BE %5 ; [in/out] ZMM counter blocks (last 4) in big-endian
+%define %%CTR_CHECK %6 ; [in/out] GP with 8-bit counter for overflow check
+%define %%HASHKEY_OFFSET %7 ; [in] numerical offset for the highest hash key
+%define %%AESOUT_BLK_OFFSET %8 ; [in] numerical offset for AES-CTR out
+%define %%GHASHIN_BLK_OFFSET %9 ; [in] numerical offset for GHASH blocks in
+%define %%SHFMSK %10 ; [in] ZMM with byte swap mask for pshufb
+%define %%ZT1 %11 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT2 %12 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT3 %13 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT4 %14 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT5 %15 ; [clobbered/out] temporary ZMM or GHASH OUT (final_reduction)
+%define %%ZT6 %16 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT7 %17 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT8 %18 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT9 %19 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT10 %20 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT11 %21 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT12 %22 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT13 %23 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT14 %24 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT15 %25 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT16 %26 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT17 %27 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT18 %28 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT19 %29 ; [clobbered] temporary ZMM
+%define %%ZT20 %30 ; [clobbered] temporary ZMM
+%define %%ZT21 %31 ; [clobbered] temporary ZMM
+%define %%ZT22 %32 ; [clobbered] temporary ZMM
+%define %%ZT23 %33 ; [clobbered] temporary ZMM
+%define %%ADDBE_4x4 %34 ; [in] ZMM with 4x128bits 4 in big-endian
+%define %%ADDBE_1234 %35 ; [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
+%define %%TO_REDUCE_L %36 ; [in/out] ZMM for low 4x128-bit GHASH sum
+%define %%TO_REDUCE_H %37 ; [in/out] ZMM for hi 4x128-bit GHASH sum
+%define %%TO_REDUCE_M %38 ; [in/out] ZMM for medium 4x128-bit GHASH sum
+%define %%DO_REDUCTION %39 ; [in] "no_reduction", "final_reduction", "first_time"
+%define %%ENC_DEC %40 ; [in] cipher direction
+%define %%DATA_DISPL %41 ; [in] fixed numerical data displacement/offset
+%define %%GHASH_IN %42 ; [in] current GHASH value or "no_ghash_in"
+
+%define %%B00_03 %%ZT1
+%define %%B04_07 %%ZT2
+%define %%B08_11 %%ZT3
+%define %%B12_15 %%ZT4
+
+%define %%GH1H %%ZT5 ; @note: do not change this mapping
+%define %%GH1L %%ZT6
+%define %%GH1M %%ZT7
+%define %%GH1T %%ZT8
+
+%define %%GH2H %%ZT9
+%define %%GH2L %%ZT10
+%define %%GH2M %%ZT11
+%define %%GH2T %%ZT12
+
+%define %%RED_POLY %%GH2T
+%define %%RED_P1 %%GH2L
+%define %%RED_T1 %%GH2H
+%define %%RED_T2 %%GH2M
+
+%define %%GH3H %%ZT13
+%define %%GH3L %%ZT14
+%define %%GH3M %%ZT15
+%define %%GH3T %%ZT16
+
+%define %%DATA1 %%ZT13
+%define %%DATA2 %%ZT14
+%define %%DATA3 %%ZT15
+%define %%DATA4 %%ZT16
+
+%define %%AESKEY1 %%ZT17
+%define %%AESKEY2 %%ZT18
+
+%define %%GHKEY1 %%ZT19
+%define %%GHKEY2 %%ZT20
+%define %%GHDAT1 %%ZT21
+%define %%GHDAT2 %%ZT22
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; prepare counter blocks
+
+ cmp BYTE(%%CTR_CHECK), (256 - 16)
+ jae %%_16_blocks_overflow
+ vpaddd %%B00_03, %%CTR_BE, %%ADDBE_1234
+ vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4
+ vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4
+ vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4
+ jmp %%_16_blocks_ok
+%%_16_blocks_overflow:
+ vpshufb %%CTR_BE, %%CTR_BE, %%SHFMSK
+ vmovdqa64 %%B12_15, [rel ddq_add_4444]
+ vpaddd %%B00_03, %%CTR_BE, [rel ddq_add_1234]
+ vpaddd %%B04_07, %%B00_03, %%B12_15
+ vpaddd %%B08_11, %%B04_07, %%B12_15
+ vpaddd %%B12_15, %%B08_11, %%B12_15
+ vpshufb %%B00_03, %%SHFMSK
+ vpshufb %%B04_07, %%SHFMSK
+ vpshufb %%B08_11, %%SHFMSK
+ vpshufb %%B12_15, %%SHFMSK
+%%_16_blocks_ok:
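+        ;; Note: the counters are normally kept byte-reflected (big-endian)
+        ;; and incremented with the ADDBE constants, which is only safe while
+        ;; the low counter byte does not wrap; %%CTR_CHECK tracks that byte
+        ;; and, when a wrap is near (>= 256 - 16), the overflow path above
+        ;; byte-swaps to LE, adds, and swaps back so the carry propagates.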
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; pre-load constants
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 0)]
+%ifnidn %%GHASH_IN, no_ghash_in
+ vpxorq %%GHDAT1, %%GHASH_IN, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)]
+%else
+ vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)]
+%endif
+ vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (0*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; save counter for the next round
+ ;; increment counter overflow check register
+ vshufi64x2 %%CTR_BE, %%B12_15, %%B12_15, 1111_1111b
+ add BYTE(%%CTR_CHECK), 16
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; pre-load constants
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 1)]
+ vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (1*64)]
+ vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (1*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; stitch AES rounds with GHASH
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 0 - ARK
+
+ vpxorq %%B00_03, %%AESKEY1
+ vpxorq %%B04_07, %%AESKEY1
+ vpxorq %%B08_11, %%AESKEY1
+ vpxorq %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 2)]
+
+ ;;==================================================
+ ;; GHASH 4 blocks (15 to 12)
+ vpclmulqdq %%GH1H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1
+ vpclmulqdq %%GH1L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0
+ vpclmulqdq %%GH1M, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0
+ vpclmulqdq %%GH1T, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1
+
+ vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (2*64)]
+ vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (2*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 1
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 3)]
+
+ ;; =================================================
+ ;; GHASH 4 blocks (11 to 8)
+ vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1
+ vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0
+ vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1
+ vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0
+
+ vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (3*64)]
+ vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (3*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 2
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 4)]
+
+ ;; =================================================
+ ;; GHASH 4 blocks (7 to 4)
+ vpclmulqdq %%GH3M, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1
+ vpclmulqdq %%GH3T, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0
+ vpclmulqdq %%GH3H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1
+ vpclmulqdq %%GH3L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; AES round 3
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 5)]
+
+ ;; =================================================
+ ;; Gather (XOR) GHASH for 12 blocks
+ vpternlogq %%GH1H, %%GH2H, %%GH3H, 0x96
+ vpternlogq %%GH1L, %%GH2L, %%GH3L, 0x96
+ vpternlogq %%GH1T, %%GH2T, %%GH3T, 0x96
+ vpternlogq %%GH1M, %%GH2M, %%GH3M, 0x96
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; AES round 4
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 6)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; load plain/cipher text (recycle GH3xx registers)
+ VX512LDR %%DATA1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)]
+ VX512LDR %%DATA2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)]
+ VX512LDR %%DATA3, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)]
+ VX512LDR %%DATA4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; AES round 5
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 7)]
+
+ ;; =================================================
+ ;; GHASH 4 blocks (3 to 0)
+ vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1
+ vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0
+ vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1
+ vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 6
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 8)]
+
+ ;; =================================================
+ ;; gather GHASH in GH1L (low) and GH1H (high)
+%ifidn %%DO_REDUCTION, first_time
+ vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM
+ vpxorq %%TO_REDUCE_M, %%GH1M, %%GH2M ; TM
+ vpxorq %%TO_REDUCE_H, %%GH1H, %%GH2H ; TH
+ vpxorq %%TO_REDUCE_L, %%GH1L, %%GH2L ; TL
+%endif
+%ifidn %%DO_REDUCTION, no_reduction
+ vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM
+ vpternlogq %%TO_REDUCE_M, %%GH1M, %%GH2M, 0x96 ; TM
+ vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH
+ vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; TL
+%endif
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; phase 1: add mid products together
+ ;; also load polynomial constant for reduction
+ vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM
+ vpternlogq %%GH1M, %%TO_REDUCE_M, %%GH2M, 0x96
+
+ vpsrldq %%GH2M, %%GH1M, 8
+ vpslldq %%GH1M, %%GH1M, 8
+
+ vmovdqa64 XWORD(%%RED_POLY), [rel POLY2]
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 7
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 9)]
+
+ ;; =================================================
+ ;; Add mid product to high and low
+%ifidn %%DO_REDUCTION, final_reduction
+ vpternlogq %%GH1H, %%GH2H, %%GH2M, 0x96 ; TH = TH1 + TH2 + TM>>64
+ vpxorq %%GH1H, %%TO_REDUCE_H
+ vpternlogq %%GH1L, %%GH2L, %%GH1M, 0x96 ; TL = TL1 + TL2 + TM<<64
+ vpxorq %%GH1L, %%TO_REDUCE_L
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 8
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 10)]
+
+ ;; =================================================
+ ;; horizontal xor of low and high 4x128
+%ifidn %%DO_REDUCTION, final_reduction
+ VHPXORI4x128 %%GH1H, %%GH2H
+ VHPXORI4x128 %%GH1L, %%GH2L
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 9
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+%if (NROUNDS >= 11)
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 11)]
+%endif
+ ;; =================================================
+ ;; first phase of reduction
+%ifidn %%DO_REDUCTION, final_reduction
+ vpclmulqdq XWORD(%%RED_P1), XWORD(%%RED_POLY), XWORD(%%GH1L), 0x01
+ vpslldq XWORD(%%RED_P1), XWORD(%%RED_P1), 8 ; shift-L 2 DWs
+        vpxorq          XWORD(%%RED_P1), XWORD(%%GH1L), XWORD(%%RED_P1)        ; first phase of the reduction
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES rounds up to 11 (AES192) or 13 (AES256)
+ ;; AES128 is done
+%if (NROUNDS >= 11)
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 12)]
+
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+%if (NROUNDS == 13)
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 13)]
+
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 14)]
+
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+%endif ; GCM256 / NROUNDS = 13 (15 including the first and the last)
+%endif ; GCM192 / NROUNDS = 11 (13 including the first and the last)
+
+ ;; =================================================
+ ;; second phase of the reduction
+%ifidn %%DO_REDUCTION, final_reduction
+ vpclmulqdq XWORD(%%RED_T1), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x00
+ vpsrldq XWORD(%%RED_T1), XWORD(%%RED_T1), 4 ; shift-R 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq XWORD(%%RED_T2), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x10
+ vpslldq XWORD(%%RED_T2), XWORD(%%RED_T2), 4 ; shift-L 1-DW for result without shifts
+        ;; GH1H = GH1H xor RED_T1 xor RED_T2
+ vpternlogq XWORD(%%GH1H), XWORD(%%RED_T2), XWORD(%%RED_T1), 0x96
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; the last AES round
+ vaesenclast %%B00_03, %%B00_03, %%AESKEY1
+ vaesenclast %%B04_07, %%B04_07, %%AESKEY1
+ vaesenclast %%B08_11, %%B08_11, %%AESKEY1
+ vaesenclast %%B12_15, %%B12_15, %%AESKEY1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; XOR against plain/cipher text
+ vpxorq %%B00_03, %%B00_03, %%DATA1
+ vpxorq %%B04_07, %%B04_07, %%DATA2
+ vpxorq %%B08_11, %%B08_11, %%DATA3
+ vpxorq %%B12_15, %%B12_15, %%DATA4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; store cipher/plain text
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)], %%B00_03
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)], %%B04_07
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)], %%B08_11
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)], %%B12_15
+
+ ;; =================================================
+ ;; shuffle cipher text blocks for GHASH computation
+%ifidn %%ENC_DEC, ENC
+ vpshufb %%B00_03, %%B00_03, %%SHFMSK
+ vpshufb %%B04_07, %%B04_07, %%SHFMSK
+ vpshufb %%B08_11, %%B08_11, %%SHFMSK
+ vpshufb %%B12_15, %%B12_15, %%SHFMSK
+%else
+ vpshufb %%B00_03, %%DATA1, %%SHFMSK
+ vpshufb %%B04_07, %%DATA2, %%SHFMSK
+ vpshufb %%B08_11, %%DATA3, %%SHFMSK
+ vpshufb %%B12_15, %%DATA4, %%SHFMSK
+%endif
+
+ ;; =================================================
+ ;; store shuffled cipher text for ghashing
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (0*64)], %%B00_03
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (1*64)], %%B04_07
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (2*64)], %%B08_11
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (3*64)], %%B12_15
+
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; =================================================
+ ;; Return GHASH value through %%GH1H
+%endif
+
+%endmacro ; GHASH_16_ENCRYPT_16_PARALLEL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GHASH the last 8 ciphertext blocks.
+;;; - optionally accepts GHASH product sums as input
+%macro GHASH_LAST_8 10-13
+%define %%GDATA %1 ; [in] key pointer
+%define %%BL47 %2 ; [in/clobbered] ZMM AES blocks 4 to 7
+%define %%BL03          %3      ; [in/clobbered] ZMM AES blocks 0 to 3
+%define %%ZTH           %4      ; [clobbered] ZMM temporary
+%define %%ZTM           %5      ; [clobbered] ZMM temporary
+%define %%ZTL           %6      ; [clobbered] ZMM temporary
+%define %%ZT01          %7      ; [clobbered] ZMM temporary
+%define %%ZT02          %8      ; [clobbered] ZMM temporary
+%define %%ZT03          %9      ; [clobbered] ZMM temporary
+%define %%AAD_HASH %10 ; [out] XMM hash value
+%define %%GH %11 ; [in/optional] ZMM with GHASH high product sum
+%define %%GL %12 ; [in/optional] ZMM with GHASH low product sum
+%define %%GM %13 ; [in/optional] ZMM with GHASH mid product sum
+
+ VCLMUL_STEP1 %%GDATA, %%BL47, %%ZT01, %%ZTH, %%ZTM, %%ZTL
+
+%if %0 > 10
+ ;; add optional sums before step2
+ vpxorq %%ZTH, %%ZTH, %%GH
+ vpxorq %%ZTL, %%ZTL, %%GL
+ vpxorq %%ZTM, %%ZTM, %%GM
+%endif
+
+ VCLMUL_STEP2 %%GDATA, %%BL47, %%BL03, %%ZT01, %%ZT02, %%ZT03, %%ZTH, %%ZTM, %%ZTL
+
+ vmovdqa64 XWORD(%%ZT03), [rel POLY2]
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%ZT03), XWORD(%%BL47), XWORD(%%BL03), \
+ XWORD(%%ZT01), XWORD(%%ZT02)
+%endmacro ; GHASH_LAST_8
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GHASH the last 7 cipher text blocks.
+;;; - it uses the same GHASH macros as GHASH_LAST_8 but with a twist
+;;; - it loads GHASH keys for each of the data blocks, so that:
+;;;   - blocks 4, 5 and 6 use GHASH keys 3, 2, 1 respectively
+;;;   - the code ensures that the unused block 7 and its GHASH key are zeroed
+;;;     (the clmul product is zero this way and does not affect the result)
+;;;   - blocks 0, 1, 2 and 3 use GHASH keys 7, 6, 5 and 4 respectively
+;;; - optionally accepts GHASH product sums as input
+%macro GHASH_LAST_7 13-16
+%define %%GDATA %1 ; [in] key pointer
+%define %%BL47 %2 ; [in/clobbered] ZMM AES blocks 4 to 7
+%define %%BL03          %3      ; [in/clobbered] ZMM AES blocks 0 to 3
+%define %%ZTH           %4      ; [clobbered] ZMM temporary
+%define %%ZTM           %5      ; [clobbered] ZMM temporary
+%define %%ZTL           %6      ; [clobbered] ZMM temporary
+%define %%ZT01          %7      ; [clobbered] ZMM temporary
+%define %%ZT02          %8      ; [clobbered] ZMM temporary
+%define %%ZT03          %9      ; [clobbered] ZMM temporary
+%define %%ZT04          %10     ; [clobbered] ZMM temporary
+%define %%AAD_HASH %11 ; [out] XMM hash value
+%define %%MASKREG %12 ; [clobbered] mask register to use for loads
+%define %%IA0 %13 ; [clobbered] GP temporary register
+%define %%GH %14 ; [in/optional] ZMM with GHASH high product sum
+%define %%GL %15 ; [in/optional] ZMM with GHASH low product sum
+%define %%GM %16 ; [in/optional] ZMM with GHASH mid product sum
+
+ vmovdqa64 XWORD(%%ZT04), [rel POLY2]
+
+ VCLMUL_1_TO_8_STEP1 %%GDATA, %%BL47, %%ZT01, %%ZT02, %%ZTH, %%ZTM, %%ZTL, 7
+
+%if %0 > 13
+ ;; add optional sums before step2
+ vpxorq %%ZTH, %%ZTH, %%GH
+ vpxorq %%ZTL, %%ZTL, %%GL
+ vpxorq %%ZTM, %%ZTM, %%GM
+%endif
+
+ VCLMUL_1_TO_8_STEP2 %%GDATA, %%BL47, %%BL03, \
+ %%ZT01, %%ZT02, %%ZT03, \
+ %%ZTH, %%ZTM, %%ZTL, 7
+
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%ZT04), XWORD(%%BL47), XWORD(%%BL03), \
+ XWORD(%%ZT01), XWORD(%%ZT02)
+%endmacro ; GHASH_LAST_7
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Encryption of a single block
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxorq %%XMM0, %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i]
+%endmacro
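+;;; Note: ENCRYPT_SINGLE_BLOCK is a plain (non-stitched) AES encryption of one
+;;; 16-byte block: initial key add, NROUNDS middle rounds and the final round,
+;;; i.e. all (NROUNDS + 2) round keys of the expanded key schedule.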
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Save register content for the caller
+%macro FUNC_SAVE 0
+        ;; Required for Update/GCM_ENC
+        ;; the number of pushes must equal STACK_OFFSET
+ mov rax, rsp
+
+ sub rsp, STACK_FRAME_SIZE
+ and rsp, ~63
+
+ mov [rsp + STACK_GP_OFFSET + 0*8], r12
+ mov [rsp + STACK_GP_OFFSET + 1*8], r13
+ mov [rsp + STACK_GP_OFFSET + 2*8], r14
+ mov [rsp + STACK_GP_OFFSET + 3*8], r15
+ mov [rsp + STACK_GP_OFFSET + 4*8], rax ; stack
+ mov r14, rax ; r14 is used to retrieve stack args
+ mov [rsp + STACK_GP_OFFSET + 5*8], rbp
+ mov [rsp + STACK_GP_OFFSET + 6*8], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + STACK_GP_OFFSET + 7*8], rdi
+ mov [rsp + STACK_GP_OFFSET + 8*8], rsi
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + STACK_XMM_OFFSET + 0*16], xmm6
+ vmovdqu [rsp + STACK_XMM_OFFSET + 1*16], xmm7
+ vmovdqu [rsp + STACK_XMM_OFFSET + 2*16], xmm8
+ vmovdqu [rsp + STACK_XMM_OFFSET + 3*16], xmm9
+ vmovdqu [rsp + STACK_XMM_OFFSET + 4*16], xmm10
+ vmovdqu [rsp + STACK_XMM_OFFSET + 5*16], xmm11
+ vmovdqu [rsp + STACK_XMM_OFFSET + 6*16], xmm12
+ vmovdqu [rsp + STACK_XMM_OFFSET + 7*16], xmm13
+ vmovdqu [rsp + STACK_XMM_OFFSET + 8*16], xmm14
+ vmovdqu [rsp + STACK_XMM_OFFSET + 9*16], xmm15
+%endif
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Restore register content for the caller
+%macro FUNC_RESTORE 0
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%else
+ vzeroupper
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + STACK_XMM_OFFSET + 9*16]
+ vmovdqu xmm14, [rsp + STACK_XMM_OFFSET + 8*16]
+ vmovdqu xmm13, [rsp + STACK_XMM_OFFSET + 7*16]
+ vmovdqu xmm12, [rsp + STACK_XMM_OFFSET + 6*16]
+ vmovdqu xmm11, [rsp + STACK_XMM_OFFSET + 5*16]
+ vmovdqu xmm10, [rsp + STACK_XMM_OFFSET + 4*16]
+ vmovdqu xmm9, [rsp + STACK_XMM_OFFSET + 3*16]
+ vmovdqu xmm8, [rsp + STACK_XMM_OFFSET + 2*16]
+ vmovdqu xmm7, [rsp + STACK_XMM_OFFSET + 1*16]
+ vmovdqu xmm6, [rsp + STACK_XMM_OFFSET + 0*16]
+%endif
+
+        ;; Required for Update/GCM_ENC
+ mov rbp, [rsp + STACK_GP_OFFSET + 5*8]
+ mov rbx, [rsp + STACK_GP_OFFSET + 6*8]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [rsp + STACK_GP_OFFSET + 7*8]
+ mov rsi, [rsp + STACK_GP_OFFSET + 8*8]
+%endif
+ mov r12, [rsp + STACK_GP_OFFSET + 0*8]
+ mov r13, [rsp + STACK_GP_OFFSET + 1*8]
+ mov r14, [rsp + STACK_GP_OFFSET + 2*8]
+ mov r15, [rsp + STACK_GP_OFFSET + 3*8]
+ mov rsp, [rsp + STACK_GP_OFFSET + 4*8] ; stack
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+;;; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
+;;;        Additional Authenticated Data (A_IN), AAD length in bytes (A_LEN).
+;;; Output: GDATA_CTX updated with the hash of A_IN (AadHash) and with the
+;;;         remaining context fields initialized.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 21
+%define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer
+%define %%GDATA_CTX %2 ; [in] GCM context pointer
+%define %%IV %3 ; [in] IV pointer
+%define %%A_IN %4 ; [in] AAD pointer
+%define %%A_LEN %5 ; [in] AAD length in bytes
+%define %%GPR1 %6 ; [clobbered] GP register
+%define %%GPR2 %7 ; [clobbered] GP register
+%define %%GPR3 %8 ; [clobbered] GP register
+%define %%MASKREG %9 ; [clobbered] mask register
+%define %%AAD_HASH %10 ; [out] XMM for AAD_HASH value (xmm14)
+%define %%CUR_COUNT %11 ; [out] XMM with current counter (xmm2)
+%define %%ZT0 %12 ; [clobbered] ZMM register
+%define %%ZT1 %13 ; [clobbered] ZMM register
+%define %%ZT2 %14 ; [clobbered] ZMM register
+%define %%ZT3 %15 ; [clobbered] ZMM register
+%define %%ZT4 %16 ; [clobbered] ZMM register
+%define %%ZT5 %17 ; [clobbered] ZMM register
+%define %%ZT6 %18 ; [clobbered] ZMM register
+%define %%ZT7 %19 ; [clobbered] ZMM register
+%define %%ZT8 %20 ; [clobbered] ZMM register
+%define %%ZT9 %21 ; [clobbered] ZMM register
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%ZT9, \
+ %%GPR1, %%GPR2, %%GPR3, %%MASKREG
+
+ mov %%GPR1, %%A_LEN
+ vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx.aad_length = aad_length
+
+ xor %%GPR1, %%GPR1
+ mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx.partial_block_length = 0
+
+ ;; read 12 IV bytes and pad with 0x00000001
+ vmovdqu8 %%CUR_COUNT, [rel ONEf]
+ mov %%GPR2, %%IV
+ mov %%GPR1, 0x0000_0000_0000_0fff
+ kmovq %%MASKREG, %%GPR1
+ vmovdqu8 %%CUR_COUNT{%%MASKREG}, [%%GPR2] ; ctr = IV | 0x1
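+        ;; (for a 96-bit IV this builds the standard GCM J0 block:
+        ;;  J0 = IV || 0^31 || 1, i.e. the IV padded with a big-endian
+        ;;  32-bit counter value of 1)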
+
+ vmovdqu64 [%%GDATA_CTX + OrigIV], %%CUR_COUNT ; ctx.orig_IV = iv
+
+ ;; store IV as counter in LE format
+ vpshufb %%CUR_COUNT, [rel SHUF_MASK]
+ vmovdqu [%%GDATA_CTX + CurCount], %%CUR_COUNT ; ctx.current_counter = iv
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Cipher and ghash of payloads shorter than 256 bytes
+;;; - number of blocks in the message comes as argument
+;;; - depending on the number of blocks an optimized variant of
+;;; INITIAL_BLOCKS_PARTIAL is invoked
+%macro GCM_ENC_DEC_SMALL 42
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer
+%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length
+%define %%ENC_DEC %6 ; [in] cipher direction
+%define %%DATA_OFFSET %7 ; [in] data offset
+%define %%LENGTH %8 ; [in] data length
+%define %%NUM_BLOCKS %9 ; [in] number of blocks to process 1 to 16
+%define %%CTR %10 ; [in/out] XMM counter block
+%define %%HASH_IN_OUT %11 ; [in/out] XMM GHASH value
+%define %%INSTANCE_TYPE %12 ; [in] single or multi call
+%define %%ZTMP0 %13 ; [clobbered] ZMM register
+%define %%ZTMP1 %14 ; [clobbered] ZMM register
+%define %%ZTMP2 %15 ; [clobbered] ZMM register
+%define %%ZTMP3 %16 ; [clobbered] ZMM register
+%define %%ZTMP4 %17 ; [clobbered] ZMM register
+%define %%ZTMP5 %18 ; [clobbered] ZMM register
+%define %%ZTMP6 %19 ; [clobbered] ZMM register
+%define %%ZTMP7 %20 ; [clobbered] ZMM register
+%define %%ZTMP8 %21 ; [clobbered] ZMM register
+%define %%ZTMP9 %22 ; [clobbered] ZMM register
+%define %%ZTMP10 %23 ; [clobbered] ZMM register
+%define %%ZTMP11 %24 ; [clobbered] ZMM register
+%define %%ZTMP12 %25 ; [clobbered] ZMM register
+%define %%ZTMP13 %26 ; [clobbered] ZMM register
+%define %%ZTMP14 %27 ; [clobbered] ZMM register
+%define %%ZTMP15 %28 ; [clobbered] ZMM register
+%define %%ZTMP16 %29 ; [clobbered] ZMM register
+%define %%ZTMP17 %30 ; [clobbered] ZMM register
+%define %%ZTMP18 %31 ; [clobbered] ZMM register
+%define %%ZTMP19 %32 ; [clobbered] ZMM register
+%define %%ZTMP20 %33 ; [clobbered] ZMM register
+%define %%ZTMP21 %34 ; [clobbered] ZMM register
+%define %%ZTMP22 %35 ; [clobbered] ZMM register
+%define %%GH %36 ; [in] ZMM ghash sum (high)
+%define %%GL %37 ; [in] ZMM ghash sum (low)
+%define %%GM %38 ; [in] ZMM ghash sum (middle)
+%define %%IA0 %39 ; [clobbered] GP register
+%define %%IA1 %40 ; [clobbered] GP register
+%define %%MASKREG %41 ; [clobbered] mask register
+%define %%SHUFMASK %42 ; [in] ZMM with BE/LE shuffle mask
+
+ cmp %%NUM_BLOCKS, 8
+ je %%_small_initial_num_blocks_is_8
+ jl %%_small_initial_num_blocks_is_7_1
+
+
+ cmp %%NUM_BLOCKS, 12
+ je %%_small_initial_num_blocks_is_12
+ jl %%_small_initial_num_blocks_is_11_9
+
+ ;; 16, 15, 14 or 13
+ cmp %%NUM_BLOCKS, 16
+ je %%_small_initial_num_blocks_is_16
+ cmp %%NUM_BLOCKS, 15
+ je %%_small_initial_num_blocks_is_15
+ cmp %%NUM_BLOCKS, 14
+ je %%_small_initial_num_blocks_is_14
+ jmp %%_small_initial_num_blocks_is_13
+
+%%_small_initial_num_blocks_is_11_9:
+ ;; 11, 10 or 9
+ cmp %%NUM_BLOCKS, 11
+ je %%_small_initial_num_blocks_is_11
+ cmp %%NUM_BLOCKS, 10
+ je %%_small_initial_num_blocks_is_10
+ jmp %%_small_initial_num_blocks_is_9
+
+%%_small_initial_num_blocks_is_7_1:
+ cmp %%NUM_BLOCKS, 4
+ je %%_small_initial_num_blocks_is_4
+ jl %%_small_initial_num_blocks_is_3_1
+ ;; 7, 6 or 5
+ cmp %%NUM_BLOCKS, 7
+ je %%_small_initial_num_blocks_is_7
+ cmp %%NUM_BLOCKS, 6
+ je %%_small_initial_num_blocks_is_6
+ jmp %%_small_initial_num_blocks_is_5
+
+%%_small_initial_num_blocks_is_3_1:
+ ;; 3, 2 or 1
+ cmp %%NUM_BLOCKS, 3
+ je %%_small_initial_num_blocks_is_3
+ cmp %%NUM_BLOCKS, 2
+ je %%_small_initial_num_blocks_is_2
+
+ ;; for %%NUM_BLOCKS == 1, just fall through and no 'jmp' needed
+
+        ;; Use %rep to generate the different block count variants
+        ;; - the 1-block variant has to come first (it is the fall-through case)
+%assign num_blocks 1
+%rep 16
+%%_small_initial_num_blocks_is_ %+ num_blocks :
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, num_blocks, \
+ %%CTR, %%HASH_IN_OUT, %%ENC_DEC, %%INSTANCE_TYPE, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \
+ %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \
+ %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFMASK
+%if num_blocks != 16
+ jmp %%_small_initial_blocks_encrypted
+%endif
+%assign num_blocks (num_blocks + 1)
+%endrep
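+        ;; Note: the %rep above expands 16 specialized instances of
+        ;; INITIAL_BLOCKS_PARTIAL (one per block count); the compare/jump
+        ;; ladder at the top of this macro selects the matching instance and
+        ;; every instance except the 16-block one jumps to the common exit.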
+
+%%_small_initial_blocks_encrypted:
+
+%endmacro ; GCM_ENC_DEC_SMALL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data to be at least 1 byte long because of READ_SMALL_INPUT_DATA.
+; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
+; Output: A cipher of the given plain text (CYPH_PLAIN_OUT) and an updated GDATA_CTX
+; Clobbers rax, r10-r15, and zmm0-zmm31, k1
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 7
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer pointer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer pointer
+%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length
+%define %%ENC_DEC %6 ; [in] cipher direction
+%define %%INSTANCE_TYPE %7 ; [in] 'single_call' or 'multi_call' selection
+
+%define %%IA0 r10
+%define %%IA1 r12
+%define %%IA2 r13
+%define %%IA3 r15
+%define %%IA4 r11
+%define %%IA5 rax
+
+%define %%LENGTH %%IA2
+%define %%CTR_CHECK %%IA3
+%define %%DATA_OFFSET %%IA4
+
+%define %%HASHK_PTR %%IA5
+
+%define %%GCM_INIT_CTR_BLOCK xmm2 ; hardcoded in GCM_INIT for now
+
+%define %%AES_PARTIAL_BLOCK xmm8
+%define %%CTR_BLOCK2z zmm18
+%define %%CTR_BLOCKz zmm9
+%define %%CTR_BLOCKx xmm9
+%define %%AAD_HASHz zmm14
+%define %%AAD_HASHx xmm14
+
+;;; ZTMP0 - ZTMP12 - used in by8 code, by128/48 code and GCM_ENC_DEC_SMALL
+%define %%ZTMP0 zmm0
+%define %%ZTMP1 zmm3
+%define %%ZTMP2 zmm4
+%define %%ZTMP3 zmm5
+%define %%ZTMP4 zmm6
+%define %%ZTMP5 zmm7
+%define %%ZTMP6 zmm10
+%define %%ZTMP7 zmm11
+%define %%ZTMP8 zmm12
+%define %%ZTMP9 zmm13
+%define %%ZTMP10 zmm15
+%define %%ZTMP11 zmm16
+%define %%ZTMP12 zmm17
+
+;;; ZTMP13 - ZTMP22 - used in by128/48 code and GCM_ENC_DEC_SMALL
+;;; - some are also used by the by8 code through TMPxy names
+%define %%ZTMP13 zmm19
+%define %%ZTMP14 zmm20
+%define %%ZTMP15 zmm21
+%define %%ZTMP16 zmm30 ; can be used in very/big_loop part
+%define %%ZTMP17 zmm31 ; can be used in very/big_loop part
+%define %%ZTMP18 zmm1
+%define %%ZTMP19 zmm2
+%define %%ZTMP20 zmm8
+%define %%ZTMP21 zmm22
+%define %%ZTMP22 zmm23
+
+;;; Free to use: zmm24 - zmm29
+;;; - used by by128/48 and by8
+%define %%GH zmm24
+%define %%GL zmm25
+%define %%GM zmm26
+%define %%SHUF_MASK zmm29
+%define %%CTR_BLOCK_SAVE zmm28
+
+;;; - used by by128/48 code only
+%define %%ADDBE_4x4 zmm27
+%define %%ADDBE_1234 zmm28 ; conflicts with CTR_BLOCK_SAVE
+
+;; used by8 code only
+%define %%GH4KEY %%ZTMP17
+%define %%GH8KEY %%ZTMP16
+%define %%BLK0 %%ZTMP18
+%define %%BLK1 %%ZTMP19
+%define %%ADD8BE zmm27
+%define %%ADD8LE %%ZTMP13
+
+%define %%MASKREG k1
+
+%ifdef GCM_BIG_DATA
+;; reduction every 128 blocks, depth 32 blocks
+;; @note 128 blocks is the maximum capacity of the stack frame when
+;; GCM_BIG_DATA is defined
+%assign very_big_loop_nblocks 128
+%assign very_big_loop_depth 32
+%endif
+
+;; reduction every 48 blocks, depth 32 blocks
+;; @note 48 blocks is the maximum capacity of the stack frame when
+;; GCM_BIG_DATA is not defined
+%assign big_loop_nblocks 48
+%assign big_loop_depth 32
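+;; Note: with these parameters the GHASH partial products are accumulated in
+;; the GH/GL/GM registers and reduced only once per 48-block (or, with
+;; GCM_BIG_DATA, 128-block) chunk; the stack frame buffers the corresponding
+;; AES output / GHASH input blocks, which appears to be what bounds the
+;; chunk size.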
+
+;;; Macro flow:
+;;; - for message size bigger than very_big_loop_nblocks process data
+;;; with "very_big_loop" parameters
+;;; - for message size bigger than big_loop_nblocks process data
+;;; with "big_loop" parameters
+;;; - calculate the number of 16-byte blocks in the message
+;;; - process (number of 16-byte blocks) mod 8
+;;;   '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+;;; - process 8 x 16-byte blocks at a time until all are done in %%_encrypt_by_8_new
+
+%ifidn __OUTPUT_FORMAT__, win64
+ cmp %%PLAIN_CYPH_LEN, 0
+%else
+ or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN
+%endif
+ je %%_enc_dec_done
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+
+ ;; Update length of data processed
+%ifidn __OUTPUT_FORMAT__, win64
+ mov %%IA0, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + InLen], %%IA0
+%else
+ add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
+%endif
+ vmovdqu64 %%AAD_HASHx, [%%GDATA_CTX + AadHash]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+        ;; NOTE: partial block processing only makes sense for multi_call here.
+        ;; Used for the update flow - if there was a previous partial
+        ;; block, fill the remaining bytes here.
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%PLAIN_CYPH_LEN, %%DATA_OFFSET, %%AAD_HASHx, %%ENC_DEC, \
+ %%IA0, %%IA1, %%IA2, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%MASKREG
+%endif
+
+ ;; lift counter block from GCM_INIT to here
+%ifidn %%INSTANCE_TYPE, single_call
+ vmovdqu64 %%CTR_BLOCKx, %%GCM_INIT_CTR_BLOCK
+%else
+ vmovdqu64 %%CTR_BLOCKx, [%%GDATA_CTX + CurCount]
+%endif
+
+ ;; Save the amount of data left to process in %%LENGTH
+ mov %%LENGTH, %%PLAIN_CYPH_LEN
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%DATA_OFFSET is zero in single_call case.
+ ;; Consequently PLAIN_CYPH_LEN will never be zero after
+ ;; %%DATA_OFFSET subtraction below.
+ ;; There may be no more data if it was consumed in the partial block.
+ sub %%LENGTH, %%DATA_OFFSET
+ je %%_enc_dec_done
+%endif ; %%INSTANCE_TYPE, multi_call
+
+ vmovdqa64 %%SHUF_MASK, [rel SHUF_MASK]
+ vmovdqa64 %%ADDBE_4x4, [rel ddq_addbe_4444]
+
+%ifdef GCM_BIG_DATA
+ vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234]
+
+ cmp %%LENGTH, (very_big_loop_nblocks * 16)
+ jl %%_message_below_very_big_nblocks
+
+ INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%SHUF_MASK, %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth
+
+ sub %%LENGTH, (very_big_loop_nblocks * 16)
+ cmp %%LENGTH, (very_big_loop_nblocks * 16)
+ jl %%_no_more_very_big_nblocks
+
+%%_encrypt_very_big_nblocks:
+ GHASH_ENCRYPT_Nx16_PARALLEL \
+ %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%CTR_BLOCKz, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \
+ %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth, %%CTR_CHECK
+
+ sub %%LENGTH, (very_big_loop_nblocks * 16)
+ cmp %%LENGTH, (very_big_loop_nblocks * 16)
+ jge %%_encrypt_very_big_nblocks
+
+%%_no_more_very_big_nblocks:
+ vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK)
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%GH, %%GL, %%GM, very_big_loop_nblocks, very_big_loop_depth
+
+ or %%LENGTH, %%LENGTH
+ jz %%_ghash_done
+
+%%_message_below_very_big_nblocks:
+%endif ; GCM_BIG_DATA
+
+ cmp %%LENGTH, (big_loop_nblocks * 16)
+ jl %%_message_below_big_nblocks
+
+ ;; overwritten above by CTR_BLOCK_SAVE
+ vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234]
+
+ INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%SHUF_MASK, %%ENC_DEC, big_loop_nblocks, big_loop_depth
+
+ sub %%LENGTH, (big_loop_nblocks * 16)
+ cmp %%LENGTH, (big_loop_nblocks * 16)
+ jl %%_no_more_big_nblocks
+
+%%_encrypt_big_nblocks:
+ GHASH_ENCRYPT_Nx16_PARALLEL \
+ %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%CTR_BLOCKz, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \
+ %%ENC_DEC, big_loop_nblocks, big_loop_depth, %%CTR_CHECK
+
+ sub %%LENGTH, (big_loop_nblocks * 16)
+ cmp %%LENGTH, (big_loop_nblocks * 16)
+ jge %%_encrypt_big_nblocks
+
+%%_no_more_big_nblocks:
+ vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK)
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%GH, %%GL, %%GM, big_loop_nblocks, big_loop_depth
+
+ or %%LENGTH, %%LENGTH
+ jz %%_ghash_done
+
+%%_message_below_big_nblocks:
+
+ ;; Less than 256 bytes will be handled by the small message code, which
+        ;; can process up to 16 blocks (16 bytes each)
+ cmp %%LENGTH, (16 * 16)
+ jge %%_large_message_path
+
+ ;; Determine how many blocks to process
+ ;; - process one additional block if there is a partial block
+ mov %%IA1, %%LENGTH
+ add %%IA1, 15
+ shr %%IA1, 4
+ ;; %%IA1 can be in the range from 0 to 16
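+        ;; Illustrative sketch (comment only, not assembled), assuming %%LENGTH
+        ;; holds the remaining byte count: the three instructions above are a
+        ;; ceiling division by the 16-byte block size, i.e. in C:
+        ;;     num_blocks = (length + 15) >> 4;    /* ceil(length / 16)    */
+        ;; so a trailing partial block costs one extra block of processing.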
+
+ GCM_ENC_DEC_SMALL \
+ %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, \
+ %%LENGTH, %%IA1, %%CTR_BLOCKx, %%AAD_HASHx, %%INSTANCE_TYPE, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ no_zmm, no_zmm, no_zmm, \
+ %%IA0, %%IA3, %%MASKREG, %%SHUF_MASK
+
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ jmp %%_ghash_done
+
+%%_large_message_path:
+ ;; Determine how many blocks to process in INITIAL
+ ;; - process one additional block in INITIAL if there is a partial block
+ mov %%IA1, %%LENGTH
+ and %%IA1, 0xff
+ add %%IA1, 15
+ shr %%IA1, 4
+ ;; Don't allow 8 INITIAL blocks since this will
+ ;; be handled by the x8 partial loop.
+ and %%IA1, 7
+ je %%_initial_num_blocks_is_0
+ cmp %%IA1, 1
+ je %%_initial_num_blocks_is_1
+ cmp %%IA1, 2
+ je %%_initial_num_blocks_is_2
+ cmp %%IA1, 3
+ je %%_initial_num_blocks_is_3
+ cmp %%IA1, 4
+ je %%_initial_num_blocks_is_4
+ cmp %%IA1, 5
+ je %%_initial_num_blocks_is_5
+ cmp %%IA1, 6
+ je %%_initial_num_blocks_is_6
+
+%assign number_of_blocks 7
+%rep 8
+%%_initial_num_blocks_is_ %+ number_of_blocks:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, number_of_blocks, %%CTR_BLOCKx, %%AAD_HASHz, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%IA0, %%IA1, %%ENC_DEC, %%MASKREG, %%SHUF_MASK, no_partial_block
+%if number_of_blocks != 0
+ jmp %%_initial_blocks_encrypted
+%endif
+%assign number_of_blocks (number_of_blocks - 1)
+%endrep
+
+%%_initial_blocks_encrypted:
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+        ;; move cipher blocks from the initial blocks to the input of the by8 macro
+        ;; and to GHASH_LAST_8/7
+ ;; - ghash value already xor'ed into block 0
+ vmovdqa64 %%BLK0, %%ZTMP0
+ vmovdqa64 %%BLK1, %%ZTMP1
+
+ ;; The entire message cannot get processed in INITIAL_BLOCKS
+ ;; - GCM_ENC_DEC_SMALL handles up to 16 blocks
+ ;; - INITIAL_BLOCKS processes up to 15 blocks
+ ;; - no need to check for zero length at this stage
+
+ ;; In order to have only one reduction at the end
+ ;; start HASH KEY pointer needs to be determined based on length and
+ ;; call type.
+ ;; - note that 8 blocks are already ciphered in INITIAL_BLOCKS and
+ ;; subtracted from LENGTH
+ lea %%IA1, [%%LENGTH + (8 * 16)]
+ add %%IA1, 15
+ and %%IA1, 0x3f0
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; if partial block and multi_call then change hash key start by one
+ mov %%IA0, %%LENGTH
+ and %%IA0, 15
+ add %%IA0, 15
+ and %%IA0, 16
+ sub %%IA1, %%IA0
+%endif
+ lea %%HASHK_PTR, [%%GDATA_KEY + HashKey + 16]
+ sub %%HASHK_PTR, %%IA1
+ ;; HASHK_PTR
+ ;; - points at the first hash key to start GHASH with
+ ;; - needs to be updated as the message is processed (incremented)
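+        ;; Illustrative sketch (comment only, not assembled) of the pointer
+        ;; math above, assuming the hash key powers are laid out so that
+        ;; H^n is read from [GDATA_KEY + HashKey + 16 - 16*n]:
+        ;;     bytes_to_hash = (length + 8*16 + 15) & 0x3f0; /* round up to 16B */
+        ;;     if (multi_call && (length & 15))
+        ;;             bytes_to_hash -= 16;   /* partial block hashed later     */
+        ;;     hashk_ptr = gdata_key + HashKey + 16 - bytes_to_hash;
+        ;; starting at H^(bytes_to_hash/16) lets the GHASH walk down to H^1
+        ;; and be reduced only once at the end.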
+
+ ;; pre-load constants
+ vmovdqa64 %%ADD8BE, [rel ddq_addbe_8888]
+ vmovdqa64 %%ADD8LE, [rel ddq_add_8888]
+ vpxorq %%GH, %%GH
+ vpxorq %%GL, %%GL
+ vpxorq %%GM, %%GM
+
+ ;; prepare counter 8 blocks
+ vshufi64x2 %%CTR_BLOCKz, %%CTR_BLOCKz, %%CTR_BLOCKz, 0
+ vpaddd %%CTR_BLOCK2z, %%CTR_BLOCKz, [rel ddq_add_5678]
+ vpaddd %%CTR_BLOCKz, %%CTR_BLOCKz, [rel ddq_add_1234]
+ vpshufb %%CTR_BLOCKz, %%SHUF_MASK
+ vpshufb %%CTR_BLOCK2z, %%SHUF_MASK
+
+ ;; Process 7 full blocks plus a partial block
+ cmp %%LENGTH, 128
+ jl %%_encrypt_by_8_partial
+
+%%_encrypt_by_8_parallel:
+ ;; in_order vs. out_order is an optimization to increment the counter
+ ;; without shuffling it back into little endian.
+ ;; %%CTR_CHECK keeps track of when we need to increment in order so
+ ;; that the carry is handled correctly.
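+        ;; Rough equivalent of the carry handling in the loop below
+        ;; (comment-only sketch):
+        ;;     ctr_check = (ctr_check & 255) + 8;
+        ;;     if (ctr_check >= 256 - 8)
+        ;;             /* low byte would wrap: shuffle to LE, add 8, shuffle
+        ;;                back to BE (the %%_encrypt_by_8 path)             */
+        ;;     else
+        ;;             ctr_be += ADD8BE;    /* BE add touches low byte only */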
+
+ vmovq %%CTR_CHECK, XWORD(%%CTR_BLOCK_SAVE)
+
+%%_encrypt_by_8_new:
+ and WORD(%%CTR_CHECK), 255
+ add WORD(%%CTR_CHECK), 8
+
+ vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)]
+ vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)]
+
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z,\
+ %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \
+ out_order, %%ENC_DEC, full, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \
+ %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \
+ %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \
+ %%MASKREG, no_reduction, %%GL, %%GH, %%GM
+
+ add %%HASHK_PTR, (8 * 16)
+ add %%DATA_OFFSET, 128
+ sub %%LENGTH, 128
+ jz %%_encrypt_done
+
+ cmp WORD(%%CTR_CHECK), (256 - 8)
+ jae %%_encrypt_by_8
+
+ vpaddd %%CTR_BLOCKz, %%ADD8BE
+ vpaddd %%CTR_BLOCK2z, %%ADD8BE
+
+ cmp %%LENGTH, 128
+ jl %%_encrypt_by_8_partial
+
+ jmp %%_encrypt_by_8_new
+
+%%_encrypt_by_8:
+ vpshufb %%CTR_BLOCKz, %%SHUF_MASK
+ vpshufb %%CTR_BLOCK2z, %%SHUF_MASK
+ vpaddd %%CTR_BLOCKz, %%ADD8LE
+ vpaddd %%CTR_BLOCK2z, %%ADD8LE
+ vpshufb %%CTR_BLOCKz, %%SHUF_MASK
+ vpshufb %%CTR_BLOCK2z, %%SHUF_MASK
+
+ cmp %%LENGTH, 128
+ jge %%_encrypt_by_8_new
+
+%%_encrypt_by_8_partial:
+ ;; Test to see if we need a by 8 with partial block. At this point
+ ;; bytes remaining should be either zero or between 113-127.
+ ;; 'in_order' shuffle needed to align key for partial block xor.
+ ;; 'out_order' is a little faster because it avoids extra shuffles.
+ ;; - counter blocks for the next 8 blocks are prepared and in BE format
+ ;; - we can go ahead with out_order scenario
+
+ vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)]
+ vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)]
+
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z, \
+ %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \
+ out_order, %%ENC_DEC, partial, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \
+ %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \
+ %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \
+ %%MASKREG, no_reduction, %%GL, %%GH, %%GM
+
+ add %%HASHK_PTR, (8 * 16)
+ add %%DATA_OFFSET, (128 - 16)
+ sub %%LENGTH, (128 - 16)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%AES_PARTIAL_BLOCK
+%endif
+
+%%_encrypt_done:
+ ;; Extract the last counter block in LE format
+ vextracti32x4 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCK2z, 3
+ vpshufb XWORD(%%CTR_BLOCK_SAVE), XWORD(%%SHUF_MASK)
+
+ ;; GHASH last cipher text blocks in xmm1-xmm8
+        ;; - if the 8th block is partial in a multi-call path then skip it
+%ifidn %%INSTANCE_TYPE, multi_call
+ cmp qword [%%GDATA_CTX + PBlockLen], 0
+ jz %%_hash_last_8
+
+ ;; save the 8th partial block as GHASH_LAST_7 will clobber %%BLK1
+ vextracti32x4 XWORD(%%ZTMP7), %%BLK1, 3
+
+ GHASH_LAST_7 %%GDATA_KEY, %%BLK1, %%BLK0, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \
+ %%AAD_HASHx, %%MASKREG, %%IA0, %%GH, %%GL, %%GM
+
+ ;; XOR the partial word into the hash
+ vpxorq %%AAD_HASHx, %%AAD_HASHx, XWORD(%%ZTMP7)
+ jmp %%_ghash_done
+%%_hash_last_8:
+%endif
+ GHASH_LAST_8 %%GDATA_KEY, %%BLK1, %%BLK0, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%AAD_HASHx, \
+ %%GH, %%GL, %%GM
+%%_ghash_done:
+ vmovdqu64 [%%GDATA_CTX + CurCount], XWORD(%%CTR_BLOCK_SAVE)
+ vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASHx
+%%_enc_dec_done:
+
+%endmacro ; GCM_ENC_DEC
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Encrypt/decrypt the initial 16 blocks
+%macro INITIAL_BLOCKS_16 22
+%define %%IN %1 ; [in] input buffer
+%define %%OUT %2 ; [in] output buffer
+%define %%KP %3 ; [in] pointer to expanded keys
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits)
+%define %%CTR %6 ; [in] ZMM with CTR BE blocks 4x128 bits
+%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check
+%define %%ADDBE_4x4 %8 ; [in] ZMM 4x128bits with value 4 (big endian)
+%define %%ADDBE_1234 %9 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
+%define %%T0            %10 ; [clobbered] temporary ZMM register
+%define %%T1            %11 ; [clobbered] temporary ZMM register
+%define %%T2            %12 ; [clobbered] temporary ZMM register
+%define %%T3            %13 ; [clobbered] temporary ZMM register
+%define %%T4            %14 ; [clobbered] temporary ZMM register
+%define %%T5            %15 ; [clobbered] temporary ZMM register
+%define %%T6            %16 ; [clobbered] temporary ZMM register
+%define %%T7            %17 ; [clobbered] temporary ZMM register
+%define %%T8            %18 ; [clobbered] temporary ZMM register
+%define %%SHUF_MASK %19 ; [in] ZMM with BE/LE shuffle mask
+%define %%ENC_DEC %20 ; [in] ENC (encrypt) or DEC (decrypt) selector
+%define %%BLK_OFFSET %21 ; [in] stack frame offset to ciphered blocks
+%define %%DATA_DISPL %22 ; [in] fixed numerical data displacement/offset
+
+%define %%B00_03 %%T5
+%define %%B04_07 %%T6
+%define %%B08_11 %%T7
+%define %%B12_15 %%T8
+
+%assign stack_offset (%%BLK_OFFSET)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; prepare counter blocks
+
+ cmp BYTE(%%CTR_CHECK), (256 - 16)
+ jae %%_next_16_overflow
+ vpaddd %%B00_03, %%CTR, %%ADDBE_1234
+ vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4
+ vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4
+ vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4
+ jmp %%_next_16_ok
+%%_next_16_overflow:
+ vpshufb %%CTR, %%CTR, %%SHUF_MASK
+ vmovdqa64 %%B12_15, [rel ddq_add_4444]
+ vpaddd %%B00_03, %%CTR, [rel ddq_add_1234]
+ vpaddd %%B04_07, %%B00_03, %%B12_15
+ vpaddd %%B08_11, %%B04_07, %%B12_15
+ vpaddd %%B12_15, %%B08_11, %%B12_15
+ vpshufb %%B00_03, %%SHUF_MASK
+ vpshufb %%B04_07, %%SHUF_MASK
+ vpshufb %%B08_11, %%SHUF_MASK
+ vpshufb %%B12_15, %%SHUF_MASK
+%%_next_16_ok:
+ vshufi64x2 %%CTR, %%B12_15, %%B12_15, 1111_1111b
+ add BYTE(%%CTR_CHECK), 16
+
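+        ;; Comment-only sketch: B00_03..B12_15 now hold counter+1..counter+16
+        ;; in big-endian form, roughly:
+        ;;     if ((ctr_check & 0xff) >= 256 - 16)        /* low byte wraps  */
+        ;;             blk[i] = bswap(bswap(ctr) + 1 + i); /* LE add + swap  */
+        ;;     else
+        ;;             blk[i] = ctr_be + ADDBE constants;  /* direct BE add  */
+        ;; the fast path is valid while only the lowest counter byte changes.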
+ ;; === load 16 blocks of data
+ VX512LDR %%T0, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*0)]
+ VX512LDR %%T1, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*1)]
+ VX512LDR %%T2, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*2)]
+ VX512LDR %%T3, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*3)]
+
+ ;; move to AES encryption rounds
+%assign i 0
+ vbroadcastf64x2 %%T4, [%%KP + (16*i)]
+ vpxorq %%B00_03, %%B00_03, %%T4
+ vpxorq %%B04_07, %%B04_07, %%T4
+ vpxorq %%B08_11, %%B08_11, %%T4
+ vpxorq %%B12_15, %%B12_15, %%T4
+%assign i (i + 1)
+
+%rep NROUNDS
+ vbroadcastf64x2 %%T4, [%%KP + (16*i)]
+ vaesenc %%B00_03, %%B00_03, %%T4
+ vaesenc %%B04_07, %%B04_07, %%T4
+ vaesenc %%B08_11, %%B08_11, %%T4
+ vaesenc %%B12_15, %%B12_15, %%T4
+%assign i (i + 1)
+%endrep
+
+ vbroadcastf64x2 %%T4, [%%KP + (16*i)]
+ vaesenclast %%B00_03, %%B00_03, %%T4
+ vaesenclast %%B04_07, %%B04_07, %%T4
+ vaesenclast %%B08_11, %%B08_11, %%T4
+ vaesenclast %%B12_15, %%B12_15, %%T4
+
+ ;; xor against text
+ vpxorq %%B00_03, %%B00_03, %%T0
+ vpxorq %%B04_07, %%B04_07, %%T1
+ vpxorq %%B08_11, %%B08_11, %%T2
+ vpxorq %%B12_15, %%B12_15, %%T3
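+        ;; Comment-only sketch of the per-block CTR flow above, assuming
+        ;; NROUNDS expands to 9/11/13 for AES-128/192/256 round keys rk[]:
+        ;;     state = counter_be ^ rk[0];
+        ;;     for (r = 1; r <= NROUNDS; r++)
+        ;;             state = aesenc(state, rk[r]);
+        ;;     keystream = aesenclast(state, rk[NROUNDS + 1]);
+        ;;     out = in ^ keystream;        /* same path for ENC and DEC */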
+
+ ;; store
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*0)], %%B00_03
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*1)], %%B04_07
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*2)], %%B08_11
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*3)], %%B12_15
+
+%ifidn %%ENC_DEC, DEC
+ ;; decryption - cipher text needs to go to GHASH phase
+ vpshufb %%B00_03, %%T0, %%SHUF_MASK
+ vpshufb %%B04_07, %%T1, %%SHUF_MASK
+ vpshufb %%B08_11, %%T2, %%SHUF_MASK
+ vpshufb %%B12_15, %%T3, %%SHUF_MASK
+%else
+ ;; encryption
+ vpshufb %%B00_03, %%B00_03, %%SHUF_MASK
+ vpshufb %%B04_07, %%B04_07, %%SHUF_MASK
+ vpshufb %%B08_11, %%B08_11, %%SHUF_MASK
+ vpshufb %%B12_15, %%B12_15, %%SHUF_MASK
+%endif
+
+%ifnidn %%GHASH, no_ghash
+ ;; === xor cipher block 0 with GHASH for the next GHASH round
+ vpxorq %%B00_03, %%B00_03, %%GHASH
+%endif
+
+ vmovdqa64 [rsp + stack_offset + (0 * 64)], %%B00_03
+ vmovdqa64 [rsp + stack_offset + (1 * 64)], %%B04_07
+ vmovdqa64 [rsp + stack_offset + (2 * 64)], %%B08_11
+ vmovdqa64 [rsp + stack_offset + (3 * 64)], %%B12_15
+%endmacro ;INITIAL_BLOCKS_16
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Encrypt/decrypt the initial N blocks (N is a multiple of 16)
+;;; - the first A blocks are ciphered only (A = pipeline depth)
+;;; - the next B blocks are ciphered while the previously ciphered blocks
+;;;   are ghashed, 16 blocks at a time
+;;; - A + B = N; the last A cipher blocks are left for a later GHASH pass
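+;;; Comment-only sketch of the schedule (A = %%DEPTH_BLK, N = %%NBLOCKS):
+;;;     cipher blocks [0, A)                          /* AES only, lead-in */
+;;;     for (i = A; i < N; i += 16)
+;;;             cipher blocks [i, i+16) and GHASH blocks [i-A, i-A+16)
+;;;     /* blocks [N-A, N) stay on the stack and are GHASHed later */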
+%macro INITIAL_BLOCKS_Nx16 39
+%define %%IN %1 ; [in] input buffer
+%define %%OUT %2 ; [in] output buffer
+%define %%KP %3 ; [in] pointer to expanded keys
+%define %%DATA_OFFSET %4 ; [in/out] data offset
+%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits)
+%define %%CTR %6 ; [in/out] ZMM with CTR: in - LE & 128b; out - BE & 4x128b
+%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check
+%define %%T0  %8  ; [clobbered] temporary ZMM register
+%define %%T1  %9  ; [clobbered] temporary ZMM register
+%define %%T2  %10 ; [clobbered] temporary ZMM register
+%define %%T3  %11 ; [clobbered] temporary ZMM register
+%define %%T4  %12 ; [clobbered] temporary ZMM register
+%define %%T5  %13 ; [clobbered] temporary ZMM register
+%define %%T6  %14 ; [clobbered] temporary ZMM register
+%define %%T7  %15 ; [clobbered] temporary ZMM register
+%define %%T8  %16 ; [clobbered] temporary ZMM register
+%define %%T9  %17 ; [clobbered] temporary ZMM register
+%define %%T10 %18 ; [clobbered] temporary ZMM register
+%define %%T11 %19 ; [clobbered] temporary ZMM register
+%define %%T12 %20 ; [clobbered] temporary ZMM register
+%define %%T13 %21 ; [clobbered] temporary ZMM register
+%define %%T14 %22 ; [clobbered] temporary ZMM register
+%define %%T15 %23 ; [clobbered] temporary ZMM register
+%define %%T16 %24 ; [clobbered] temporary ZMM register
+%define %%T17 %25 ; [clobbered] temporary ZMM register
+%define %%T18 %26 ; [clobbered] temporary ZMM register
+%define %%T19 %27 ; [clobbered] temporary ZMM register
+%define %%T20 %28 ; [clobbered] temporary ZMM register
+%define %%T21 %29 ; [clobbered] temporary ZMM register
+%define %%T22 %30 ; [clobbered] temporary ZMM register
+%define %%GH %31 ; [out] ZMM ghash sum (high)
+%define %%GL %32 ; [out] ZMM ghash sum (low)
+%define %%GM %33 ; [out] ZMM ghash sum (middle)
+%define %%ADDBE_4x4 %34 ; [in] ZMM 4x128bits with value 4 (big endian)
+%define %%ADDBE_1234 %35 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
+%define %%SHUF_MASK %36 ; [in] ZMM with BE/LE shuffle mask
+%define %%ENC_DEC %37 ; [in] ENC (encrypt) or DEC (decrypt) selector
+%define %%NBLOCKS %38 ; [in] number of blocks: multiple of 16
+%define %%DEPTH_BLK %39 ; [in] pipeline depth, number of blocks (multiple of 16)
+
+%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign hkey_offset HashKey_ %+ %%NBLOCKS
+%assign data_in_out_offset 0
+
+ ;; set up CTR_CHECK
+ vmovd DWORD(%%CTR_CHECK), XWORD(%%CTR)
+ and DWORD(%%CTR_CHECK), 255
+
+ ;; in LE format after init, convert to BE
+ vshufi64x2 %%CTR, %%CTR, %%CTR, 0
+ vpshufb %%CTR, %%CTR, %%SHUF_MASK
+
+ ;; ==== AES lead in
+
+ ;; first 16 blocks - just cipher
+ INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \
+ %%GHASH, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \
+ %%T0, %%T1, %%T2, %%T3, %%T4, \
+ %%T5, %%T6, %%T7, %%T8, \
+ %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+
+%if (%%DEPTH_BLK > 16)
+%rep ((%%DEPTH_BLK - 16) / 16)
+ INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \
+ no_ghash, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \
+ %%T0, %%T1, %%T2, %%T3, %%T4, \
+ %%T5, %%T6, %%T7, %%T8, \
+ %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%endrep
+%endif
+
+ ;; ==== GHASH + AES follows
+
+ ;; first 16 blocks stitched
+ GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \
+ %%T0, %%T1, %%T2, %%T3, \
+ %%T4, %%T5, %%T6, %%T7, \
+ %%T8, %%T9, %%T10, %%T11,\
+ %%T12, %%T13, %%T14, %%T15,\
+ %%T16, %%T17, %%T18, %%T19, \
+ %%T20, %%T21, %%T22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GL, %%GH, %%GM, \
+ first_time, %%ENC_DEC, data_in_out_offset, no_ghash_in
+
+%if ((%%NBLOCKS - %%DEPTH_BLK) > 16)
+%rep ((%%NBLOCKS - %%DEPTH_BLK - 16) / 16)
+%assign ghashin_offset (ghashin_offset + (16 * 16))
+%assign hkey_offset (hkey_offset + (16 * 16))
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+
+ ;; mid 16 blocks - stitched
+ GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \
+ %%T0, %%T1, %%T2, %%T3, \
+ %%T4, %%T5, %%T6, %%T7, \
+ %%T8, %%T9, %%T10, %%T11,\
+ %%T12, %%T13, %%T14, %%T15,\
+ %%T16, %%T17, %%T18, %%T19, \
+ %%T20, %%T21, %%T22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GL, %%GH, %%GM, \
+ no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+%endrep
+%endif
+ add %%DATA_OFFSET, (%%NBLOCKS * 16)
+
+%endmacro ;INITIAL_BLOCKS_Nx16
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; GHASH the last %%DEPTH_BLK blocks of cipher text (tail of the big/very_big loop code)
+%macro GHASH_LAST_Nx16 23
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%GHASH %2 ; [out] ghash output
+%define %%T1 %3 ; [clobbered] temporary ZMM
+%define %%T2 %4 ; [clobbered] temporary ZMM
+%define %%T3 %5 ; [clobbered] temporary ZMM
+%define %%T4 %6 ; [clobbered] temporary ZMM
+%define %%T5 %7 ; [clobbered] temporary ZMM
+%define %%T6 %8 ; [clobbered] temporary ZMM
+%define %%T7 %9 ; [clobbered] temporary ZMM
+%define %%T8 %10 ; [clobbered] temporary ZMM
+%define %%T9 %11 ; [clobbered] temporary ZMM
+%define %%T10 %12 ; [clobbered] temporary ZMM
+%define %%T11 %13 ; [clobbered] temporary ZMM
+%define %%T12 %14 ; [clobbered] temporary ZMM
+%define %%T13 %15 ; [clobbered] temporary ZMM
+%define %%T14 %16 ; [clobbered] temporary ZMM
+%define %%T15 %17 ; [clobbered] temporary ZMM
+%define %%T16 %18 ; [clobbered] temporary ZMM
+%define %%GH  %19 ; [in/clobbered] ghash sum (high)
+%define %%GL  %20 ; [in/clobbered] ghash sum (low)
+%define %%GM  %21 ; [in/clobbered] ghash sum (medium)
+%define %%LOOP_BLK  %22 ; [in] number of blocks handled by the loop (numerical)
+%define %%DEPTH_BLK %23 ; [in] pipeline depth in blocks (ghash vs aes, numerical)
+
+%define %%T0H %%T1
+%define %%T0L %%T2
+%define %%T0M1 %%T3
+%define %%T0M2 %%T4
+
+%define %%T1H %%T5
+%define %%T1L %%T6
+%define %%T1M1 %%T7
+%define %%T1M2 %%T8
+
+%define %%T2H %%T9
+%define %%T2L %%T10
+%define %%T2M1 %%T11
+%define %%T2M2 %%T12
+
+%define %%BLK1 %%T13
+%define %%BLK2 %%T14
+
+%define %%HK1 %%T15
+%define %%HK2 %%T16
+
+%assign hashk HashKey_ %+ %%DEPTH_BLK
+%assign cipher_blk (STACK_LOCAL_OFFSET + ((%%LOOP_BLK - %%DEPTH_BLK) * 16))
+
+ ;; load cipher blocks and ghash keys
+ vmovdqa64 %%BLK1, [rsp + cipher_blk]
+ vmovdqa64 %%BLK2, [rsp + cipher_blk + 64]
+ vmovdqu64 %%HK1, [%%KP + hashk]
+ vmovdqu64 %%HK2, [%%KP + hashk + 64]
+ ;; ghash blocks 0-3
+ vpclmulqdq %%T0H, %%BLK1, %%HK1, 0x11 ; %%TH = a1*b1
+ vpclmulqdq %%T0L, %%BLK1, %%HK1, 0x00 ; %%TL = a0*b0
+ vpclmulqdq %%T0M1, %%BLK1, %%HK1, 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq %%T0M2, %%BLK1, %%HK1, 0x10 ; %%TM2 = a0*b1
+ ;; ghash blocks 4-7
+ vpclmulqdq %%T1H, %%BLK2, %%HK2, 0x11 ; %%TTH = a1*b1
+ vpclmulqdq %%T1L, %%BLK2, %%HK2, 0x00 ; %%TTL = a0*b0
+ vpclmulqdq %%T1M1, %%BLK2, %%HK2, 0x01 ; %%TTM1 = a1*b0
+ vpclmulqdq %%T1M2, %%BLK2, %%HK2, 0x10 ; %%TTM2 = a0*b1
+ vpternlogq %%T0H, %%T1H, %%GH, 0x96 ; T0H = T0H + T1H + GH
+ vpternlogq %%T0L, %%T1L, %%GL, 0x96 ; T0L = T0L + T1L + GL
+ vpternlogq %%T0M1, %%T1M1, %%GM, 0x96 ; T0M1 = T0M1 + T1M1 + GM
+ vpxorq %%T0M2, %%T0M2, %%T1M2 ; T0M2 = T0M2 + T1M2
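+        ;; Comment-only sketch of the multiply/accumulate scheme above, for a
+        ;; cipher block a = a1:a0 and hash key power b = b1:b0 (64-bit halves):
+        ;;     H += a1*b1;  L += a0*b0;  M += a1*b0 + a0*b1;   /* carry-less */
+        ;; vpternlogq with imm8 0x96 is a three-way XOR, so the running GH/GL/GM
+        ;; sums are folded in for free; a single reduction happens at the end.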
+
+%rep ((%%DEPTH_BLK - 8) / 8)
+%assign hashk (hashk + 128)
+%assign cipher_blk (cipher_blk + 128)
+
+ ;; remaining blocks
+ ;; load next 8 cipher blocks and corresponding ghash keys
+ vmovdqa64 %%BLK1, [rsp + cipher_blk]
+ vmovdqa64 %%BLK2, [rsp + cipher_blk + 64]
+ vmovdqu64 %%HK1, [%%KP + hashk]
+ vmovdqu64 %%HK2, [%%KP + hashk + 64]
+ ;; ghash blocks 0-3
+ vpclmulqdq %%T1H, %%BLK1, %%HK1, 0x11 ; %%TH = a1*b1
+ vpclmulqdq %%T1L, %%BLK1, %%HK1, 0x00 ; %%TL = a0*b0
+ vpclmulqdq %%T1M1, %%BLK1, %%HK1, 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq %%T1M2, %%BLK1, %%HK1, 0x10 ; %%TM2 = a0*b1
+ ;; ghash blocks 4-7
+ vpclmulqdq %%T2H, %%BLK2, %%HK2, 0x11 ; %%TTH = a1*b1
+ vpclmulqdq %%T2L, %%BLK2, %%HK2, 0x00 ; %%TTL = a0*b0
+ vpclmulqdq %%T2M1, %%BLK2, %%HK2, 0x01 ; %%TTM1 = a1*b0
+ vpclmulqdq %%T2M2, %%BLK2, %%HK2, 0x10 ; %%TTM2 = a0*b1
+ ;; update sums
+ vpternlogq %%T0H, %%T1H, %%T2H, 0x96 ; TH = T0H + T1H + T2H
+ vpternlogq %%T0L, %%T1L, %%T2L, 0x96 ; TL = T0L + T1L + T2L
+        vpternlogq      %%T0M1, %%T1M1, %%T2M1, 0x96    ; TM1 = T0M1 + T1M1 + T2M1
+        vpternlogq      %%T0M2, %%T1M2, %%T2M2, 0x96    ; TM2 = T0M2 + T1M2 + T2M2
+%endrep
+
+ ;; integrate TM into TH and TL
+ vpxorq %%T0M1, %%T0M1, %%T0M2
+ vpsrldq %%T1M1, %%T0M1, 8
+ vpslldq %%T1M2, %%T0M1, 8
+ vpxorq %%T0H, %%T0H, %%T1M1
+ vpxorq %%T0L, %%T0L, %%T1M2
+
+ ;; add TH and TL 128-bit words horizontally
+ VHPXORI4x128 %%T0H, %%T2M1
+ VHPXORI4x128 %%T0L, %%T2M2
+
+ ;; reduction
+ vmovdqa64 %%HK1, [rel POLY2]
+ VCLMUL_REDUCE %%GHASH, %%HK1, %%T0H, %%T0L, %%T0M1, %%T0M2
+%endmacro
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Encrypt & ghash multiples of 16 blocks
+
+%macro GHASH_ENCRYPT_Nx16_PARALLEL 39
+%define %%IN %1 ; [in] input buffer
+%define %%OUT %2 ; [in] output buffer
+%define %%GDATA_KEY %3 ; [in] pointer to expanded keys
+%define %%DATA_OFFSET %4 ; [in/out] data offset
+%define %%CTR_BE %5 ; [in/out] ZMM last counter block
+%define %%SHFMSK %6 ; [in] ZMM with byte swap mask for pshufb
+%define %%ZT0  %7  ; [clobbered] temporary ZMM register
+%define %%ZT1  %8  ; [clobbered] temporary ZMM register
+%define %%ZT2  %9  ; [clobbered] temporary ZMM register
+%define %%ZT3  %10 ; [clobbered] temporary ZMM register
+%define %%ZT4  %11 ; [clobbered] temporary ZMM register
+%define %%ZT5  %12 ; [clobbered] temporary ZMM register
+%define %%ZT6  %13 ; [clobbered] temporary ZMM register
+%define %%ZT7  %14 ; [clobbered] temporary ZMM register
+%define %%ZT8  %15 ; [clobbered] temporary ZMM register
+%define %%ZT9  %16 ; [clobbered] temporary ZMM register
+%define %%ZT10 %17 ; [clobbered] temporary ZMM register
+%define %%ZT11 %18 ; [clobbered] temporary ZMM register
+%define %%ZT12 %19 ; [clobbered] temporary ZMM register
+%define %%ZT13 %20 ; [clobbered] temporary ZMM register
+%define %%ZT14 %21 ; [clobbered] temporary ZMM register
+%define %%ZT15 %22 ; [clobbered] temporary ZMM register
+%define %%ZT16 %23 ; [clobbered] temporary ZMM register
+%define %%ZT17 %24 ; [clobbered] temporary ZMM register
+%define %%ZT18 %25 ; [clobbered] temporary ZMM register
+%define %%ZT19 %26 ; [clobbered] temporary ZMM register
+%define %%ZT20 %27 ; [clobbered] temporary ZMM register
+%define %%ZT21 %28 ; [clobbered] temporary ZMM register
+%define %%ZT22 %29 ; [clobbered] temporary ZMM register
+%define %%GTH %30 ; [in/out] ZMM GHASH sum (high)
+%define %%GTL %31 ; [in/out] ZMM GHASH sum (low)
+%define %%GTM %32 ; [in/out] ZMM GHASH sum (medium)
+%define %%ADDBE_4x4 %33 ; [in] ZMM 4x128bits with value 4 (big endian)
+%define %%ADDBE_1234 %34 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
+%define %%GHASH %35 ; [clobbered] ZMM with intermediate GHASH value
+%define %%ENC_DEC %36 ; [in] ENC (encrypt) or DEC (decrypt) selector
+%define %%NUM_BLOCKS %37 ; [in] number of blocks to process in the loop
+%define %%DEPTH_BLK %38 ; [in] pipeline depth in blocks
+%define %%CTR_CHECK %39 ; [in/out] counter to check byte overflow
+
+%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign ghashin_offset (STACK_LOCAL_OFFSET + ((%%NUM_BLOCKS - %%DEPTH_BLK) * 16))
+%assign hkey_offset HashKey_ %+ %%DEPTH_BLK
+%assign data_in_out_offset 0
+
+ ;; mid 16 blocks
+%if (%%DEPTH_BLK > 16)
+%rep ((%%DEPTH_BLK - 16) / 16)
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign ghashin_offset (ghashin_offset + (16 * 16))
+%assign hkey_offset (hkey_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%endrep
+%endif
+
+ ;; 16 blocks with reduction
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ HashKey_16, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ final_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign hkey_offset HashKey_ %+ %%NUM_BLOCKS
+
+ ;; === xor cipher block 0 with GHASH (ZT4)
+ vmovdqa64 %%GHASH, %%ZT4
+
+ ;; start the pipeline again
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ first_time, %%ENC_DEC, data_in_out_offset, %%GHASH
+
+%if ((%%NUM_BLOCKS - %%DEPTH_BLK) > 16)
+%rep ((%%NUM_BLOCKS - %%DEPTH_BLK - 16 ) / 16)
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%assign ghashin_offset (ghashin_offset + (16 * 16))
+%assign hkey_offset (hkey_offset + (16 * 16))
+
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+%endrep
+%endif
+
+ add %%DATA_OFFSET, (%%NUM_BLOCKS * 16)
+
+%endmacro ;GHASH_ENCRYPT_Nx16_PARALLEL
+;;; ===========================================================================
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE finishes encryption/decryption of the last partial block after GCM_UPDATE finishes.
+; Input: A gcm_key_data * (GDATA_KEY), a gcm_context_data * (GDATA_CTX) and whether encrypting or decrypting (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%INSTANCE_TYPE %6
+%define %%PLAIN_CYPH_LEN rax
+
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ ;; Start AES as early as possible
+ vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+        ;; If the GCM function is called as a single function call rather
+        ;; than by invoking the individual parts (init, update, finalize),
+        ;; the write-to-read dependency on AadHash can be removed.
+ vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+
+        ;; GHASH the final partial block. If this was a single call then
+        ;; the partial block was already handled in the main GCM_ENC_DEC macro.
+ mov r12, [%%GDATA_CTX + PBlockLen]
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+%endif
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+        shl     %%PLAIN_CYPH_LEN, 3             ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
+        vpshufb xmm14, [rel SHUF_MASK]          ; perform a 16-byte swap
+
+ vpxor xmm9, xmm9, xmm14
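+        ;; Comment-only sketch of the tag math above (lengths in bits):
+        ;;     S   = GHASH(H, AAD, C, len(A) || len(C))    ; accumulated in xmm14
+        ;;     tag = E(K, Y0) ^ byteswap(S)                ; xmm9
+        ;; the tag is then truncated to AUTH_TAG_LEN bytes below.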
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+ cmp r11, 8
+ je %%_T_8
+
+ simd_store_avx_15 r10, xmm9, r11, r12, rax
+ jmp %%_return_T_done
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+
+%ifdef SAFE_DATA
+ ;; Clear sensitive data from context structure
+ vpxor xmm0, xmm0
+ vmovdqu [%%GDATA_CTX + AadHash], xmm0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0
+%endif
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_vaes_avx512 /
+; aes_gcm_precomp_192_vaes_avx512 /
+; aes_gcm_precomp_256_vaes_avx512
+; (struct gcm_key_data *key_data)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(precomp,_),function,)
+FN_NAME(precomp,_):
+;; Parameter is passed through register
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_precomp
+%endif
+
+ FUNC_SAVE
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [rel SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, xmm6, 1
+ vpsrlq xmm2, xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [rel TWOONE]
+ vpand xmm2, xmm2, [rel POLY]
+ vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
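+        ;; Comment-only sketch of the branch-free doubling above (bit-reflected
+        ;; GHASH domain):
+        ;;     carry = H >> 127;            /* MSB before the shift          */
+        ;;     H     = H << 1;
+        ;;     if (carry)
+        ;;             H ^= POLY;           /* conditional reduction         */
+        ;; vpcmpeqd/vpand against TWOONE/POLY implement the 'if' without a jump.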
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+ FUNC_RESTORE
+exit_precomp:
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_vaes_avx512 / aes_gcm_init_192_vaes_avx512 / aes_gcm_init_256_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(init,_),function,)
+FN_NAME(init,_):
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_init
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_init
+
+ ;; Check IV != NULL
+ cmp arg3, 0
+ jz exit_init
+
+ ;; Check if aad_len == 0
+ cmp arg5, 0
+ jz skip_aad_check_init
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg4, 0
+ jz exit_init
+
+skip_aad_check_init:
+%endif
+ GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12, k1, xmm14, xmm2, \
+ zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10
+
+exit_init:
+
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_vaes_avx512 / aes_gcm_enc_192_update_vaes_avx512 /
+; aes_gcm_enc_256_update_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_update_),function,)
+FN_NAME(enc,_update_):
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_enc
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_update_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_enc
+
+skip_in_out_check_update_enc:
+%endif
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
+
+exit_update_enc:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_vaes_avx512 / aes_gcm_dec_192_update_vaes_avx512 /
+; aes_gcm_dec_256_update_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_update_),function,)
+FN_NAME(dec,_update_):
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_dec
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_update_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_dec
+
+skip_in_out_check_update_dec:
+%endif
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
+
+exit_update_dec:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_vaes_avx512 / aes_gcm_enc_192_finalize_vaes_avx512 /
+; aes_gcm_enc_256_finalize_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_finalize_),function,)
+FN_NAME(enc,_finalize_):
+
+;; All parameters are passed through registers
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_enc_fin
+
+ cmp arg4, 16
+ ja exit_enc_fin
+%endif
+
+ FUNC_SAVE
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
+
+ FUNC_RESTORE
+
+exit_enc_fin:
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_vaes_avx512 / aes_gcm_dec_192_finalize_vaes_avx512
+; aes_gcm_dec_256_finalize_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_finalize_),function,)
+FN_NAME(dec,_finalize_):
+
+;; All parameters are passed through registers
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_dec_fin
+
+ cmp arg4, 16
+ ja exit_dec_fin
+%endif
+
+ FUNC_SAVE
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
+
+ FUNC_RESTORE
+
+exit_dec_fin:
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_vaes_avx512 / aes_gcm_enc_192_vaes_avx512 / aes_gcm_enc_256_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_),function,)
+FN_NAME(enc,_):
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_enc
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_enc
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_enc
+
+ cmp arg10, 16
+ ja exit_enc
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_enc
+
+skip_in_out_check_enc:
+ ;; Check if aad_len == 0
+ cmp arg8, 0
+ jz skip_aad_check_enc
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_enc
+
+skip_aad_check_enc:
+%endif
+ GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \
+ zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call
+
+exit_enc:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_vaes_avx512 / aes_gcm_dec_192_vaes_avx512 / aes_gcm_dec_256_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_),function,)
+FN_NAME(dec,_):
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_dec
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_dec
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_dec
+
+ cmp arg10, 16
+ ja exit_dec
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_dec
+
+skip_in_out_check_dec:
+ ;; Check if aad_len == 0
+ cmp arg8, 0
+ jz skip_aad_check_dec
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_dec
+
+skip_aad_check_dec:
+%endif
+ GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \
+ zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call
+
+exit_dec:
+ FUNC_RESTORE
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_flush_avx512.asm
new file mode 100644
index 000000000..449229531
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_flush_avx512.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X16 aes_cbc_enc_192_vaes_avx512
+%define FLUSH_JOB_AES_ENC flush_job_aes192_enc_vaes_avx512
+%define NUM_KEYS 13
+%include "avx512/mb_mgr_aes_flush_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_submit_avx512.asm
new file mode 100644
index 000000000..3bbb30158
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_submit_avx512.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X16 aes_cbc_enc_192_vaes_avx512
+%define SUBMIT_JOB_AES_ENC submit_job_aes192_enc_vaes_avx512
+%define NUM_KEYS 13
+%include "avx512/mb_mgr_aes_submit_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_flush_avx512.asm
new file mode 100644
index 000000000..2ff448393
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_flush_avx512.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X16 aes_cbc_enc_256_vaes_avx512
+%define FLUSH_JOB_AES_ENC flush_job_aes256_enc_vaes_avx512
+%define NUM_KEYS 15
+%include "avx512/mb_mgr_aes_flush_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_submit_avx512.asm
new file mode 100644
index 000000000..4db4629e2
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_submit_avx512.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X16 aes_cbc_enc_256_vaes_avx512
+%define SUBMIT_JOB_AES_ENC submit_job_aes256_enc_vaes_avx512
+%define NUM_KEYS 15
+%include "avx512/mb_mgr_aes_submit_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_flush_avx512.asm
new file mode 100644
index 000000000..4a52ed1e6
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_flush_avx512.asm
@@ -0,0 +1,320 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "constants.asm"
+%include "include/reg_sizes.asm"
+
+%ifndef AES_CBC_ENC_X16
+%define AES_CBC_ENC_X16 aes_cbc_enc_128_vaes_avx512
+%define FLUSH_JOB_AES_ENC flush_job_aes128_enc_vaes_avx512
+%define NUM_KEYS 11
+%endif
+
+; void AES_CBC_ENC_X16(AES_ARGS *args, UINT64 len_in_bytes);
+extern AES_CBC_ENC_X16
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+%define unused_lanes rbx
+%define tmp1 rbx
+
+%define good_lane rdx
+%define iv rdx
+
+%define tmp2 rax
+
+; idx needs to be in rbp
+%define tmp rbp
+%define idx rbp
+
+%define tmp3 r8
+%define tmp4 r9
+%endif
+
+; copy IV into NULL lanes
+%macro COPY_IV_TO_NULL_LANES 4
+%define %%IDX %1 ; [in] GP with good lane idx (scaled x16)
+%define %%NULL_MASK %2 ; [clobbered] GP to store NULL lane mask
+%define %%XTMP %3 ; [clobbered] temp XMM reg
+%define %%MASK_REG %4 ; [in] mask register
+
+ vmovdqa64 %%XTMP, [state + _aes_args_IV + %%IDX]
+ kmovw DWORD(%%NULL_MASK), %%MASK_REG
+%assign i 0
+%rep 16
+ bt %%NULL_MASK, i
+ jnc %%_skip_copy %+ i
+ vmovdqa64 [state + _aes_args_IV + (i*16)], %%XTMP
+%%_skip_copy %+ i:
+%assign i (i + 1)
+%endrep
+
+%endmacro
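+
+; Comment-only sketch of the replicate-to-null-lanes pattern used by the
+; macros in this file (IDX is the good lane offset, already scaled by 16;
+; args_iv is a stand-in name for the _aes_args_IV table):
+;     for (lane = 0; lane < 16; lane++)
+;             if (null_mask & (1u << lane))
+;                     memcpy(&args_iv[lane * 16], &args_iv[IDX], 16);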
+
+; clear IV in NULL lanes
+%macro CLEAR_IV_IN_NULL_LANES 3
+%define %%NULL_MASK %1 ; [clobbered] GP to store NULL lane mask
+%define %%XTMP %2 ; [clobbered] temp XMM reg
+%define %%MASK_REG %3 ; [in] mask register
+
+ vpxorq %%XTMP, %%XTMP
+ kmovw DWORD(%%NULL_MASK), %%MASK_REG
+%assign i 0
+%rep 16
+ bt %%NULL_MASK, i
+ jnc %%_skip_clear %+ i
+ vmovdqa64 [state + _aes_args_IV + (i*16)], %%XTMP
+%%_skip_clear %+ i:
+%assign i (i + 1)
+%endrep
+
+%endmacro
+
+; copy round keys into NULL lanes
+%macro COPY_KEYS_TO_NULL_LANES 5
+%define %%IDX %1 ; [in] GP with good lane idx (scaled x16)
+%define %%NULL_MASK %2 ; [clobbered] GP to store NULL lane mask
+%define %%KEY_TAB %3 ; [clobbered] GP to store key table pointer
+%define %%XTMP %4 ; [clobbered] temp XMM reg
+%define %%MASK_REG %5 ; [in] mask register
+
+ lea %%KEY_TAB, [state + _aes_args_key_tab]
+ kmovw DWORD(%%NULL_MASK), %%MASK_REG
+%assign j 0 ; outer loop to iterate through round keys
+%rep 15
+ vmovdqa64 %%XTMP, [%%KEY_TAB + j + %%IDX]
+%assign k 0 ; inner loop to iterate through lanes
+%rep 16
+ bt %%NULL_MASK, k
+ jnc %%_skip_copy %+ j %+ _ %+ k
+ vmovdqa64 [%%KEY_TAB + j + (k*16)], %%XTMP
+%%_skip_copy %+ j %+ _ %+ k:
+%assign k (k + 1)
+%endrep
+
+%assign j (j + 256)
+%endrep
+
+%endmacro
+
+; clear round keys in NULL lanes
+%macro CLEAR_KEYS_IN_NULL_LANES 3
+%define %%NULL_MASK %1 ; [clobbered] GP to store NULL lane mask
+%define %%XTMP %2 ; [clobbered] temp XMM reg
+%define %%MASK_REG %3 ; [in] mask register
+
+ vpxorq %%XTMP, %%XTMP
+ kmovw DWORD(%%NULL_MASK), %%MASK_REG
+%assign k 0 ; outer loop to iterate through lanes
+%rep 16
+ bt %%NULL_MASK, k
+ jnc %%_skip_clear %+ k
+%assign j 0 ; inner loop to iterate through round keys
+%rep NUM_KEYS
+ vmovdqa64 [state + _aesarg_key_tab + j + (k*16)], %%XTMP
+%assign j (j + 256)
+%endrep
+%%_skip_clear %+ k:
+%assign k (k + 1)
+%endrep
+
+%endmacro
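
The NULL-lane macros above share one pattern: read the NULL-lane bitmask out of a k register, bit-test each of the 16 lanes, and copy (or zero) the flagged lane's 16-byte slot. A minimal C sketch of the same idea, assuming an illustrative uint8_t iv[16][16] layout (the names here are hypothetical, not the library's):

    #include <stdint.h>
    #include <string.h>

    /* Sketch only: replicate the good lane's IV into every NULL lane,
     * where bit i of null_mask is set when lane i holds no job. */
    static void copy_iv_to_null_lanes(uint8_t iv[16][16], unsigned good_lane,
                                      uint16_t null_mask)
    {
            for (unsigned i = 0; i < 16; i++)
                    if (null_mask & (1u << i))
                            memcpy(iv[i], iv[good_lane], 16);
    }
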
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal)
+FLUSH_JOB_AES_ENC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ; check for empty
+ cmp qword [state + _aes_lanes_in_use], 0
+ je return_null
+
+ ; find a lane with a non-null job
+ vpxord zmm0, zmm0, zmm0
+ vmovdqu64 zmm1, [state + _aes_job_in_lane + (0*PTR_SZ)]
+ vmovdqu64 zmm2, [state + _aes_job_in_lane + (8*PTR_SZ)]
+ vpcmpq k1, zmm1, zmm0, 4 ; NEQ
+ vpcmpq k2, zmm2, zmm0, 4 ; NEQ
+ kmovw DWORD(tmp), k1
+ kmovw DWORD(tmp1), k2
+ mov DWORD(tmp2), DWORD(tmp1)
+ shl DWORD(tmp2), 8
+ or DWORD(tmp2), DWORD(tmp) ; mask of non-null jobs in tmp2
+ not BYTE(tmp)
+ kmovw k4, DWORD(tmp)
+ not BYTE(tmp1)
+ kmovw k5, DWORD(tmp1)
+ mov DWORD(tmp), DWORD(tmp2)
+ not WORD(tmp)
+ kmovw k6, DWORD(tmp) ; mask of NULL jobs in k4, k5 and k6
+ mov DWORD(tmp), DWORD(tmp2)
+ xor tmp2, tmp2
+        bsf     WORD(tmp2), WORD(tmp)   ; tmp2 = index of the 1st set bit in tmp
+
+        ;; copy good lane data into NULL lanes
+        ;; - in pointer
+ mov tmp, [state + _aes_args_in + tmp2*8]
+ vpbroadcastq zmm1, tmp
+ vmovdqa64 [state + _aes_args_in + (0*PTR_SZ)]{k4}, zmm1
+ vmovdqa64 [state + _aes_args_in + (8*PTR_SZ)]{k5}, zmm1
+ ;; - out pointer
+ mov tmp, [state + _aes_args_out + tmp2*8]
+ vpbroadcastq zmm1, tmp
+ vmovdqa64 [state + _aes_args_out + (0*PTR_SZ)]{k4}, zmm1
+ vmovdqa64 [state + _aes_args_out + (8*PTR_SZ)]{k5}, zmm1
+
+ ;; - set len to UINT16_MAX
+ mov WORD(tmp), 0xffff
+ vpbroadcastw ymm3, WORD(tmp)
+ vmovdqa64 ymm0, [state + _aes_lens]
+ vmovdqu16 ymm0{k6}, ymm3
+ vmovdqa64 [state + _aes_lens], ymm0
+
+ ;; Find min length for lanes 0-7
+ vphminposuw xmm2, xmm0
+
+ ;; scale up good lane idx before copying IV and keys
+ shl tmp2, 4
+ ;; - copy IV to null lanes
+ COPY_IV_TO_NULL_LANES tmp2, tmp1, xmm4, k6
+
+ ; extract min length of lanes 0-7
+ vpextrw DWORD(len2), xmm2, 0 ; min value
+ vpextrw DWORD(idx), xmm2, 1 ; min index
+
+ ;; - copy round keys to null lanes
+ COPY_KEYS_TO_NULL_LANES tmp2, tmp1, tmp3, xmm4, k6
+
+ ;; Update lens and find min for lanes 8-15
+ vextracti128 xmm1, ymm0, 1
+ vphminposuw xmm2, xmm1
+ vpextrw DWORD(tmp3), xmm2, 0 ; min value
+ cmp DWORD(len2), DWORD(tmp3)
+ jle use_min
+ vpextrw DWORD(idx), xmm2, 1 ; min index
+        add     DWORD(idx), 8           ; index is relative to lanes 8-15
+ mov len2, tmp3 ; min len
+use_min:
+ vpbroadcastw ymm3, WORD(len2)
+ vpsubw ymm0, ymm0, ymm3
+ vmovdqa [state + _aes_lens], ymm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_CBC_ENC_X16
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ mov job_rax, [state + _aes_job_in_lane + idx*8]
+ mov unused_lanes, [state + _aes_unused_lanes]
+ mov qword [state + _aes_job_in_lane + idx*8], 0
+ or dword [job_rax + _status], STS_COMPLETED_AES
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _aes_unused_lanes], unused_lanes
+ sub qword [state + _aes_lanes_in_use], 1
+
+%ifdef SAFE_DATA
+        ; Set the bit corresponding to the returned job's lane
+ xor DWORD(tmp3), DWORD(tmp3)
+ bts DWORD(tmp3), DWORD(idx)
+ kmovw k1, DWORD(tmp3)
+ korw k6, k1, k6
+
+ ;; Clear IV and expanded keys of returned job and "NULL lanes"
+ ;; (k6 contains the mask of the jobs)
+ CLEAR_IV_IN_NULL_LANES tmp1, xmm0, k6
+ CLEAR_KEYS_IN_NULL_LANES tmp1, xmm0, k6
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
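
For reference, the scheduling step above (two vphminposuw passes over lanes 0-7 and 8-15, then vpsubw) reduces to the following C sketch; NULL lanes were set to 0xFFFF earlier, so they never win the minimum (hypothetical helper, not part of the library):

    #include <stdint.h>

    /* Sketch only: pick the shortest outstanding length, remember its lane
     * and subtract it from every lane before calling the x16 cipher core. */
    static unsigned schedule_min_len(uint16_t lens[16], uint16_t *min_len)
    {
            unsigned idx = 0;

            for (unsigned i = 1; i < 16; i++)
                    if (lens[i] < lens[idx])
                            idx = i;
            *min_len = lens[idx];
            for (unsigned i = 0; i < 16; i++)
                    lens[i] -= *min_len;    /* lane 'idx' drops to 0 */
            return idx;                     /* its job completes first */
    }
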
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_submit_avx512.asm
new file mode 100644
index 000000000..f79d15f68
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_submit_avx512.asm
@@ -0,0 +1,280 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+%ifndef AES_CBC_ENC_X16
+%define AES_CBC_ENC_X16 aes_cbc_enc_128_vaes_avx512
+%define NUM_KEYS 11
+%define SUBMIT_JOB_AES_ENC submit_job_aes128_enc_vaes_avx512
+%endif
+
+; void AES_CBC_ENC_X16(AES_ARGS_x16 *args, UINT64 len_in_bytes);
+extern AES_CBC_ENC_X16
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+; idx needs to be in rbp
+%define len rbp
+%define idx rbp
+%define tmp r10
+%define tmp2 r11
+%define tmp3 r12
+
+%define lane r8
+
+%define iv r9
+
+%define unused_lanes rbx
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+
+%macro INSERT_KEYS 6
+%define %%KP %1 ; [in] GP reg with pointer to expanded keys
+%define %%LANE %2 ; [in] GP reg with lane number
+%define %%NKEYS %3 ; [in] number of round keys (numerical value)
+%define %%COL %4 ; [clobbered] GP reg
+%define %%ZTMP %5 ; [clobbered] ZMM reg
+%define %%IA0 %6 ; [clobbered] GP reg
+
+
+%assign ROW (16*16)
+
+ mov %%COL, %%LANE
+ shl %%COL, 4
+ lea %%IA0, [state + _aes_args_key_tab]
+ add %%COL, %%IA0
+
+ vmovdqu64 %%ZTMP, [%%KP]
+ vextracti64x2 [%%COL + ROW*0], %%ZTMP, 0
+ vextracti64x2 [%%COL + ROW*1], %%ZTMP, 1
+ vextracti64x2 [%%COL + ROW*2], %%ZTMP, 2
+ vextracti64x2 [%%COL + ROW*3], %%ZTMP, 3
+
+ vmovdqu64 %%ZTMP, [%%KP + 64]
+ vextracti64x2 [%%COL + ROW*4], %%ZTMP, 0
+ vextracti64x2 [%%COL + ROW*5], %%ZTMP, 1
+ vextracti64x2 [%%COL + ROW*6], %%ZTMP, 2
+ vextracti64x2 [%%COL + ROW*7], %%ZTMP, 3
+
+%if %%NKEYS > 11 ; 192 or 256 - copy 4 more keys
+ vmovdqu64 %%ZTMP, [%%KP + 128]
+ vextracti64x2 [%%COL + ROW*11], %%ZTMP, 3
+%else ; 128 - copy 3 more keys
+ mov %%IA0, 0x3f
+ kmovq k1, %%IA0
+ vmovdqu64 %%ZTMP{k1}{z}, [%%KP + 128]
+%endif
+ vextracti64x2 [%%COL + ROW*8], %%ZTMP, 0
+ vextracti64x2 [%%COL + ROW*9], %%ZTMP, 1
+ vextracti64x2 [%%COL + ROW*10], %%ZTMP, 2
+
+%if %%NKEYS == 15 ; 256 - 3 more keys
+ mov %%IA0, 0x3f
+ kmovq k1, %%IA0
+ vmovdqu64 %%ZTMP{k1}{z}, [%%KP + 192]
+ vextracti64x2 [%%COL + ROW*12], %%ZTMP, 0
+ vextracti64x2 [%%COL + ROW*13], %%ZTMP, 1
+ vextracti64x2 [%%COL + ROW*14], %%ZTMP, 2
+%elif %%NKEYS == 13 ; 192 - 1 more key
+ mov %%IA0, 0x3
+ kmovq k1, %%IA0
+ vmovdqu64 %%ZTMP{k1}{z}, [%%KP + 192]
+ vextracti64x2 [%%COL + ROW*12], %%ZTMP, 0
+%endif
+%endmacro
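
INSERT_KEYS scatters one lane's expanded key schedule into its column of the round-major key table: each round occupies a row of 16 lanes x 16 bytes, hence ROW == 16*16. A hedged C sketch of that layout (types and names are illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Sketch only: key_tab[round][lane] is one 16-byte round key;
     * 'expanded' holds the lane's round keys back to back. */
    static void insert_keys(uint8_t key_tab[][16][16], unsigned lane,
                            const uint8_t *expanded, unsigned nkeys)
    {
            for (unsigned round = 0; round < nkeys; round++)
                    memcpy(key_tab[round][lane], expanded + round * 16, 16);
    }
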
+
+; JOB* SUBMIT_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(SUBMIT_JOB_AES_ENC,function,internal)
+SUBMIT_JOB_AES_ENC:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _aes_unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ mov len, [job + _msg_len_to_cipher_in_bytes]
+        and     len, -16                ; DOCSIS may pass a length not aligned to the block size
+ mov iv, [job + _iv]
+ mov [state + _aes_unused_lanes], unused_lanes
+ add qword [state + _aes_lanes_in_use], 1
+
+ mov [state + _aes_job_in_lane + lane*8], job
+
+ ;; Update lane len
+ vmovdqa64 ymm0, [state + _aes_lens]
+ mov tmp2, rcx ; save rcx
+ mov rcx, lane
+ mov tmp, 1
+ shl tmp, cl
+ mov rcx, tmp2 ; restore rcx
+ kmovq k1, tmp
+
+ vpbroadcastw ymm1, WORD(len)
+ vmovdqu16 ymm0{k1}, ymm1
+ vmovdqa64 [state + _aes_lens], ymm0
+
+ ;; Find min length for lanes 0-7
+ vphminposuw xmm2, xmm0
+
+ ;; Update input pointer
+ mov tmp, [job + _src]
+ add tmp, [job + _cipher_start_src_offset_in_bytes]
+ vmovdqu xmm1, [iv]
+ mov [state + _aes_args_in + lane*8], tmp
+
+ ;; Insert expanded keys
+ mov tmp, [job + _aes_enc_key_expanded]
+ INSERT_KEYS tmp, lane, NUM_KEYS, tmp2, zmm4, tmp3
+
+ ;; Update output pointer
+ mov tmp, [job + _dst]
+ mov [state + _aes_args_out + lane*8], tmp
+ shl lane, 4 ; multiply by 16
+ vmovdqa [state + _aes_args_IV + lane], xmm1
+
+ cmp qword [state + _aes_lanes_in_use], 16
+ jne return_null
+
+        ; Extract min for lanes 0-7, then find min for lanes 8-15
+ vpextrw DWORD(len2), xmm2, 0 ; min value
+ vpextrw DWORD(idx), xmm2, 1 ; min index
+ vextracti128 xmm1, ymm0, 1
+ vphminposuw xmm2, xmm1
+ vpextrw DWORD(tmp), xmm2, 0 ; min value
+ cmp DWORD(len2), DWORD(tmp)
+ jle use_min
+ vpextrw DWORD(idx), xmm2, 1 ; min index
+        add     DWORD(idx), 8           ; index is relative to lanes 8-15
+ mov len2, tmp ; min len
+use_min:
+ cmp len2, 0
+ je len_is_0
+
+ vpbroadcastw ymm3, WORD(len2)
+ vpsubw ymm0, ymm0, ymm3
+ vmovdqa [state + _aes_lens], ymm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_CBC_ENC_X16
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ mov job_rax, [state + _aes_job_in_lane + idx*8]
+
+ mov unused_lanes, [state + _aes_unused_lanes]
+ mov qword [state + _aes_job_in_lane + idx*8], 0
+ or dword [job_rax + _status], STS_COMPLETED_AES
+ shl unused_lanes, 4
+ or unused_lanes, idx
+
+ mov [state + _aes_unused_lanes], unused_lanes
+ sub qword [state + _aes_lanes_in_use], 1
+
+%ifdef SAFE_DATA
+ ;; Clear IV
+ vpxorq xmm0, xmm0
+ shl idx, 4 ; multiply by 16
+ vmovdqa [state + _aes_args_IV + idx], xmm0
+
+ ;; Clear expanded keys
+%assign round 0
+%rep NUM_KEYS
+ vmovdqa [state + _aesarg_key_tab + round * (16*16) + idx], xmm0
+%assign round (round + 1)
+%endrep
+
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
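
Both the submit path above and the flush path treat _aes_unused_lanes as a packed nibble stack: submit pops the low nibble to claim a lane, job completion pushes the lane index back. A minimal C sketch of those two operations (hypothetical helpers):

    #include <stdint.h>

    /* Sketch only: each 4-bit nibble of the list is a free lane index. */
    static unsigned pop_lane(uint64_t *unused_lanes)
    {
            unsigned lane = (unsigned)(*unused_lanes & 0xF);

            *unused_lanes >>= 4;
            return lane;
    }

    static void push_lane(uint64_t *unused_lanes, unsigned lane)
    {
            *unused_lanes = (*unused_lanes << 4) | lane;
    }
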
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_avx512.c b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_avx512.c
new file mode 100644
index 000000000..bd1aaef63
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_avx512.c
@@ -0,0 +1,1066 @@
+/*******************************************************************************
+ Copyright (c) 2012-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define AVX512
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_zmms
+
+#include "intel-ipsec-mb.h"
+#include "include/kasumi_internal.h"
+#include "include/zuc_internal.h"
+#include "include/snow3g.h"
+
+#include "save_xmms.h"
+#include "asm.h"
+#include "des.h"
+#include "gcm.h"
+#include "cpu_feature.h"
+#include "noaesni.h"
+
+JOB_AES_HMAC *submit_job_aes128_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes128_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes192_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes192_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes256_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes256_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes128_enc_vaes_avx512(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes128_enc_vaes_avx512(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes192_enc_vaes_avx512(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes192_enc_vaes_avx512(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes256_enc_vaes_avx512(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes256_enc_vaes_avx512(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_des_cbc_enc_avx512(MB_MGR_DES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_des_cbc_enc_avx512(MB_MGR_DES_OOO *state);
+
+JOB_AES_HMAC *submit_job_des_cbc_dec_avx512(MB_MGR_DES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_des_cbc_dec_avx512(MB_MGR_DES_OOO *state);
+
+JOB_AES_HMAC *submit_job_3des_cbc_enc_avx512(MB_MGR_DES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_3des_cbc_enc_avx512(MB_MGR_DES_OOO *state);
+
+JOB_AES_HMAC *submit_job_3des_cbc_dec_avx512(MB_MGR_DES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_3des_cbc_dec_avx512(MB_MGR_DES_OOO *state);
+
+JOB_AES_HMAC *submit_job_docsis_des_enc_avx512(MB_MGR_DES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_docsis_des_enc_avx512(MB_MGR_DES_OOO *state);
+
+JOB_AES_HMAC *submit_job_docsis_des_dec_avx512(MB_MGR_DES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_docsis_des_dec_avx512(MB_MGR_DES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_cntr_avx(JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *submit_job_aes_cntr_bit_avx(JOB_AES_HMAC *job);
+
+#define SAVE_XMMS save_xmms_avx
+#define RESTORE_XMMS restore_xmms_avx
+
+#define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_avx512
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx512
+#define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_avx512
+
+#define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_avx512
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx512
+#define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_avx512
+
+#define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_avx512
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx512
+#define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_avx512
+
+#define SUBMIT_JOB_AES_ECB_128_ENC submit_job_aes_ecb_128_enc_avx
+#define SUBMIT_JOB_AES_ECB_128_DEC submit_job_aes_ecb_128_dec_avx
+#define SUBMIT_JOB_AES_ECB_192_ENC submit_job_aes_ecb_192_enc_avx
+#define SUBMIT_JOB_AES_ECB_192_DEC submit_job_aes_ecb_192_dec_avx
+#define SUBMIT_JOB_AES_ECB_256_ENC submit_job_aes_ecb_256_enc_avx
+#define SUBMIT_JOB_AES_ECB_256_DEC submit_job_aes_ecb_256_dec_avx
+
+#define SUBMIT_JOB_AES_CNTR submit_job_aes_cntr_avx512
+#define SUBMIT_JOB_AES_CNTR_BIT submit_job_aes_cntr_bit_avx512
+
+#define AES_CBC_DEC_128 aes_cbc_dec_128_avx512
+#define AES_CBC_DEC_192 aes_cbc_dec_192_avx512
+#define AES_CBC_DEC_256 aes_cbc_dec_256_avx512
+
+#define AES_CNTR_128 aes_cntr_128_avx
+#define AES_CNTR_192 aes_cntr_192_avx
+#define AES_CNTR_256 aes_cntr_256_avx
+
+#define AES_CNTR_CCM_128 aes_cntr_ccm_128_avx
+
+#define AES_ECB_ENC_128 aes_ecb_enc_128_avx
+#define AES_ECB_ENC_192 aes_ecb_enc_192_avx
+#define AES_ECB_ENC_256 aes_ecb_enc_256_avx
+#define AES_ECB_DEC_128 aes_ecb_dec_128_avx
+#define AES_ECB_DEC_192 aes_ecb_dec_192_avx
+#define AES_ECB_DEC_256 aes_ecb_dec_256_avx
+
+#define SUBMIT_JOB_PON_ENC submit_job_pon_enc_avx
+#define SUBMIT_JOB_PON_DEC submit_job_pon_dec_avx
+#define SUBMIT_JOB_PON_ENC_NO_CTR submit_job_pon_enc_no_ctr_avx
+#define SUBMIT_JOB_PON_DEC_NO_CTR submit_job_pon_dec_no_ctr_avx
+
+#define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx
+#define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx
+
+#define SUBMIT_JOB_DES_CBC_ENC submit_job_des_cbc_enc_avx512
+#define FLUSH_JOB_DES_CBC_ENC flush_job_des_cbc_enc_avx512
+
+#define SUBMIT_JOB_DES_CBC_DEC submit_job_des_cbc_dec_avx512
+#define FLUSH_JOB_DES_CBC_DEC flush_job_des_cbc_dec_avx512
+
+#define SUBMIT_JOB_3DES_CBC_ENC submit_job_3des_cbc_enc_avx512
+#define FLUSH_JOB_3DES_CBC_ENC flush_job_3des_cbc_enc_avx512
+
+#define SUBMIT_JOB_3DES_CBC_DEC submit_job_3des_cbc_dec_avx512
+#define FLUSH_JOB_3DES_CBC_DEC flush_job_3des_cbc_dec_avx512
+
+#define SUBMIT_JOB_DOCSIS_DES_ENC submit_job_docsis_des_enc_avx512
+#define FLUSH_JOB_DOCSIS_DES_ENC flush_job_docsis_des_enc_avx512
+
+#define SUBMIT_JOB_DOCSIS_DES_DEC submit_job_docsis_des_dec_avx512
+#define FLUSH_JOB_DOCSIS_DES_DEC flush_job_docsis_des_dec_avx512
+
+#define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_AVX512
+#define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_AVX512
+#define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_AVX512
+
+JOB_AES_HMAC *submit_job_hmac_avx512(MB_MGR_HMAC_SHA_1_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_avx512(MB_MGR_HMAC_SHA_1_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_224_avx512(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_224_avx512(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_256_avx512(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_256_avx512(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_384_avx512(MB_MGR_HMAC_SHA_512_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_384_avx512(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_512_avx512(MB_MGR_HMAC_SHA_512_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_512_avx512(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state);
+
+#define SUBMIT_JOB_HMAC submit_job_hmac_avx512
+#define FLUSH_JOB_HMAC flush_job_hmac_avx512
+#define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_avx512
+#define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_avx512
+#define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_avx512
+#define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_avx512
+#define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_avx512
+#define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_avx512
+#define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_avx512
+#define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_avx512
+#define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_avx2
+#define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_avx2
+
+#ifndef NO_GCM
+#define AES_GCM_DEC_128 aes_gcm_dec_128_avx512
+#define AES_GCM_ENC_128 aes_gcm_enc_128_avx512
+#define AES_GCM_DEC_192 aes_gcm_dec_192_avx512
+#define AES_GCM_ENC_192 aes_gcm_enc_192_avx512
+#define AES_GCM_DEC_256 aes_gcm_dec_256_avx512
+#define AES_GCM_ENC_256 aes_gcm_enc_256_avx512
+
+#define AES_GCM_DEC_128_VAES aes_gcm_dec_128_vaes_avx512
+#define AES_GCM_ENC_128_VAES aes_gcm_enc_128_vaes_avx512
+#define AES_GCM_DEC_192_VAES aes_gcm_dec_192_vaes_avx512
+#define AES_GCM_ENC_192_VAES aes_gcm_enc_192_vaes_avx512
+#define AES_GCM_DEC_256_VAES aes_gcm_dec_256_vaes_avx512
+#define AES_GCM_ENC_256_VAES aes_gcm_enc_256_vaes_avx512
+
+#define SUBMIT_JOB_AES_GCM_DEC submit_job_aes_gcm_dec_avx512
+#define FLUSH_JOB_AES_GCM_DEC flush_job_aes_gcm_avx512
+#define SUBMIT_JOB_AES_GCM_ENC submit_job_aes_gcm_enc_avx512
+#define FLUSH_JOB_AES_GCM_ENC flush_job_aes_gcm_avx512
+#endif /* NO_GCM */
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB submit_job_avx512
+#define FLUSH_JOB flush_job_avx512
+#define QUEUE_SIZE queue_size_avx512
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_avx512
+#define GET_NEXT_JOB get_next_job_avx512
+#define GET_COMPLETED_JOB get_completed_job_avx512
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AVX512
+#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AVX512
+
+/* ====================================================================== */
+
+#define AES_CFB_128_ONE aes_cfb_128_one_avx512
+
+void aes128_cbc_mac_x8(AES_ARGS *args, uint64_t len);
+
+#define AES128_CBC_MAC aes128_cbc_mac_x8
+
+#define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_avx
+#define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_avx
+
+#define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_avx
+#define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_avx
+
+/* ====================================================================== */
+
+/*
+ * GCM submit / flush API for AVX512 arch
+ */
+#ifndef NO_GCM
+static JOB_AES_HMAC *
+plain_submit_gcm_dec_avx512(MB_MGR *state, JOB_AES_HMAC *job)
+{
+ DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+ (void) state;
+
+ if (16 == job->aes_key_len_in_bytes)
+ AES_GCM_DEC_128(job->aes_dec_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes,
+ job->iv,
+ job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+ else if (24 == job->aes_key_len_in_bytes)
+ AES_GCM_DEC_192(job->aes_dec_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes,
+ job->iv,
+ job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+ else /* assume 32 bytes */
+ AES_GCM_DEC_256(job->aes_dec_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes,
+ job->iv,
+ job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+
+ job->status = STS_COMPLETED;
+ return job;
+}
+
+static JOB_AES_HMAC *
+plain_submit_gcm_enc_avx512(MB_MGR *state, JOB_AES_HMAC *job)
+{
+ DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+ (void) state;
+
+ if (16 == job->aes_key_len_in_bytes)
+ AES_GCM_ENC_128(job->aes_enc_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes, job->iv,
+ job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+ else if (24 == job->aes_key_len_in_bytes)
+ AES_GCM_ENC_192(job->aes_enc_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes, job->iv,
+ job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+ else /* assume 32 bytes */
+ AES_GCM_ENC_256(job->aes_enc_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes, job->iv,
+ job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+
+ job->status = STS_COMPLETED;
+ return job;
+}
+
+static JOB_AES_HMAC *
+vaes_submit_gcm_dec_avx512(MB_MGR *state, JOB_AES_HMAC *job)
+{
+ DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+ (void) state;
+
+ if (16 == job->aes_key_len_in_bytes)
+ AES_GCM_DEC_128_VAES(job->aes_dec_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes,
+ job->iv,
+ job->u.GCM.aad,
+ job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+ else if (24 == job->aes_key_len_in_bytes)
+ AES_GCM_DEC_192_VAES(job->aes_dec_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes,
+ job->iv,
+ job->u.GCM.aad,
+ job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+ else /* assume 32 bytes */
+ AES_GCM_DEC_256_VAES(job->aes_dec_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes,
+ job->iv,
+ job->u.GCM.aad,
+ job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+
+ job->status = STS_COMPLETED;
+ return job;
+}
+
+static JOB_AES_HMAC *
+vaes_submit_gcm_enc_avx512(MB_MGR *state, JOB_AES_HMAC *job)
+{
+ DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+ (void) state;
+
+ if (16 == job->aes_key_len_in_bytes)
+ AES_GCM_ENC_128_VAES(job->aes_enc_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes, job->iv,
+ job->u.GCM.aad,
+ job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+ else if (24 == job->aes_key_len_in_bytes)
+ AES_GCM_ENC_192_VAES(job->aes_enc_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes, job->iv,
+ job->u.GCM.aad,
+ job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+ else /* assume 32 bytes */
+ AES_GCM_ENC_256_VAES(job->aes_enc_key_expanded, &ctx, job->dst,
+ job->src +
+ job->cipher_start_src_offset_in_bytes,
+ job->msg_len_to_cipher_in_bytes, job->iv,
+ job->u.GCM.aad,
+ job->u.GCM.aad_len_in_bytes,
+ job->auth_tag_output,
+ job->auth_tag_output_len_in_bytes);
+
+ job->status = STS_COMPLETED;
+ return job;
+}
+
+static JOB_AES_HMAC *
+flush_job_aes_gcm_avx512(MB_MGR *state, JOB_AES_HMAC *job)
+{
+ (void) state;
+ (void) job;
+ return NULL;
+}
+
+static JOB_AES_HMAC *(*submit_job_aes_gcm_enc_avx512)
+ (MB_MGR *state, JOB_AES_HMAC *job) = plain_submit_gcm_enc_avx512;
+
+static JOB_AES_HMAC *(*submit_job_aes_gcm_dec_avx512)
+ (MB_MGR *state, JOB_AES_HMAC *job) = plain_submit_gcm_dec_avx512;
+
+#endif /* NO_GCM */
+
+static JOB_AES_HMAC *(*submit_job_aes_cntr_avx512)
+ (JOB_AES_HMAC *job) = submit_job_aes_cntr_avx;
+static JOB_AES_HMAC *(*submit_job_aes_cntr_bit_avx512)
+ (JOB_AES_HMAC *job) = submit_job_aes_cntr_bit_avx;
+
+static JOB_AES_HMAC *
+vaes_submit_cntr_avx512(JOB_AES_HMAC *job)
+{
+ if (16 == job->aes_key_len_in_bytes)
+ aes_cntr_128_submit_vaes_avx512(job);
+ else if (24 == job->aes_key_len_in_bytes)
+ aes_cntr_192_submit_vaes_avx512(job);
+ else /* assume 32 bytes */
+ aes_cntr_256_submit_vaes_avx512(job);
+
+ job->status |= STS_COMPLETED_AES;
+ return job;
+}
+
+static JOB_AES_HMAC *
+vaes_submit_cntr_bit_avx512(JOB_AES_HMAC *job)
+{
+ if (16 == job->aes_key_len_in_bytes)
+ aes_cntr_bit_128_submit_vaes_avx512(job);
+ else if (24 == job->aes_key_len_in_bytes)
+ aes_cntr_bit_192_submit_vaes_avx512(job);
+ else /* assume 32 bytes */
+ aes_cntr_bit_256_submit_vaes_avx512(job);
+
+ job->status |= STS_COMPLETED_AES;
+ return job;
+}
+
+/* ====================================================================== */
+
+static JOB_AES_HMAC *
+(*submit_job_aes128_enc_avx512)
+ (MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) = submit_job_aes128_enc_avx;
+
+static JOB_AES_HMAC *
+(*submit_job_aes192_enc_avx512)
+ (MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) = submit_job_aes192_enc_avx;
+
+static JOB_AES_HMAC *
+(*submit_job_aes256_enc_avx512)
+ (MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) = submit_job_aes256_enc_avx;
+
+static JOB_AES_HMAC *
+(*flush_job_aes128_enc_avx512)
+ (MB_MGR_AES_OOO *state) = flush_job_aes128_enc_avx;
+
+static JOB_AES_HMAC *
+(*flush_job_aes192_enc_avx512)
+ (MB_MGR_AES_OOO *state) = flush_job_aes192_enc_avx;
+
+static JOB_AES_HMAC *
+(*flush_job_aes256_enc_avx512)
+ (MB_MGR_AES_OOO *state) = flush_job_aes256_enc_avx;
+
+static void
+(*aes_cbc_dec_128_avx512) (const void *in, const uint8_t *IV,
+ const void *keys, void *out,
+ uint64_t len_bytes) = aes_cbc_dec_128_avx;
+static void
+(*aes_cbc_dec_192_avx512) (const void *in, const uint8_t *IV,
+ const void *keys, void *out,
+ uint64_t len_bytes) = aes_cbc_dec_192_avx;
+static void
+(*aes_cbc_dec_256_avx512) (const void *in, const uint8_t *IV,
+ const void *keys, void *out,
+ uint64_t len_bytes) = aes_cbc_dec_256_avx;
+
+void
+init_mb_mgr_avx512(MB_MGR *state)
+{
+ unsigned int j, vaes_support = 0;
+ uint8_t *p;
+ size_t size;
+
+ state->features = cpu_feature_adjust(state->flags,
+ cpu_feature_detect());
+
+ if (!(state->features & IMB_FEATURE_AESNI)) {
+ init_mb_mgr_sse_no_aesni(state);
+ return;
+ }
+ if ((state->features & IMB_FEATURE_VAES) == IMB_FEATURE_VAES) {
+ vaes_support = 1;
+ aes_cbc_dec_128_avx512 = aes_cbc_dec_128_vaes_avx512;
+ aes_cbc_dec_192_avx512 = aes_cbc_dec_192_vaes_avx512;
+ aes_cbc_dec_256_avx512 = aes_cbc_dec_256_vaes_avx512;
+ submit_job_aes128_enc_avx512 =
+ submit_job_aes128_enc_vaes_avx512;
+ flush_job_aes128_enc_avx512 =
+ flush_job_aes128_enc_vaes_avx512;
+ submit_job_aes192_enc_avx512 =
+ submit_job_aes192_enc_vaes_avx512;
+ flush_job_aes192_enc_avx512 =
+ flush_job_aes192_enc_vaes_avx512;
+ submit_job_aes256_enc_avx512 =
+ submit_job_aes256_enc_vaes_avx512;
+ flush_job_aes256_enc_avx512 =
+ flush_job_aes256_enc_vaes_avx512;
+ }
+
+ /* Init AES out-of-order fields */
+ if (vaes_support) {
+ /* init 16 lanes */
+ memset(state->aes128_ooo.lens, 0,
+ sizeof(state->aes128_ooo.lens));
+ memset(state->aes128_ooo.job_in_lane, 0,
+ sizeof(state->aes128_ooo.job_in_lane));
+ state->aes128_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->aes128_ooo.num_lanes_inuse = 0;
+
+ memset(state->aes192_ooo.lens, 0,
+ sizeof(state->aes192_ooo.lens));
+ memset(state->aes192_ooo.job_in_lane, 0,
+ sizeof(state->aes192_ooo.job_in_lane));
+ state->aes192_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->aes192_ooo.num_lanes_inuse = 0;
+
+ memset(state->aes256_ooo.lens, 0,
+ sizeof(state->aes256_ooo.lens));
+ memset(state->aes256_ooo.job_in_lane, 0,
+ sizeof(state->aes256_ooo.job_in_lane));
+ state->aes256_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->aes256_ooo.num_lanes_inuse = 0;
+ } else {
+ /* init 8 lanes */
+ memset(state->aes128_ooo.lens, 0xFF,
+ sizeof(state->aes128_ooo.lens));
+ memset(&state->aes128_ooo.lens[0], 0,
+ sizeof(state->aes128_ooo.lens[0]) * 8);
+ memset(state->aes128_ooo.job_in_lane, 0,
+ sizeof(state->aes128_ooo.job_in_lane));
+ state->aes128_ooo.unused_lanes = 0xF76543210;
+ state->aes128_ooo.num_lanes_inuse = 0;
+
+ memset(state->aes192_ooo.lens, 0xFF,
+ sizeof(state->aes192_ooo.lens));
+ memset(&state->aes192_ooo.lens[0], 0,
+ sizeof(state->aes192_ooo.lens[0]) * 8);
+ memset(state->aes192_ooo.job_in_lane, 0,
+ sizeof(state->aes192_ooo.job_in_lane));
+ state->aes192_ooo.unused_lanes = 0xF76543210;
+ state->aes192_ooo.num_lanes_inuse = 0;
+
+ memset(&state->aes256_ooo.lens, 0xFF,
+ sizeof(state->aes256_ooo.lens));
+ memset(&state->aes256_ooo.lens[0], 0,
+ sizeof(state->aes256_ooo.lens[0]) * 8);
+ memset(state->aes256_ooo.job_in_lane, 0,
+ sizeof(state->aes256_ooo.job_in_lane));
+ state->aes256_ooo.unused_lanes = 0xF76543210;
+ state->aes256_ooo.num_lanes_inuse = 0;
+ }
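
The unused_lanes constants above are that same nibble stack pre-filled with every free lane index, with an extra 0xF sentinel nibble on top in the 8-lane case. A small sketch that reproduces both values (illustrative only):

    #include <stdint.h>

    /* Sketch only: 16 lanes -> 0xFEDCBA9876543210, 8 lanes -> 0xF76543210. */
    static uint64_t build_unused_lanes(unsigned num_lanes)
    {
            uint64_t list = (num_lanes < 16) ? 0xF : 0; /* sentinel nibble */

            for (int i = (int)num_lanes - 1; i >= 0; i--)
                    list = (list << 4) | (uint64_t)i;
            return list;
    }
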
+
+
+ /* DOCSIS SEC BPI (AES CBC + AES CFB for partial block)
+ * uses same settings as AES128 CBC.
+ */
+ if (vaes_support) {
+ /* init 16 lanes */
+ memset(state->docsis_sec_ooo.lens, 0,
+ sizeof(state->docsis_sec_ooo.lens));
+ memset(state->docsis_sec_ooo.job_in_lane, 0,
+ sizeof(state->docsis_sec_ooo.job_in_lane));
+ state->docsis_sec_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->docsis_sec_ooo.num_lanes_inuse = 0;
+ } else {
+ /* init 8 lanes */
+ memset(state->docsis_sec_ooo.lens, 0xFF,
+ sizeof(state->docsis_sec_ooo.lens));
+ memset(&state->docsis_sec_ooo.lens[0], 0,
+ sizeof(state->docsis_sec_ooo.lens[0]) * 8);
+ memset(state->docsis_sec_ooo.job_in_lane, 0,
+ sizeof(state->docsis_sec_ooo.job_in_lane));
+ state->docsis_sec_ooo.unused_lanes = 0xF76543210;
+ state->docsis_sec_ooo.num_lanes_inuse = 0;
+ }
+
+
+ /* DES, 3DES and DOCSIS DES (DES CBC + DES CFB for partial block) */
+ /* - separate DES OOO for encryption */
+ for (j = 0; j < AVX512_NUM_DES_LANES; j++) {
+ state->des_enc_ooo.lens[j] = 0;
+ state->des_enc_ooo.job_in_lane[j] = NULL;
+ }
+ state->des_enc_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->des_enc_ooo.num_lanes_inuse = 0;
+ memset(&state->des_enc_ooo.args, 0, sizeof(state->des_enc_ooo.args));
+
+ /* - separate DES OOO for decryption */
+ for (j = 0; j < AVX512_NUM_DES_LANES; j++) {
+ state->des_dec_ooo.lens[j] = 0;
+ state->des_dec_ooo.job_in_lane[j] = NULL;
+ }
+ state->des_dec_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->des_dec_ooo.num_lanes_inuse = 0;
+ memset(&state->des_dec_ooo.args, 0, sizeof(state->des_dec_ooo.args));
+
+ /* - separate 3DES OOO for encryption */
+ for (j = 0; j < AVX512_NUM_DES_LANES; j++) {
+ state->des3_enc_ooo.lens[j] = 0;
+ state->des3_enc_ooo.job_in_lane[j] = NULL;
+ }
+ state->des3_enc_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->des3_enc_ooo.num_lanes_inuse = 0;
+ memset(&state->des3_enc_ooo.args, 0, sizeof(state->des3_enc_ooo.args));
+
+ /* - separate 3DES OOO for decryption */
+ for (j = 0; j < AVX512_NUM_DES_LANES; j++) {
+ state->des3_dec_ooo.lens[j] = 0;
+ state->des3_dec_ooo.job_in_lane[j] = NULL;
+ }
+ state->des3_dec_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->des3_dec_ooo.num_lanes_inuse = 0;
+ memset(&state->des3_dec_ooo.args, 0, sizeof(state->des3_dec_ooo.args));
+
+ /* - separate DOCSIS DES OOO for encryption */
+ for (j = 0; j < AVX512_NUM_DES_LANES; j++) {
+ state->docsis_des_enc_ooo.lens[j] = 0;
+ state->docsis_des_enc_ooo.job_in_lane[j] = NULL;
+ }
+ state->docsis_des_enc_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->docsis_des_enc_ooo.num_lanes_inuse = 0;
+ memset(&state->docsis_des_enc_ooo.args, 0,
+ sizeof(state->docsis_des_enc_ooo.args));
+
+        /* - separate DOCSIS DES OOO for decryption */
+ for (j = 0; j < AVX512_NUM_DES_LANES; j++) {
+ state->docsis_des_dec_ooo.lens[j] = 0;
+ state->docsis_des_dec_ooo.job_in_lane[j] = NULL;
+ }
+ state->docsis_des_dec_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->docsis_des_dec_ooo.num_lanes_inuse = 0;
+ memset(&state->docsis_des_dec_ooo.args, 0,
+ sizeof(state->docsis_des_dec_ooo.args));
+
+ /* Init HMAC/SHA1 out-of-order fields */
+ state->hmac_sha_1_ooo.lens[0] = 0;
+ state->hmac_sha_1_ooo.lens[1] = 0;
+ state->hmac_sha_1_ooo.lens[2] = 0;
+ state->hmac_sha_1_ooo.lens[3] = 0;
+ state->hmac_sha_1_ooo.lens[4] = 0;
+ state->hmac_sha_1_ooo.lens[5] = 0;
+ state->hmac_sha_1_ooo.lens[6] = 0;
+ state->hmac_sha_1_ooo.lens[7] = 0;
+ state->hmac_sha_1_ooo.lens[8] = 0;
+ state->hmac_sha_1_ooo.lens[9] = 0;
+ state->hmac_sha_1_ooo.lens[10] = 0;
+ state->hmac_sha_1_ooo.lens[11] = 0;
+ state->hmac_sha_1_ooo.lens[12] = 0;
+ state->hmac_sha_1_ooo.lens[13] = 0;
+ state->hmac_sha_1_ooo.lens[14] = 0;
+ state->hmac_sha_1_ooo.lens[15] = 0;
+ state->hmac_sha_1_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->hmac_sha_1_ooo.num_lanes_inuse = 0;
+ for (j = 0; j < AVX512_NUM_SHA1_LANES; j++) {
+ state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL;
+ state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80;
+ memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65,
+ 0x00,
+ 64 + 7);
+ p = state->hmac_sha_1_ooo.ldata[j].outer_block;
+ memset(p + 5*4 + 1,
+ 0x00,
+ 64 - 5*4 - 1 - 2);
+ p[5 * 4] = 0x80;
+ p[64 - 2] = 0x02;
+ p[64 - 1] = 0xA0;
+ }
+
+ /* Init HMAC/SHA224 out-of-order fields */
+ state->hmac_sha_224_ooo.lens[0] = 0;
+ state->hmac_sha_224_ooo.lens[1] = 0;
+ state->hmac_sha_224_ooo.lens[2] = 0;
+ state->hmac_sha_224_ooo.lens[3] = 0;
+ state->hmac_sha_224_ooo.lens[4] = 0;
+ state->hmac_sha_224_ooo.lens[5] = 0;
+ state->hmac_sha_224_ooo.lens[6] = 0;
+ state->hmac_sha_224_ooo.lens[7] = 0;
+ state->hmac_sha_224_ooo.lens[8] = 0;
+ state->hmac_sha_224_ooo.lens[9] = 0;
+ state->hmac_sha_224_ooo.lens[10] = 0;
+ state->hmac_sha_224_ooo.lens[11] = 0;
+ state->hmac_sha_224_ooo.lens[12] = 0;
+ state->hmac_sha_224_ooo.lens[13] = 0;
+ state->hmac_sha_224_ooo.lens[14] = 0;
+ state->hmac_sha_224_ooo.lens[15] = 0;
+ state->hmac_sha_224_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->hmac_sha_224_ooo.num_lanes_inuse = 0;
+ /* sha256 and sha224 are very similar except for
+ * digest constants and output size
+ */
+ for (j = 0; j < AVX512_NUM_SHA256_LANES; j++) {
+ state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL;
+
+ p = state->hmac_sha_224_ooo.ldata[j].extra_block;
+ size = sizeof(state->hmac_sha_224_ooo.ldata[j].extra_block);
+ memset (p, 0x00, size);
+ p[64] = 0x80;
+
+ p = state->hmac_sha_224_ooo.ldata[j].outer_block;
+ size = sizeof(state->hmac_sha_224_ooo.ldata[j].outer_block);
+ memset(p, 0x00, size);
+ p[7 * 4] = 0x80; /* digest 7 words long */
+                p[64 - 2] = 0x02; /* length = 0x02E0 bits (512 + 224) */
+ p[64 - 1] = 0xE0;
+ }
+
+ /* Init HMAC/SHA256 out-of-order fields */
+ state->hmac_sha_256_ooo.lens[0] = 0;
+ state->hmac_sha_256_ooo.lens[1] = 0;
+ state->hmac_sha_256_ooo.lens[2] = 0;
+ state->hmac_sha_256_ooo.lens[3] = 0;
+ state->hmac_sha_256_ooo.lens[4] = 0;
+ state->hmac_sha_256_ooo.lens[5] = 0;
+ state->hmac_sha_256_ooo.lens[6] = 0;
+ state->hmac_sha_256_ooo.lens[7] = 0;
+ state->hmac_sha_256_ooo.lens[8] = 0;
+ state->hmac_sha_256_ooo.lens[9] = 0;
+ state->hmac_sha_256_ooo.lens[10] = 0;
+ state->hmac_sha_256_ooo.lens[11] = 0;
+ state->hmac_sha_256_ooo.lens[12] = 0;
+ state->hmac_sha_256_ooo.lens[13] = 0;
+ state->hmac_sha_256_ooo.lens[14] = 0;
+ state->hmac_sha_256_ooo.lens[15] = 0;
+ state->hmac_sha_256_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->hmac_sha_256_ooo.num_lanes_inuse = 0;
+ for (j = 0; j < AVX512_NUM_SHA256_LANES; j++) {
+ state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL;
+ state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80;
+ memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65,
+ 0x00,
+ 64 + 7);
+ /* hmac related */
+ p = state->hmac_sha_256_ooo.ldata[j].outer_block;
+ memset(p + 8*4 + 1,
+ 0x00,
+ 64 - 8*4 - 1 - 2);
+ p[8 * 4] = 0x80; /* 8 digest words */
+ p[64 - 2] = 0x03; /* length */
+ p[64 - 1] = 0x00;
+ }
+
+ /* Init HMAC/SHA384 out-of-order fields */
+ state->hmac_sha_384_ooo.lens[0] = 0;
+ state->hmac_sha_384_ooo.lens[1] = 0;
+ state->hmac_sha_384_ooo.lens[2] = 0;
+ state->hmac_sha_384_ooo.lens[3] = 0;
+ state->hmac_sha_384_ooo.lens[4] = 0;
+ state->hmac_sha_384_ooo.lens[5] = 0;
+ state->hmac_sha_384_ooo.lens[6] = 0;
+ state->hmac_sha_384_ooo.lens[7] = 0;
+ state->hmac_sha_384_ooo.unused_lanes = 0xF76543210;
+ for (j = 0; j < AVX512_NUM_SHA512_LANES; j++) {
+ MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo;
+
+ ctx->ldata[j].job_in_lane = NULL;
+ ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80;
+ memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1),
+ 0x00, SHA_384_BLOCK_SIZE + 7);
+ p = ctx->ldata[j].outer_block;
+ /* special end point because this length is constant */
+ memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00,
+ SHA_384_BLOCK_SIZE -
+ SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2);
+ /* mark the end */
+ p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80;
+                /* The hmac outer block length is always the same:
+                 * one whole message block holding the outer key pad
+                 * (1024 bits) plus the inner digest (384 bits),
+                 * i.e. 1408 bits == 0x0580.
+                 * The input message block needs to be converted to big endian
+                 * within the sha implementation before use.
+                 */
+ p[SHA_384_BLOCK_SIZE - 2] = 0x05;
+ p[SHA_384_BLOCK_SIZE - 1] = 0x80;
+ }
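
As a quick check of the two bytes written just above: the outer hash input is one 1024-bit block (the outer key pad) followed by the 384-bit inner digest, so the length field is 1408 == 0x0580. A tiny sketch of that arithmetic:

    #include <stdint.h>

    /* Sketch only: 1024-bit outer key block + 384-bit inner digest
     * = 1408 bits = 0x0580, placed in the last two bytes of the block. */
    static void sha384_outer_len(uint8_t *last_two)
    {
            const unsigned bits = 1024 + 384;          /* 0x0580 */

            last_two[0] = (uint8_t)(bits >> 8);        /* 0x05 */
            last_two[1] = (uint8_t)(bits & 0xFF);      /* 0x80 */
    }
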
+
+ /* Init HMAC/SHA512 out-of-order fields */
+ state->hmac_sha_512_ooo.lens[0] = 0;
+ state->hmac_sha_512_ooo.lens[1] = 0;
+ state->hmac_sha_512_ooo.lens[2] = 0;
+ state->hmac_sha_512_ooo.lens[3] = 0;
+ state->hmac_sha_512_ooo.lens[4] = 0;
+ state->hmac_sha_512_ooo.lens[5] = 0;
+ state->hmac_sha_512_ooo.lens[6] = 0;
+ state->hmac_sha_512_ooo.lens[7] = 0;
+ state->hmac_sha_512_ooo.unused_lanes = 0xF76543210;
+ for (j = 0; j < AVX512_NUM_SHA512_LANES; j++) {
+ MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo;
+
+ ctx->ldata[j].job_in_lane = NULL;
+ ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80;
+ memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1),
+ 0x00, SHA_512_BLOCK_SIZE + 7);
+ p = ctx->ldata[j].outer_block;
+ /* special end point because this length is constant */
+ memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00,
+ SHA_512_BLOCK_SIZE -
+ SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2);
+ /* mark the end */
+ p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80;
+                /* The hmac outer block length is always the same:
+                 * one whole message block holding the outer key pad
+                 * (1024 bits) plus the inner digest (512 bits),
+                 * i.e. 1536 bits == 0x0600.
+                 * The input message block needs to be converted to big endian
+                 * within the sha implementation before use.
+                 */
+ p[SHA_512_BLOCK_SIZE - 2] = 0x06;
+ p[SHA_512_BLOCK_SIZE - 1] = 0x00;
+ }
+
+ /* Init HMAC/MD5 out-of-order fields */
+ state->hmac_md5_ooo.lens[0] = 0;
+ state->hmac_md5_ooo.lens[1] = 0;
+ state->hmac_md5_ooo.lens[2] = 0;
+ state->hmac_md5_ooo.lens[3] = 0;
+ state->hmac_md5_ooo.lens[4] = 0;
+ state->hmac_md5_ooo.lens[5] = 0;
+ state->hmac_md5_ooo.lens[6] = 0;
+ state->hmac_md5_ooo.lens[7] = 0;
+ state->hmac_md5_ooo.lens[8] = 0;
+ state->hmac_md5_ooo.lens[9] = 0;
+ state->hmac_md5_ooo.lens[10] = 0;
+ state->hmac_md5_ooo.lens[11] = 0;
+ state->hmac_md5_ooo.lens[12] = 0;
+ state->hmac_md5_ooo.lens[13] = 0;
+ state->hmac_md5_ooo.lens[14] = 0;
+ state->hmac_md5_ooo.lens[15] = 0;
+ state->hmac_md5_ooo.unused_lanes = 0xFEDCBA9876543210;
+ state->hmac_md5_ooo.num_lanes_inuse = 0;
+ for (j = 0; j < AVX512_NUM_MD5_LANES; j++) {
+ state->hmac_md5_ooo.ldata[j].job_in_lane = NULL;
+
+ p = state->hmac_md5_ooo.ldata[j].extra_block;
+ size = sizeof(state->hmac_md5_ooo.ldata[j].extra_block);
+ memset (p, 0x00, size);
+ p[64] = 0x80;
+
+ p = state->hmac_md5_ooo.ldata[j].outer_block;
+ size = sizeof(state->hmac_md5_ooo.ldata[j].outer_block);
+ memset(p, 0x00, size);
+ p[4 * 4] = 0x80;
+ p[64 - 7] = 0x02;
+ p[64 - 8] = 0x80;
+ }
+
+ /* Init AES/XCBC OOO fields */
+ state->aes_xcbc_ooo.lens[0] = 0;
+ state->aes_xcbc_ooo.lens[1] = 0;
+ state->aes_xcbc_ooo.lens[2] = 0;
+ state->aes_xcbc_ooo.lens[3] = 0;
+ state->aes_xcbc_ooo.lens[4] = 0;
+ state->aes_xcbc_ooo.lens[5] = 0;
+ state->aes_xcbc_ooo.lens[6] = 0;
+ state->aes_xcbc_ooo.lens[7] = 0;
+ state->aes_xcbc_ooo.unused_lanes = 0xF76543210;
+ for (j = 0; j < 8 ; j++) {
+ state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL;
+ state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80;
+ memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15);
+ }
+
+ /* Init AES-CCM auth out-of-order fields */
+ for (j = 0; j < 8; j++) {
+ state->aes_ccm_ooo.init_done[j] = 0;
+ state->aes_ccm_ooo.lens[j] = 0;
+ state->aes_ccm_ooo.job_in_lane[j] = NULL;
+ }
+ state->aes_ccm_ooo.unused_lanes = 0xF76543210;
+
+ /* Init AES-CMAC auth out-of-order fields */
+ for (j = 0; j < 8; j++) {
+ state->aes_cmac_ooo.init_done[j] = 0;
+ state->aes_cmac_ooo.lens[j] = 0;
+ state->aes_cmac_ooo.job_in_lane[j] = NULL;
+ }
+ state->aes_cmac_ooo.unused_lanes = 0xF76543210;
+
+ /* Init "in order" components */
+ state->next_job = 0;
+ state->earliest_job = -1;
+
+ /* set handlers */
+ state->get_next_job = get_next_job_avx512;
+ state->submit_job = submit_job_avx512;
+ state->submit_job_nocheck = submit_job_nocheck_avx512;
+ state->get_completed_job = get_completed_job_avx512;
+ state->flush_job = flush_job_avx512;
+ state->queue_size = queue_size_avx512;
+ state->keyexp_128 = aes_keyexp_128_avx512;
+ state->keyexp_192 = aes_keyexp_192_avx512;
+ state->keyexp_256 = aes_keyexp_256_avx512;
+ state->cmac_subkey_gen_128 = aes_cmac_subkey_gen_avx512;
+ state->xcbc_keyexp = aes_xcbc_expand_key_avx512;
+ state->des_key_sched = des_key_schedule;
+ state->sha1_one_block = sha1_one_block_avx512;
+ state->sha1 = sha1_avx512;
+ state->sha224_one_block = sha224_one_block_avx512;
+ state->sha224 = sha224_avx512;
+ state->sha256_one_block = sha256_one_block_avx512;
+ state->sha256 = sha256_avx512;
+ state->sha384_one_block = sha384_one_block_avx512;
+ state->sha384 = sha384_avx512;
+ state->sha512_one_block = sha512_one_block_avx512;
+ state->sha512 = sha512_avx512;
+ state->md5_one_block = md5_one_block_avx512;
+ state->aes128_cfb_one = aes_cfb_128_one_avx512;
+
+ state->eea3_1_buffer = zuc_eea3_1_buffer_avx;
+ state->eea3_4_buffer = zuc_eea3_4_buffer_avx;
+ state->eea3_n_buffer = zuc_eea3_n_buffer_avx;
+ state->eia3_1_buffer = zuc_eia3_1_buffer_avx;
+
+ state->f8_1_buffer = kasumi_f8_1_buffer_avx;
+ state->f8_1_buffer_bit = kasumi_f8_1_buffer_bit_avx;
+ state->f8_2_buffer = kasumi_f8_2_buffer_avx;
+ state->f8_3_buffer = kasumi_f8_3_buffer_avx;
+ state->f8_4_buffer = kasumi_f8_4_buffer_avx;
+ state->f8_n_buffer = kasumi_f8_n_buffer_avx;
+ state->f9_1_buffer = kasumi_f9_1_buffer_avx;
+ state->f9_1_buffer_user = kasumi_f9_1_buffer_user_avx;
+ state->kasumi_init_f8_key_sched = kasumi_init_f8_key_sched_avx;
+ state->kasumi_init_f9_key_sched = kasumi_init_f9_key_sched_avx;
+ state->kasumi_key_sched_size = kasumi_key_sched_size_avx;
+
+ state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_avx2;
+ state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_avx2;
+ state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_avx2;
+ state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_avx2;
+ state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_avx2;
+ state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_avx2;
+ state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_avx2;
+ state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_avx2;
+ state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_avx2;
+ state->snow3g_init_key_sched = snow3g_init_key_sched_avx2;
+ state->snow3g_key_sched_size = snow3g_key_sched_size_avx2;
+
+ if ((state->features & IMB_FEATURE_VAES) == IMB_FEATURE_VAES) {
+ submit_job_aes_cntr_avx512 = vaes_submit_cntr_avx512;
+ submit_job_aes_cntr_bit_avx512 = vaes_submit_cntr_bit_avx512;
+ }
+#ifndef NO_GCM
+ if ((state->features & (IMB_FEATURE_VAES | IMB_FEATURE_VPCLMULQDQ)) ==
+ (IMB_FEATURE_VAES | IMB_FEATURE_VPCLMULQDQ)) {
+ state->gcm128_enc = aes_gcm_enc_128_vaes_avx512;
+ state->gcm192_enc = aes_gcm_enc_192_vaes_avx512;
+ state->gcm256_enc = aes_gcm_enc_256_vaes_avx512;
+ state->gcm128_dec = aes_gcm_dec_128_vaes_avx512;
+ state->gcm192_dec = aes_gcm_dec_192_vaes_avx512;
+ state->gcm256_dec = aes_gcm_dec_256_vaes_avx512;
+ state->gcm128_init = aes_gcm_init_128_vaes_avx512;
+ state->gcm192_init = aes_gcm_init_192_vaes_avx512;
+ state->gcm256_init = aes_gcm_init_256_vaes_avx512;
+ state->gcm128_enc_update = aes_gcm_enc_128_update_vaes_avx512;
+ state->gcm192_enc_update = aes_gcm_enc_192_update_vaes_avx512;
+ state->gcm256_enc_update = aes_gcm_enc_256_update_vaes_avx512;
+ state->gcm128_dec_update = aes_gcm_dec_128_update_vaes_avx512;
+ state->gcm192_dec_update = aes_gcm_dec_192_update_vaes_avx512;
+ state->gcm256_dec_update = aes_gcm_dec_256_update_vaes_avx512;
+ state->gcm128_enc_finalize =
+ aes_gcm_enc_128_finalize_vaes_avx512;
+ state->gcm192_enc_finalize =
+ aes_gcm_enc_192_finalize_vaes_avx512;
+ state->gcm256_enc_finalize =
+ aes_gcm_enc_256_finalize_vaes_avx512;
+ state->gcm128_dec_finalize =
+ aes_gcm_dec_128_finalize_vaes_avx512;
+ state->gcm192_dec_finalize =
+ aes_gcm_dec_192_finalize_vaes_avx512;
+ state->gcm256_dec_finalize =
+ aes_gcm_dec_256_finalize_vaes_avx512;
+ state->gcm128_precomp = aes_gcm_precomp_128_vaes_avx512;
+ state->gcm192_precomp = aes_gcm_precomp_192_vaes_avx512;
+ state->gcm256_precomp = aes_gcm_precomp_256_vaes_avx512;
+ state->gcm128_pre = aes_gcm_pre_128_vaes_avx512;
+ state->gcm192_pre = aes_gcm_pre_192_vaes_avx512;
+ state->gcm256_pre = aes_gcm_pre_256_vaes_avx512;
+
+ submit_job_aes_gcm_enc_avx512 = vaes_submit_gcm_enc_avx512;
+ submit_job_aes_gcm_dec_avx512 = vaes_submit_gcm_dec_avx512;
+ } else {
+ state->gcm128_enc = aes_gcm_enc_128_avx512;
+ state->gcm192_enc = aes_gcm_enc_192_avx512;
+ state->gcm256_enc = aes_gcm_enc_256_avx512;
+ state->gcm128_dec = aes_gcm_dec_128_avx512;
+ state->gcm192_dec = aes_gcm_dec_192_avx512;
+ state->gcm256_dec = aes_gcm_dec_256_avx512;
+ state->gcm128_init = aes_gcm_init_128_avx512;
+ state->gcm192_init = aes_gcm_init_192_avx512;
+ state->gcm256_init = aes_gcm_init_256_avx512;
+ state->gcm128_enc_update = aes_gcm_enc_128_update_avx512;
+ state->gcm192_enc_update = aes_gcm_enc_192_update_avx512;
+ state->gcm256_enc_update = aes_gcm_enc_256_update_avx512;
+ state->gcm128_dec_update = aes_gcm_dec_128_update_avx512;
+ state->gcm192_dec_update = aes_gcm_dec_192_update_avx512;
+ state->gcm256_dec_update = aes_gcm_dec_256_update_avx512;
+ state->gcm128_enc_finalize = aes_gcm_enc_128_finalize_avx512;
+ state->gcm192_enc_finalize = aes_gcm_enc_192_finalize_avx512;
+ state->gcm256_enc_finalize = aes_gcm_enc_256_finalize_avx512;
+ state->gcm128_dec_finalize = aes_gcm_dec_128_finalize_avx512;
+ state->gcm192_dec_finalize = aes_gcm_dec_192_finalize_avx512;
+ state->gcm256_dec_finalize = aes_gcm_dec_256_finalize_avx512;
+ state->gcm128_precomp = aes_gcm_precomp_128_avx512;
+ state->gcm192_precomp = aes_gcm_precomp_192_avx512;
+ state->gcm256_precomp = aes_gcm_precomp_256_avx512;
+ state->gcm128_pre = aes_gcm_pre_128_avx512;
+ state->gcm192_pre = aes_gcm_pre_192_avx512;
+ state->gcm256_pre = aes_gcm_pre_256_avx512;
+ }
+#endif
+}
+
+#include "mb_mgr_code.h"
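
For context, callers are expected to drive the manager through the handlers installed by init_mb_mgr_avx512() above rather than call the per-arch helpers directly. A minimal, hedged usage sketch based only on the fields set in this file (job setup, allocation and error handling omitted):

    #include "intel-ipsec-mb.h"

    /* Sketch only: submit one job and then drain the out-of-order lanes. */
    static void run_one_job_avx512(MB_MGR *mgr)
    {
            JOB_AES_HMAC *job;

            init_mb_mgr_avx512(mgr);

            job = mgr->get_next_job(mgr);
            /* ... fill in cipher/hash mode, keys, src/dst and lengths ... */
            job = mgr->submit_job(mgr);   /* may be NULL until lanes fill up */

            while ((job = mgr->flush_job(mgr)) != NULL)
                    ;                     /* completed jobs come back here */
    }
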
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm
new file mode 100644
index 000000000..decea625b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm
@@ -0,0 +1,524 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX R8 R9 R10 R11
+;; Windows preserves: RBX RCX RDX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX R10 R11
+;; Linux preserves: RBX RBP RSI RDI R8 R9 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31, K1-7 (K1-2 and K4-6 here but DES underneath clobbers K1-7).
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "constants.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern docsis_des_x16_enc_avx512
+extern docsis_des_x16_dec_avx512
+extern des_x16_cbc_enc_avx512
+extern des_x16_cbc_dec_avx512
+extern des3_x16_cbc_enc_avx512
+extern des3_x16_cbc_dec_avx512
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%endif
+
+%define STATE arg1
+%define JOB arg2
+
+%define IA0 arg3
+%define IA1 arg4
+%define IA2 r10
+
+%define MIN_IDX r11
+%define MIN_LEN rax
+%define LANE r11
+
+%define AVX512_NUM_DES_LANES 16
+
+%define ZTMP0 zmm0
+%define ZTMP1 zmm1
+%define ZTMP2 zmm2
+%define ZTMP3 zmm3
+%define ZTMP4 zmm4
+%define ZTMP5 zmm5
+%define ZTMP6 zmm6
+%define ZTMP7 zmm7
+%define ZTMP8 zmm8
+%define ZTMP9 zmm9
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+;;; ===========================================================================
+;;; DES/DOCSIS DES job submit
+;;; ===========================================================================
+;;; DES_DOCSIS [in] - DES, DOCSIS or 3DES cipher selection
+;;; ENC_DEC [in] - ENCrypt or DECrypt selection
+%macro GENERIC_DES_SUBMIT 2
+%define %%DES_DOCSIS %1
+%define %%ENC_DEC %2
+
+ ;; get unused lane and increment number of lanes in use
+ mov IA0, [STATE + _des_unused_lanes]
+ mov LANE, IA0
+ and LANE, 0xF ;; just a nibble
+ shr IA0, 4
+ mov [STATE + _des_unused_lanes], IA0
+ add qword [STATE + _des_lanes_in_use], 1
+
+ ;; store job info in OOO structure
+ ;; - job pointer
+ mov [STATE + _des_job_in_lane + LANE*8], JOB
+ ;; - key schedule
+%ifidn %%ENC_DEC, ENC
+ mov IA2, [JOB + _aes_enc_key_expanded]
+%else
+ mov IA2, [JOB + _aes_dec_key_expanded]
+%endif
+ mov [STATE + _des_args_keys + LANE*8], IA2
+ ;; - IV
+ mov IA2, [JOB + _iv]
+ mov DWORD(IA0), [IA2]
+ mov DWORD(IA1), [IA2 + 4]
+ mov [STATE + _des_args_IV + LANE*4], DWORD(IA0)
+ mov [STATE + _des_args_IV + LANE*4 + (AVX512_NUM_DES_LANES*4)], DWORD(IA1)
+ ;; - src pointer
+ mov IA0, [JOB + _src]
+ add IA0, [JOB + _cipher_start_src_offset_in_bytes]
+ mov [STATE + _des_args_in + LANE*8], IA0
+ ;; - destination pointer
+ mov IA1, [JOB + _dst]
+ mov [STATE + _des_args_out + LANE*8], IA1
+ ;; - length in bytes (block aligned)
+ mov IA2, [JOB + _msg_len_to_cipher_in_bytes]
+ and IA2, -8
+ mov [STATE + _des_lens + LANE*2], WORD(IA2)
+%ifidn %%DES_DOCSIS, DOCSIS
+ ;; - block length
+ mov [STATE + _des_args_BLen + LANE*4], DWORD(IA2)
+ ;; - last in
+ add IA0, IA2
+ mov [STATE + _des_args_LIn + LANE*8], IA0
+ ;; - last out
+ add IA1, IA2
+ mov [STATE + _des_args_LOut + LANE*8], IA1
+ ;; - partial length
+ mov IA2, [JOB + _msg_len_to_cipher_in_bytes]
+ and IA2, 7
+ mov [STATE + _des_args_PLen + LANE*4], DWORD(IA2)
+%endif ; DOCSIS
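+ ;;
+ ;; Rough C model (hypothetical names) of the DOCSIS split above: CBC covers
+ ;; the 8-byte-aligned prefix, and the 0..7 byte tail plus its in/out
+ ;; pointers are recorded for the DOCSIS routine to finish separately:
+ ;;
+ ;;     struct docsis_split { uint64_t full_len; uint32_t partial_len;
+ ;;                           const uint8_t *last_in; uint8_t *last_out; };
+ ;;
+ ;;     static struct docsis_split split(const uint8_t *src, uint8_t *dst,
+ ;;                                      uint64_t len)
+ ;;     {
+ ;;         struct docsis_split s;
+ ;;         s.full_len    = len & ~7ULL;         /* _des_lens, _des_args_BLen */
+ ;;         s.partial_len = (uint32_t)(len & 7); /* _des_args_PLen            */
+ ;;         s.last_in     = src + s.full_len;    /* _des_args_LIn             */
+ ;;         s.last_out    = dst + s.full_len;    /* _des_args_LOut            */
+ ;;         return s;
+ ;;     }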
+ ;; are there enough jobs to process in parallel?
+ cmp qword [STATE + _des_lanes_in_use], AVX512_NUM_DES_LANES
+ jb %%_des_submit_null_end
+ ;; schedule the processing
+ ;; - find min job size
+ vmovdqa XWORD(ZTMP0), [STATE + _des_lens + 2*0]
+ vphminposuw XWORD(ZTMP2), XWORD(ZTMP0)
+ vpextrw DWORD(MIN_LEN), XWORD(ZTMP2), 0 ; min value
+ vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index
+ vmovdqa XWORD(ZTMP1), [STATE + _des_lens + 2*8]
+ vphminposuw XWORD(ZTMP2), XWORD(ZTMP1)
+ vpextrw DWORD(IA2), XWORD(ZTMP2), 0 ; min value
+ cmp DWORD(MIN_LEN), DWORD(IA2)
+ jle %%_use_min
+ vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index
+ add DWORD(MIN_IDX), 8 ; but index +8
+ mov MIN_LEN, IA2 ; min len
+%%_use_min:
+ cmp MIN_LEN, 0
+ je %%_len_is_0
+
+ vpbroadcastw XWORD(ZTMP3), WORD(MIN_LEN)
+ vpsubw XWORD(ZTMP0), XWORD(ZTMP0), XWORD(ZTMP3)
+ vmovdqa [STATE + _des_lens + 2*0], XWORD(ZTMP0)
+ vpsubw XWORD(ZTMP1), XWORD(ZTMP1), XWORD(ZTMP3)
+ vmovdqa [STATE + _des_lens + 2*8], XWORD(ZTMP1)
+
+ push MIN_IDX
+ mov arg2, MIN_LEN
+%ifidn %%ENC_DEC, ENC
+ ;; encrypt
+%ifidn %%DES_DOCSIS, DOCSIS
+ call docsis_des_x16_enc_avx512
+%endif
+%ifidn %%DES_DOCSIS, DES
+ call des_x16_cbc_enc_avx512
+%endif
+%ifidn %%DES_DOCSIS, 3DES
+ call des3_x16_cbc_enc_avx512
+%endif
+%else ; ENC
+ ;; decrypt
+%ifidn %%DES_DOCSIS, DOCSIS
+ call docsis_des_x16_dec_avx512
+%endif
+%ifidn %%DES_DOCSIS, DES
+ call des_x16_cbc_dec_avx512
+%endif
+%ifidn %%DES_DOCSIS, 3DES
+ call des3_x16_cbc_dec_avx512
+%endif
+%endif ; DEC
+ pop MIN_IDX
+ jmp %%_des_submit_end
+
+%%_des_submit_null_end:
+ xor rax, rax
+ jmp %%_des_submit_return
+
+%%_len_is_0:
+%ifidn %%DES_DOCSIS, DOCSIS
+ cmp dword [STATE + _des_args_PLen + MIN_IDX*4], 0
+ jz %%_des_submit_end
+ push MIN_IDX
+ xor arg2, arg2 ; len is 0
+%ifidn %%ENC_DEC, ENC
+ call docsis_des_x16_enc_avx512
+%else ; ENC
+ call docsis_des_x16_dec_avx512
+%endif ; DEC
+ pop MIN_IDX
+%endif ; DOCSIS
+ ;; fall through
+%%_des_submit_end:
+ ;; return a job
+ ;; - decrement number of jobs in use
+ sub qword [STATE + _des_lanes_in_use], 1
+ ;; - put the lane back to free lanes pool
+ mov IA0, [STATE + _des_unused_lanes]
+ shl IA0, 4
+ or IA0, MIN_IDX
+ mov [STATE + _des_unused_lanes], IA0
+ ;; - mark job as complete
+ ;; - clear job pointer
+ mov rax, [STATE + _des_job_in_lane + MIN_IDX*8]
+ mov qword [STATE + _des_job_in_lane + MIN_IDX*8], 0
+ or dword [rax + _status], STS_COMPLETED_AES
+
+%ifdef SAFE_DATA
+ ;; Clear IV
+ mov dword [STATE + _des_args_IV + MIN_IDX*4], 0
+ mov dword [STATE + _des_args_IV + MIN_IDX*4 + (AVX512_NUM_DES_LANES*4)], 0
+%endif
+ vzeroupper
+%%_des_submit_return:
+%endmacro
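+
+;;; The scheduler above picks the shortest outstanding job with vphminposuw
+;;; (8 words per XMM register, so the 16 lane lengths are processed as two
+;;; halves and the two minima compared), then subtracts that minimum from
+;;; every lane. A scalar C sketch of the same idea (illustrative only):
+;;;
+;;;     static unsigned schedule_min(uint16_t lens[16], uint16_t *min_len)
+;;;     {
+;;;         unsigned idx = 0;
+;;;         for (unsigned i = 1; i < 16; i++)   /* two vphminposuw halves  */
+;;;             if (lens[i] < lens[idx])        /* plus a compare in asm   */
+;;;                 idx = i;
+;;;         *min_len = lens[idx];
+;;;         for (unsigned i = 0; i < 16; i++)   /* vpsubw on both halves   */
+;;;             lens[i] = (uint16_t)(lens[i] - *min_len);
+;;;         return idx;
+;;;     }
+;;;
+;;; The x16 core is then run for exactly min_len bytes on all lanes.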
+
+;;; ===========================================================================
+;;; DES/DOCSIS DES flush
+;;; ===========================================================================
+;;; DES_DOCSIS [in] - DES, DOCSIS or 3DES cipher selection
+;;; ENC_DEC [in] - ENCrypt or DECrypt selection
+;;;
+;;; Clobbers k1, k2, k4, k5 and k6
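+;;;
+;;; Before calling the x16 cipher, idle (NULL-job) lanes are pointed at a live
+;;; lane's buffers and their lengths forced to 0xFFFF so they never win the
+;;; minimum-length search. Illustrative C sketch (hypothetical names; IV and
+;;; DOCSIS partial-length duplication omitted):
+;;;
+;;;     static void fill_idle_lanes(const void *in[16], void *out[16],
+;;;                                 const void *keys[16], uint16_t lens[16],
+;;;                                 void * const job_in_lane[16])
+;;;     {
+;;;         unsigned src = 0;
+;;;         while (job_in_lane[src] == NULL)  /* bsf on the non-null mask   */
+;;;             src++;
+;;;         for (unsigned i = 0; i < 16; i++) {
+;;;             if (job_in_lane[i] != NULL)
+;;;                 continue;                 /* live lanes stay untouched  */
+;;;             in[i]   = in[src];
+;;;             out[i]  = out[src];
+;;;             keys[i] = keys[src];
+;;;             lens[i] = 0xFFFF;             /* never the minimum          */
+;;;         }
+;;;     }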
+%macro GENERIC_DES_FLUSH 2
+%define %%DES_DOCSIS %1
+%define %%ENC_DEC %2
+
+ cmp qword [STATE + _des_lanes_in_use], 0
+ je %%_des_flush_null_end
+
+ ;; find non-null job
+ vpxord ZTMP0, ZTMP0, ZTMP0
+ vmovdqu64 ZTMP1, [STATE + _des_job_in_lane + (0*PTR_SZ)]
+ vmovdqu64 ZTMP2, [STATE + _des_job_in_lane + (8*PTR_SZ)]
+ vpcmpq k1, ZTMP1, ZTMP0, 4 ; NEQ
+ vpcmpq k2, ZTMP2, ZTMP0, 4 ; NEQ
+ xor IA0, IA0
+ xor IA1, IA1
+ kmovw DWORD(IA0), k1
+ kmovw DWORD(IA1), k2
+ mov DWORD(IA2), DWORD(IA1)
+ shl DWORD(IA2), 8
+ or DWORD(IA2), DWORD(IA0) ; mask of non-null jobs in IA2
+ not BYTE(IA0)
+ kmovw k4, DWORD(IA0)
+ not BYTE(IA1)
+ kmovw k5, DWORD(IA1)
+ mov DWORD(IA0), DWORD(IA2)
+ not WORD(IA0)
+ kmovw k6, DWORD(IA0) ; mask of NULL jobs in k4, k5 and k6
+ mov DWORD(IA0), DWORD(IA2)
+ xor IA2, IA2
+ bsf WORD(IA2), WORD(IA0) ; index of the 1st set bit in IA2
+
+ ;; copy good lane data into NULL lanes
+ ;; - k1(L8)/k2(H8) - masks of non-null jobs
+ ;; - k4(L8)/k5(H8)/k6 - masks of NULL jobs
+ ;; - IA2 index of 1st non-null job
+
+ ;; - in pointer
+ mov IA0, [STATE + _des_args_in + IA2*8]
+ vpbroadcastq ZTMP1, IA0
+ vmovdqu64 [STATE + _des_args_in + (0*PTR_SZ)]{k4}, ZTMP1
+ vmovdqu64 [STATE + _des_args_in + (8*PTR_SZ)]{k5}, ZTMP1
+ ;; - out pointer
+ mov IA0, [STATE + _des_args_out + IA2*8]
+ vpbroadcastq ZTMP1, IA0
+ vmovdqu64 [STATE + _des_args_out + (0*PTR_SZ)]{k4}, ZTMP1
+ vmovdqu64 [STATE + _des_args_out + (8*PTR_SZ)]{k5}, ZTMP1
+ ;; - key schedule
+ mov IA0, [STATE + _des_args_keys + IA2*8]
+ vpbroadcastq ZTMP1, IA0
+ vmovdqu64 [STATE + _des_args_keys + (0*PTR_SZ)]{k4}, ZTMP1
+ vmovdqu64 [STATE + _des_args_keys + (8*PTR_SZ)]{k5}, ZTMP1
+ ;; - zero partial len
+ vmovdqu32 [STATE + _des_args_PLen]{k6}, ZTMP0
+ ;; - set len to UINT16_MAX
+ mov WORD(IA0), 0xffff
+ vpbroadcastw ZTMP1, WORD(IA0)
+ vmovdqu16 [STATE + _des_lens]{k6}, ZTMP1
+
+ ;; - IV
+ mov DWORD(IA0), [STATE + _des_args_IV + IA2*4]
+ mov DWORD(IA1), [STATE + _des_args_IV + IA2*4 + (16*4)]
+ vpbroadcastd ZTMP1, DWORD(IA0)
+ vpbroadcastd ZTMP2, DWORD(IA1)
+ vmovdqu32 [STATE + _des_args_IV]{k6}, ZTMP1
+ vmovdqu32 [STATE + _des_args_IV + (16*4)]{k6}, ZTMP2
+
+ ;; schedule the processing
+ ;; - find min job size
+ vmovdqa XWORD(ZTMP0), [STATE + _des_lens + 2*0]
+ vphminposuw XWORD(ZTMP2), XWORD(ZTMP0)
+ vpextrw DWORD(MIN_LEN), XWORD(ZTMP2), 0 ; min value
+ vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index
+ vmovdqa XWORD(ZTMP1), [STATE + _des_lens + 2*8]
+ vphminposuw XWORD(ZTMP2), XWORD(ZTMP1)
+ vpextrw DWORD(IA2), XWORD(ZTMP2), 0 ; min value
+ cmp DWORD(MIN_LEN), DWORD(IA2)
+ jle %%_use_min
+ vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index
+ add DWORD(MIN_IDX), 8 ; but index +8
+ mov MIN_LEN, IA2 ; min len
+%%_use_min:
+ vpbroadcastw XWORD(ZTMP3), WORD(MIN_LEN)
+ vpsubw XWORD(ZTMP0), XWORD(ZTMP0), XWORD(ZTMP3)
+ vmovdqa [STATE + _des_lens + 2*0], XWORD(ZTMP0)
+ vpsubw XWORD(ZTMP1), XWORD(ZTMP1), XWORD(ZTMP3)
+ vmovdqa [STATE + _des_lens + 2*8], XWORD(ZTMP1)
+
+ push MIN_IDX
+%ifdef SAFE_DATA
+ ;; Save k6, which may be clobbered by following functions
+ kmovq IA0, k6
+ push IA0
+%endif
+
+ mov arg2, MIN_LEN
+%ifidn %%ENC_DEC, ENC
+ ;; encrypt
+%ifidn %%DES_DOCSIS, DOCSIS
+ call docsis_des_x16_enc_avx512
+%endif
+%ifidn %%DES_DOCSIS, DES
+ call des_x16_cbc_enc_avx512
+%endif
+%ifidn %%DES_DOCSIS, 3DES
+ call des3_x16_cbc_enc_avx512
+%endif
+%else ; ENC
+ ;; decrypt
+%ifidn %%DES_DOCSIS, DOCSIS
+ call docsis_des_x16_dec_avx512
+%endif
+%ifidn %%DES_DOCSIS, DES
+ call des_x16_cbc_dec_avx512
+%endif
+%ifidn %%DES_DOCSIS, 3DES
+ call des3_x16_cbc_dec_avx512
+%endif
+%endif ; DEC
+%ifdef SAFE_DATA
+ ;; Restore k6, which may have been clobbered by previous functions
+ pop IA0
+ kmovq k6, IA0
+%endif
+ pop MIN_IDX
+ jmp %%_des_flush_end
+
+%%_des_flush_null_end:
+ xor rax, rax
+ jmp %%_des_flush_return
+%%_des_flush_end:
+ ;; return a job
+ ;; - decrement number of jobs in use
+ sub qword [STATE + _des_lanes_in_use], 1
+ ;; - put the lane back to free lanes pool
+ mov IA0, [STATE + _des_unused_lanes]
+ shl IA0, 4
+ or IA0, MIN_IDX
+ mov [STATE + _des_unused_lanes], IA0
+ ;; - mark job as complete
+ mov rax, [STATE + _des_job_in_lane + MIN_IDX*8]
+ or dword [rax + _status], STS_COMPLETED_AES
+ ;; - clear job pointer
+ mov qword [STATE + _des_job_in_lane + MIN_IDX*8], 0
+%ifdef SAFE_DATA
+ ; Set bit of lane of returned job
+ xor DWORD(IA0), DWORD(IA0)
+ bts DWORD(IA0), DWORD(MIN_IDX)
+ kmovd k1, DWORD(IA0)
+ kord k6, k1, k6
+
+ ;; Clear IV of returned job and "NULL lanes" (k6 contains the mask of the jobs)
+ vpxorq ZTMP1, ZTMP1
+ vmovdqa32 [STATE + _des_args_IV]{k6}, ZTMP1
+ vmovdqa32 [STATE + _des_args_IV + (16*4)]{k6}, ZTMP1
+%endif
+%%_des_flush_return:
+ vzeroupper
+%endmacro
+
+;;; ========================================================
+;;; DATA
+
+section .data
+default rel
+
+;;; ========================================================
+;;; CODE
+section .text
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : job
+align 64
+MKGLOBAL(submit_job_des_cbc_enc_avx512,function,internal)
+submit_job_des_cbc_enc_avx512:
+ GENERIC_DES_SUBMIT DES, ENC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : job
+align 64
+MKGLOBAL(submit_job_des_cbc_dec_avx512,function,internal)
+submit_job_des_cbc_dec_avx512:
+ GENERIC_DES_SUBMIT DES, DEC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : job
+align 64
+MKGLOBAL(submit_job_docsis_des_enc_avx512,function,internal)
+submit_job_docsis_des_enc_avx512:
+ GENERIC_DES_SUBMIT DOCSIS, ENC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : job
+align 64
+MKGLOBAL(submit_job_docsis_des_dec_avx512,function,internal)
+submit_job_docsis_des_dec_avx512:
+ GENERIC_DES_SUBMIT DOCSIS, DEC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : job
+align 64
+MKGLOBAL(submit_job_3des_cbc_enc_avx512,function,internal)
+submit_job_3des_cbc_enc_avx512:
+ GENERIC_DES_SUBMIT 3DES, ENC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : job
+align 64
+MKGLOBAL(submit_job_3des_cbc_dec_avx512,function,internal)
+submit_job_3des_cbc_dec_avx512:
+ GENERIC_DES_SUBMIT 3DES, DEC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+align 64
+MKGLOBAL(flush_job_des_cbc_enc_avx512,function,internal)
+flush_job_des_cbc_enc_avx512:
+ GENERIC_DES_FLUSH DES, ENC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+align 64
+MKGLOBAL(flush_job_des_cbc_dec_avx512,function,internal)
+flush_job_des_cbc_dec_avx512:
+ GENERIC_DES_FLUSH DES, DEC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+align 64
+MKGLOBAL(flush_job_docsis_des_enc_avx512,function,internal)
+flush_job_docsis_des_enc_avx512:
+ GENERIC_DES_FLUSH DOCSIS, ENC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+align 64
+MKGLOBAL(flush_job_docsis_des_dec_avx512,function,internal)
+flush_job_docsis_des_dec_avx512:
+ GENERIC_DES_FLUSH DOCSIS, DEC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+align 64
+MKGLOBAL(flush_job_3des_cbc_enc_avx512,function,internal)
+flush_job_3des_cbc_enc_avx512:
+ GENERIC_DES_FLUSH 3DES, ENC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+align 64
+MKGLOBAL(flush_job_3des_cbc_dec_avx512,function,internal)
+flush_job_3des_cbc_dec_avx512:
+ GENERIC_DES_FLUSH 3DES, DEC
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_flush_avx512.asm
new file mode 100644
index 000000000..5fa08053f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_flush_avx512.asm
@@ -0,0 +1,367 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11
+;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RSI RDI R8 R9 R10 R11
+;; Linux preserves: RBX RCX RDX RBP R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+;; %define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha1_x16_avx512
+
+section .data
+default rel
+
+align 16
+byteswap:
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
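+;; len_masks[i]: OR-ing row i into the packed 16-bit length vector forces
+;; lane i's length to 0xFFFF, so an idle lane can never be picked as the
+;; minimum by vphminposuw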
+align 32
+len_masks:
+ dq 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000
+
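+;; per-lane index constants: the cmovne used to find a non-null lane needs a
+;; memory (or register) source, so candidate lane numbers live here rather
+;; than as immediates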
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in rbx, rdi, rbp
+%define idx rbp
+
+%define unused_lanes r9
+%define lane_data r9
+%define tmp2 r9
+%define num_lanes_inuse r12
+%define len_upper r13
+%define idx_upper r14
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%endif
+
+; we clobber rbp, called routine clobbers r12-r15
+struc STACK
+_gpr_save: resq 5
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state)
+; arg 1 : rcx : state
+MKGLOBAL(flush_job_hmac_avx512,function,internal)
+flush_job_hmac_avx512:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32 ; align stack to 32 byte boundary
+ mov [rsp + _gpr_save + 8*0], rbp
+ mov [rsp + _gpr_save + 8*1], r12
+ mov [rsp + _gpr_save + 8*2], r13
+ mov [rsp + _gpr_save + 8*3], r14
+ mov [rsp + _gpr_save + 8*4], r15
+ mov [rsp + _rsp_save], rax
+
+ DBGPRINTL "---------- start hmac flush avx512 -----------"
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_sha1] ;empty?
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ cmovne idx, [rel APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+copy_lane_data:
+ ; copy valid lane (idx) to empty lanes
+ vmovdqa ymm0, [state + _lens]
+ mov tmp, [state + _args_data_ptr + PTR_SZ*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr + PTR_SZ*I], tmp
+ vpor ymm0, ymm0, [rel len_masks + 32*I] ; 32 for ymm, 16 for xmm
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+ vmovdqa [state + _lens], ymm0
+
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+
+ vmovdqa xmm2, [state + _lens + 8*2]
+ vphminposuw xmm3, xmm2
+ vpextrw DWORD(len_upper), xmm3, 0 ; min value
+ vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F)
+
+ cmp len2, len_upper
+ jle use_min
+
+ vmovdqa xmm1, xmm3
+ mov len2, len_upper
+ mov idx, idx_upper ; idx would be in range 0..7
+ add idx, 8 ; to reflect that index is in 8..F range
+
+use_min:
+ DBGPRINTL64 "FLUSH min_length", len2
+ DBGPRINTL64 "FLUSH min_length index ", idx
+ cmp len2, 0
+ je len_is_0
+
+ vpbroadcastw xmm1, xmm1
+ DBGPRINTL_XMM "FLUSH lens after shuffle", xmm1
+
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens], xmm0
+ vpsubw xmm2, xmm2, xmm1
+ vmovdqa [state + _lens + 8*2], xmm2
+ DBGPRINTL_XMM "FLUSH lens immediately after min subtraction (0..7)", xmm0
+ DBGPRINTL_XMM "FLUSH lens immediately after min subtraction (8..F)", xmm2
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
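+ ;; Each lane walks a small three-phase state machine from len_is_0 above.
+ ;; Illustrative C sketch of the dispatch (hypothetical names):
+ ;;
+ ;;     enum phase { INNER_TAIL, OUTER, DONE };
+ ;;
+ ;;     static enum phase next_phase(unsigned extra_blocks, unsigned outer_done)
+ ;;     {
+ ;;         if (extra_blocks)   /* trailing inner-hash block(s) still queued */
+ ;;             return INNER_TAIL;              /* -> proc_extra_blocks      */
+ ;;         if (!outer_done)    /* inner hash done, outer block not hashed   */
+ ;;             return OUTER;                   /* -> proc_outer (below)     */
+ ;;         return DONE;                        /* -> end_loop, job returned */
+ ;;     }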
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+
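+ ;; args_digest is stored transposed (word-major): word W of lane L lives at
+ ;; [_args_digest + SHA1_DIGEST_WORD_SIZE*L + W*SHA1_DIGEST_ROW_SIZE], hence
+ ;; the strided gather below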
+ vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov [lane_data + _outer_block + 4*4], DWORD(tmp)
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ DBGPRINTL "FLUSH *** ---------- return null"
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4 ;; a nibble
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse_sha1], 1
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(r12), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(r12)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(r12)
+
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret
+
+ ;; copy remaining 8 bytes to return 20 byte digest
+ mov DWORD(r13), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(r14), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(r13)
+ bswap DWORD(r14)
+ mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(r13)
+ mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(r14)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxorq zmm0, zmm0
+
+ ;; Clear digest (20B), outer_block (20B) and extra_block (64B)
+ ;; of returned job and NULL jobs
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 0*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 1*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 2*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 3*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+ lea lane_data, [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size)]
+
+ ;; Clear first 64 bytes of extra_block
+ vmovdqu64 [lane_data + _extra_block], zmm0
+
+ ;; Clear first 20 bytes of outer_block
+ vmovdqu64 [lane_data + _outer_block], xmm0
+ mov dword [lane_data + _outer_block + 16], 0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+ DBGPRINTL "---------- exit hmac flush avx512 -----------"
+ vzeroupper
+
+ mov rbp, [rsp + _gpr_save + 8*0]
+ mov r12, [rsp + _gpr_save + 8*1]
+ mov r13, [rsp + _gpr_save + 8*2]
+ mov r14, [rsp + _gpr_save + 8*3]
+ mov r15, [rsp + _gpr_save + 8*4]
+ mov rsp, [rsp + _rsp_save]
+ ret
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_flush_avx512.asm
new file mode 100644
index 000000000..656e854d5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_flush_avx512.asm
@@ -0,0 +1,28 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+%define SHA224
+%include "avx512/mb_mgr_hmac_sha_256_flush_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_submit_avx512.asm
new file mode 100644
index 000000000..60a98918a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_submit_avx512.asm
@@ -0,0 +1,28 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+%define SHA224
+%include "avx512/mb_mgr_hmac_sha_256_submit_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_flush_avx512.asm
new file mode 100644
index 000000000..023eb3454
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_flush_avx512.asm
@@ -0,0 +1,433 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11
+;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11
+;; Linux preserves: RBX RBP R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+;; %define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha256_x16_avx512
+
+section .data
+default rel
+align 16
+byteswap:
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+align 32
+len_masks:
+ dq 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000
+
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp, r15
+%define idx rbp
+
+%define unused_lanes r10
+%define tmp5 r10
+
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 arg3
+%define tmp r9
+
+%define len_upper r13
+%define idx_upper r14
+
+
+; we clobber rsi, rbp; called routine also clobbers rax, r9 to r15
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_sha_224_avx512(MB_MGR_HMAC_SHA_256_OOO *state)
+; JOB* flush_job_hmac_sha_256_avx512(MB_MGR_HMAC_SHA_256_OOO *state)
+; arg 1 : state
+align 32
+%ifdef SHA224
+MKGLOBAL(flush_job_hmac_sha_224_avx512,function,internal)
+flush_job_hmac_sha_224_avx512:
+%else
+MKGLOBAL(flush_job_hmac_sha_256_avx512,function,internal)
+flush_job_hmac_sha_256_avx512:
+%endif
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ; if no lanes are in use, there is nothing to flush - return NULL
+ cmp dword [state + _num_lanes_inuse_sha256], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata_sha256 + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ cmovne idx, [rel APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+copy_lane_data:
+ ; copy valid lane (idx) data to empty lanes
+ vmovdqa ymm0, [state + _lens_sha256]
+ mov tmp, [state + _args_data_ptr_sha256 + PTR_SZ*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*I], tmp
+ vpor ymm0, ymm0, [rel len_masks + 32*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens_sha256 ], ymm0
+
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+
+ vmovdqa xmm2, [state + _lens_sha256 + 8*2]
+ vphminposuw xmm3, xmm2
+ vpextrw DWORD(len_upper), xmm3, 0 ; min value
+ vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F)
+
+ cmp len2, len_upper
+ jle use_min
+
+ vmovdqa xmm1, xmm3
+ mov len2, len_upper
+ mov idx, idx_upper ; idx would be in range 0..7
+ add idx, 8 ; to reflect that index is in 8..F range
+
+use_min:
+ cmp len2, 0
+ je len_is_0
+
+ vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens_sha256], xmm0
+ vpsubw xmm2, xmm2, xmm1
+ vmovdqa [state + _lens_sha256 + 8*2], xmm2
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens_sha256 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+
+ vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+ vpshufb xmm1, xmm1, [rel byteswap]
+
+ vmovdqa [lane_data + _outer_block], xmm0
+ vmovdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ mov dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+ mov job, [lane_data + _job_in_lane]
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ sub dword [state + _num_lanes_inuse_sha256], 1
+
+ mov p, [job_rax + _auth_tag_output]
+
+%ifdef SHA224
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16
+ jne copy_full_digest
+%endif
+
+ ;; copy SHA224 14 bytes / SHA256 16 bytes
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp5)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp4)
+%ifdef SHA224
+ mov [p + 3*4], WORD(tmp5)
+%else
+ mov [p + 3*4], DWORD(tmp5)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy SHA224 28 bytes / SHA256 32 bytes
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp5)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp4)
+ mov [p + 3*4], DWORD(tmp5)
+
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+%ifndef SHA224
+ bswap DWORD(tmp5)
+%endif
+ mov [p + 4*4], DWORD(tmp)
+ mov [p + 5*4], DWORD(tmp2)
+ mov [p + 6*4], DWORD(tmp4)
+%ifndef SHA224
+ mov [p + 7*4], DWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxorq zmm0, zmm0
+
+ ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B)
+ ;; of returned job and NULL jobs
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest (28 bytes for SHA-224, 32 bytes for SHA-256)
+%assign J 0
+%rep 7
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+ lea lane_data, [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size)]
+ ;; Clear first 64 bytes of extra_block
+ vmovdqu64 [lane_data + _extra_block], zmm0
+
+ ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+%ifdef SHA224
+ vmovdqa64 [lane_data + _outer_block], xmm0
+ mov qword [lane_data + _outer_block + 16], 0
+ mov dword [lane_data + _outer_block + 24], 0
+%else
+ vmovdqu64 [lane_data + _outer_block], ymm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+ vzeroupper
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_submit_avx512.asm
new file mode 100644
index 000000000..baadef492
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_submit_avx512.asm
@@ -0,0 +1,445 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11
+;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11
+;; Linux preserves: RBX RBP R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+
+;; %define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha256_x16_avx512
+
+section .data
+default rel
+
+align 16
+byteswap:
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi
+%define arg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in rbp, r15
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define p2 rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset arg3
+%define tmp2 arg3
+
+%define lane arg4
+%define tmp3 arg4
+
+%define extra_blocks r8
+%define tmp r9
+%define lane_data r10
+
+%define len_upper r13
+%define idx_upper r14
+
+; we clobber rbx, rsi, rdi, rbp; called routine also clobbers r9 to r15
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+ align 32
+%ifdef SHA224
+MKGLOBAL(submit_job_hmac_sha_224_avx512,function,internal)
+submit_job_hmac_sha_224_avx512:
+%else
+MKGLOBAL(submit_job_hmac_sha_256_avx512,function,internal)
+submit_job_hmac_sha_256_avx512:
+%endif
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ mov lane, unused_lanes
+ and lane, 0xF ;; just a nibble
+ shr unused_lanes, 4
+
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ add dword [state + _num_lanes_inuse_sha256], 1
+
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+ mov [state + _lens_sha256 + 2*lane], WORD(tmp)
+
+ mov last_len, len
+ and last_len, 63
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*lane], p
+
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ vmovdqu32 zmm0, [p - 64 + len]
+ vmovdqu32 [lane_data + _extra_block], zmm0
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
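+ ;; The bookkeeping above lays out the tail of the message inside extra_block.
+ ;; A C-style sketch of the same arithmetic, given len = _msg_len_to_hash_in_bytes
+ ;; (illustrative only; __builtin_bswap64 stands in for the bswap instruction):
+ ;;
+ ;;     uint64_t last_len     = len & 63;             /* bytes past last full block */
+ ;;     uint64_t extra_blocks = (last_len + 9 + 63) >> 6;  /* room for 0x80 + 8B len */
+ ;;     uint64_t start_offset = 64 - last_len;        /* tail copied to end at 64   */
+ ;;     uint64_t size_offset  = extra_blocks * 64 - last_len + 56;
+ ;;     uint64_t bit_count_be = __builtin_bswap64(8 * (len + 64)); /* + ipad block  */
+ ;;
+ ;; i.e. the big-endian bit count lands in the last 8 bytes of the final extra
+ ;; block, counting the 64-byte ipad block that was hashed first.
+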
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp]
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp dword [state + _num_lanes_inuse_sha256], 0x10 ; all 16 lanes used?
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_sha256]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+
+ vmovdqa xmm2, [state + _lens_sha256 + 8*2]
+ vphminposuw xmm3, xmm2
+ vpextrw DWORD(len_upper), xmm3, 0 ; min value
+ vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F)
+
+ cmp len2, len_upper
+ jle use_min
+
+ vmovdqa xmm1, xmm3
+ mov len2, len_upper
+ mov idx, idx_upper ; idx is in range 0..7
+ add idx, 8 ; to reflect that real index is in 8..F range
+use_min:
+ cmp len2, 0
+ je len_is_0
+
+ vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens_sha256 + 0*2], xmm0
+ vpsubw xmm2, xmm2, xmm1
+ vmovdqa [state + _lens_sha256 + 8*2], xmm2
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens_sha256 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+
+ vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+ vpshufb xmm1, xmm1, [rel byteswap]
+ vmovdqa [lane_data + _outer_block], xmm0
+ vmovdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ mov dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+copy_lt64:
+ ;; less than one message block of data
+ ;; copy the whole message from the source into extra_block so that it
+ ;; ends right before the pre-populated 0x80 padding byte (i.e. at offset 64)
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_avx2_64_1 p2, p, len, tmp, tmp2, ymm0, ymm1
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha256], unused_lanes
+ sub dword [state + _num_lanes_inuse_sha256], 1
+
+ vzeroupper
+
+ mov p, [job_rax + _auth_tag_output]
+
+%ifdef SHA224
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16
+ jne copy_full_digest
+%endif
+
+ ;; copy 14 bytes for SHA224 // 16 bytes for SHA256
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+%ifdef SHA224
+ mov [p + 3*4], WORD(tmp4)
+%else
+ mov [p + 3*4], DWORD(tmp4)
+%endif
+ jmp clear_ret
+copy_full_digest:
+ ;; copy 28 bytes for SHA224 // 32 bytes for SHA256
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+ mov [p + 3*4], DWORD(tmp4)
+
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+%ifndef SHA224
+ bswap DWORD(tmp4)
+%endif
+ mov [p + 4*4], DWORD(tmp)
+ mov [p + 5*4], DWORD(tmp2)
+ mov [p + 6*4], DWORD(tmp3)
+%ifndef SHA224
+ mov [p + 7*4], DWORD(tmp4)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B) of returned job
+%assign J 0
+%rep 7
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+ vpxorq zmm0, zmm0
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ ;; Clear first 64 bytes of extra_block
+ vmovdqu64 [lane_data + _extra_block], zmm0
+
+ ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+%ifdef SHA224
+ vmovdqa64 [lane_data + _outer_block], xmm0
+ mov qword [lane_data + _outer_block + 16], 0
+ mov dword [lane_data + _outer_block + 24], 0
+%else
+ vmovdqu64 [lane_data + _outer_block], ymm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_flush_avx512.asm
new file mode 100644
index 000000000..698052730
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_flush_avx512.asm
@@ -0,0 +1,29 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define SHA384
+%include "avx512/mb_mgr_hmac_sha_512_flush_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_submit_avx512.asm
new file mode 100644
index 000000000..0e9f611de
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_submit_avx512.asm
@@ -0,0 +1,29 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define SHA384
+%include "avx512/mb_mgr_hmac_sha_512_submit_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_flush_avx512.asm
new file mode 100644
index 000000000..7d7e56b40
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_flush_avx512.asm
@@ -0,0 +1,384 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, R12-R15
+;;
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha512_x8_avx512
+
+section .data
+default rel
+
+align 16
+dupw: ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+align 16
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define tmp5 r9
+%define tmp6 r10
+
+struc STACK
+_gpr_save: resq 7 ; rbx, rbp, r12-r15, rdi (windows)
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+%ifndef SHA384
+; JOB* flush_job_hmac_sha_512_avx512(MB_MGR_HMAC_SHA_512_OOO *state)
+; arg 1 : state
+%define SHA_X_DIGEST_SIZE 512
+MKGLOBAL(flush_job_hmac_sha_512_avx512,function,internal)
+align 64
+flush_job_hmac_sha_512_avx512:
+%else
+; JOB* flush_job_hmac_sha_384_avx512(MB_MGR_HMAC_SHA_512_OOO *state)
+; arg 1 : state
+%define SHA_X_DIGEST_SIZE 384
+MKGLOBAL(flush_job_hmac_sha_384_avx512,function,internal)
+align 64
+flush_job_hmac_sha_384_avx512:
+%endif
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha512]
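+        ;; unused_lanes packs free lane indices 4 bits each; with no jobs in flight the top (sentinel) nibble still occupies bits 32-35, so bit 35 set means there is nothing to flush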
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
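+        ;; (cmovne has no immediate form, so the lane_1..lane_7 qword table provides its memory source)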
+ xor idx, idx
+%assign I 1
+%rep 7
+ cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+ cmovne idx, [rel APPEND(lane_, I)]
+%assign I (I+1)
+%endrep
+
+copy_lane_data:
+ ; copy good lane (idx) to empty lanes
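+        ;; each empty lane is pointed at the good lane's data and its 16-bit length is forced to 0xFFFF (len_masks), so the minimum search below never selects it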
+ vmovdqa xmm0, [state + _lens_sha512]
+ mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp
+ vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens_sha512], xmm0
+
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0
+
+ vpshufb xmm1, [rel dupw] ; duplicate words across all 8 lanes
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens_sha512], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_x8_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done_sha512], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done_sha512], 1
+ mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+ mov qword [lane_data + _extra_block_sha512 + size_offset], 0
+ mov word [state + _lens_sha512 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block_sha512]
+ mov job, [lane_data + _job_in_lane_sha512]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+
+ ; move digest into data location
+ %assign I 0
+ %rep (SHA_X_DIGEST_SIZE / (8*16))
+ vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE]
+ vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+ vpshufb xmm0, [rel byteswap]
+ vmovdqa [lane_data + _outer_block_sha512 + I*2*SHA512_DIGEST_WORD_SIZE], xmm0
+ %assign I (I+1)
+ %endrep
+
+ ; move the opad key into digest
+ mov tmp, [job + _auth_key_xor_opad]
+
+ %assign I 0
+ %rep 4
+ vmovdqu xmm0, [tmp + I * 16]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+ %assign I (I+1)
+ %endrep
+
+ jmp copy_lane_data
+
+ align 32
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+ mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks_sha512], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 32
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane_sha512]
+ mov qword [lane_data + _job_in_lane_sha512], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha512], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+%if (SHA_X_DIGEST_SIZE != 384)
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+ jne copy_full_digest
+%endif
+ ;; copy 32 bytes for SHA512 / 24 bytes for SHA384
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+ bswap QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp5)
+%endif
+ mov [p + 0*8], QWORD(tmp2)
+ mov [p + 1*8], QWORD(tmp4)
+ mov [p + 2*8], QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 3*8], QWORD(tmp5)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 64 bytes for SHA512 / 48 bytes for SHA384
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+ bswap QWORD(tmp6)
+ bswap QWORD(tmp5)
+ mov [p + 0*8], QWORD(tmp2)
+ mov [p + 1*8], QWORD(tmp4)
+ mov [p + 2*8], QWORD(tmp6)
+ mov [p + 3*8], QWORD(tmp5)
+
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp6)
+ bswap QWORD(tmp5)
+%endif
+ mov [p + 4*8], QWORD(tmp2)
+ mov [p + 5*8], QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 6*8], QWORD(tmp6)
+ mov [p + 7*8], QWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxorq zmm0, zmm0
+
+ ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size) + _job_in_lane_sha512], 0
+ jne APPEND(skip_clear_,I)
+
+        ;; Clear digest (48 bytes for SHA-384, 64 bytes for SHA-512)
+%assign J 0
+%rep 6
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 6*SHA512_DIGEST_ROW_SIZE], 0
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+ lea lane_data, [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size)]
+ ;; Clear first 128 bytes of extra_block
+ vmovdqu64 [lane_data + _extra_block], zmm0
+ vmovdqu64 [lane_data + _extra_block + 64], zmm0
+
+ ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block
+%if (SHA_X_DIGEST_SIZE == 384)
+ vmovdqu64 [lane_data + _outer_block], ymm0
+ vmovdqa64 [lane_data + _outer_block + 32], xmm0
+%else
+ vmovdqu64 [lane_data + _outer_block], zmm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+ vzeroupper
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rdi, [rsp + _gpr_save + 8*6]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_submit_avx512.asm
new file mode 100644
index 000000000..a2b66e54f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_submit_avx512.asm
@@ -0,0 +1,413 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+
+extern sha512_x8_avx512
+
+section .data
+default rel
+
+align 16
+dupw: ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi
+%define arg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp, r13, r14, r15
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset arg3
+%define tmp2 arg3
+
+%define lane arg4
+%define tmp3 arg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+; Define stack usage
+
+; we clobber rbx, rsi, rdi, rbp; called routine also clobbers r12
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_sha_512_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+align 64
+%ifndef SHA384
+MKGLOBAL(submit_job_hmac_sha_512_avx512,function,internal)
+%define SHA_X_DIGEST_SIZE 512
+submit_job_hmac_sha_512_avx512:
+%else
+MKGLOBAL(submit_job_hmac_sha_384_avx512,function,internal)
+%define SHA_X_DIGEST_SIZE 384
+submit_job_hmac_sha_384_avx512:
+%endif
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
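+        ;; pop a free lane index: unused_lanes packs free lane numbers 4 bits each, lowest nibble first (the index is pushed back with shl 4 / or idx in end_loop)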
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ mov lane, unused_lanes
+ and lane, 15
+ shr unused_lanes, 4
+ imul lane_data, lane, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data]
+ mov [state + _unused_lanes_sha512], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 7 ; divide by 128, len in terms of blocks
+
+ mov [lane_data + _job_in_lane_sha512], job
+ mov dword [lane_data + _outer_done_sha512], 0
+ mov [state + _lens_sha512 + 2*lane], WORD(tmp) ; 2 is word size in bytes
+
+ mov last_len, len
+ and last_len, 127
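+        ;; SHA-512 padding appends a 0x80 byte and a 16-byte length field, so the tail needs (last_len + 17) bytes rounded up to 128-byte blocks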
+ lea extra_blocks, [last_len + 17 + 127]
+ shr extra_blocks, 7
+ mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p
+
+ cmp len, 128
+ jb copy_lt128
+
+fast_copy:
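+        ;; len >= 128 here: copying the trailing 128 bytes unconditionally places the final partial block at the end of extra_block's first 128 bytes, avoiding a length-dependent copy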
+ add p, len
+ vmovdqu32 zmm0, [p - 128 + 0*64]
+ vmovdqu32 zmm1, [p - 128 + 1*64]
+ vmovdqu32 [lane_data + _extra_block_sha512 + 0*64], zmm0
+ vmovdqu32 [lane_data + _extra_block_sha512 + 1*64], zmm1
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 7
+ sub size_offset, last_len
+ add size_offset, 128-8
+ mov [lane_data + _size_offset_sha512], DWORD(size_offset)
+ mov start_offset, 128
+ sub start_offset, last_len
+ mov [lane_data + _start_offset_sha512], DWORD(start_offset)
+
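+        ;; total bit length = 8 * (128 + len): the inner hash covers the 128-byte ipad block plus the message
+        ;; stored big-endian in the last 8 bytes of the padded block; the upper 64 bits of SHA-512's
+        ;; 128-bit length field rely on the zero bytes already present in the pre-initialised extra block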
+ lea tmp, [8*128 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block_sha512 + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+
+%assign I 0
+%rep 4
+ vmovdqu xmm0, [tmp + I * 2 * SHA512_DIGEST_WORD_SIZE]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+
+ test len, ~127
+ jnz ge128_bytes
+
+lt128_bytes:
+ mov [state + _lens_sha512 + 2*lane], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8
+ mov dword [lane_data + _extra_blocks_sha512], 0
+
+ge128_bytes:
+ cmp unused_lanes, 0xf
+ jne return_null
+ jmp start_loop
+
+ align 32
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_sha512]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0
+
+ vpshufb xmm1, [rel dupw] ; duplicate words across all 8 lanes
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens_sha512], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_x8_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done_sha512], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done_sha512], 1
+ mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+ mov qword [lane_data + _extra_block_sha512 + size_offset], 0
+ mov word [state + _lens_sha512 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block_sha512]
+ mov job, [lane_data + _job_in_lane_sha512]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+
+%assign I 0
+%rep (SHA_X_DIGEST_SIZE / (8 * 16))
+ vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE]
+ vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+ vpshufb xmm0, [rel byteswap]
+ vmovdqa [lane_data + _outer_block_sha512 + I * 2 * SHA512_DIGEST_WORD_SIZE], xmm0
+%assign I (I+1)
+%endrep
+
+ mov tmp, [job + _auth_key_xor_opad]
+%assign I 0
+%rep 4
+ vmovdqu xmm0, [tmp + I * 16]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I+0)*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+
+ jmp start_loop
+
+ align 32
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+ mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message
+ mov dword [lane_data + _extra_blocks_sha512], 0
+ jmp start_loop
+
+ align 32
+copy_lt128:
+ ;; less than one message block of data
+        ;; destination is the extra block, offset backwards by len so the data ends where the pre-populated 0x80 padding byte sits
+ lea p2, [lane_data + _extra_block + 128]
+ sub p2, len
+ memcpy_avx2_128_1 p2, p, len, tmp4, tmp2, ymm0, ymm1, ymm2, ymm3
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 32
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane_sha512]
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ mov qword [lane_data + _job_in_lane_sha512], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha512], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ vzeroupper
+
+%if (SHA_X_DIGEST_SIZE != 384)
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+ jne copy_full_digest
+%endif
+
+ ;; copy 32 bytes for SHA512 / 24 bytes for SHA384
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp4)
+%endif
+ mov [p + 0*8], QWORD(tmp)
+ mov [p + 1*8], QWORD(tmp2)
+ mov [p + 2*8], QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 3*8], QWORD(tmp4)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 64 bytes for SHA512 / 48 bytes for SHA384
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp3)
+ bswap QWORD(tmp4)
+ mov [p + 0*8], QWORD(tmp)
+ mov [p + 1*8], QWORD(tmp2)
+ mov [p + 2*8], QWORD(tmp3)
+ mov [p + 3*8], QWORD(tmp4)
+
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp3)
+ bswap QWORD(tmp4)
+%endif
+ mov [p + 4*8], QWORD(tmp)
+ mov [p + 5*8], QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 6*8], QWORD(tmp3)
+ mov [p + 7*8], QWORD(tmp4)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job
+%assign J 0
+%rep 6
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+        mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE], 0
+        mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+ vpxorq zmm0, zmm0
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data]
+ ;; Clear first 128 bytes of extra_block
+ vmovdqu64 [lane_data + _extra_block], zmm0
+ vmovdqu64 [lane_data + _extra_block + 64], zmm0
+
+ ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block
+%if (SHA_X_DIGEST_SIZE == 384)
+ vmovdqu64 [lane_data + _outer_block], ymm0
+ vmovdqa64 [lane_data + _outer_block + 32], xmm0
+%else
+ vmovdqu64 [lane_data + _outer_block], zmm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_submit_avx512.asm
new file mode 100644
index 000000000..2fe8482a9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_submit_avx512.asm
@@ -0,0 +1,402 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11
+;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11
+;; Linux preserves: RBX RBP R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+
+;; %define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha1_x16_avx512
+
+section .data
+default rel
+
+align 16
+byteswap:
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rdi, rbp
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes r12
+%define tmp4 r12
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+%define num_lanes_inuse r12
+%define len_upper r13
+%define idx_upper r14
+%endif
+
+; we clobber rsi, rdi, rbp, r12; called routine clobbers also r9-r15
+struc STACK
+_gpr_save: resq 7
+_rsp_save: resq 1
+endstruc
+
+; JOB* submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+MKGLOBAL(submit_job_hmac_avx512,function,internal)
+submit_job_hmac_avx512:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32 ; align to 32 byte boundary
+ mov [rsp + _gpr_save + 8*0], rbp
+ mov [rsp + _gpr_save + 8*1], r12
+ mov [rsp + _gpr_save + 8*2], r13
+ mov [rsp + _gpr_save + 8*3], r14
+ mov [rsp + _gpr_save + 8*4], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*5], rsi
+ mov [rsp + _gpr_save + 8*6], rdi
+%endif
+ mov [rsp + _rsp_save], rax
+ DBGPRINTL "---------- enter sha1 submit -----------"
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF ;; just a nibble
+ shr unused_lanes, 4
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ DBGPRINTL64 "lane", lane
+ DBGPRINTL64 "unused_lanes", unused_lanes
+
+ add dword [state + _num_lanes_inuse_sha1], 1
+
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+ mov [state + _lens + 2*lane], WORD(tmp)
+
+ mov last_len, len
+ DBGPRINTL64 "last_len", last_len
+ and last_len, 63
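+        ;; SHA-1 padding appends a 0x80 byte and an 8-byte length field, so the tail needs (last_len + 9) bytes rounded up to 64-byte blocks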
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ DBGPRINTL64 "extra_blocks", extra_blocks
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr + PTR_SZ*lane], p
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ vmovdqu32 zmm0, [p - 64 + len]
+ vmovdqu32 [lane_data + _extra_block], zmm0
+
+end_fast_copy:
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
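+        ;; total bit length = 8 * (64 + len): the inner hash covers the 64-byte ipad block plus the message, stored big-endian in the last 8 bytes of the padded block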
+ lea tmp, [8*64 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ DBGPRINTL64 "lt64_bytes extra_blocks", extra_blocks
+ DBGPRINTL64 "lt64_bytes start_offset", start_offset
+ mov [state + _lens + 2*lane], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_sha1]
+ cmp num_lanes_inuse, 0x10 ; all 16 lanes used?
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+
+ vmovdqa xmm2, [state + _lens + 8*2]
+ vphminposuw xmm3, xmm2
+ vpextrw DWORD(len_upper), xmm3, 0 ; min value
+ vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F)
+
+ cmp len2, len_upper
+ jle use_min
+
+ vmovdqa xmm1, xmm3
+ mov len2, len_upper
+ mov idx, idx_upper ; idx would be in range 0..7
+ add idx, 8 ; to reflect that index is in 8..F range
+
+use_min:
+ cmp len2, 0
+ je len_is_0
+
+ DBGPRINTL64 "min_length", len2
+ DBGPRINTL64 "min_length index ", idx
+
+ vpbroadcastw xmm1, xmm1
+ DBGPRINTL_XMM "SUBMIT lens after shuffle", xmm1
+
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens + 0*2], xmm0
+ vpsubw xmm2, xmm2, xmm1
+ vmovdqa [state + _lens + 8*2], xmm2
+ DBGPRINTL_XMM "lengths after subtraction (0..7)", xmm0
+ DBGPRINTL_XMM "lengths after subtraction (8..F)", xmm2
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+
+ vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+copy_lt64:
+ ;; less than one message block of data
+        ;; source is the beginning of the message block
+        ;; destination is the extra block, offset backwards by len so the data ends where the pre-populated 0x80 padding byte sits
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_avx2_64_1 p2, p, len, tmp4, tmp2, ymm0, ymm1
+ mov unused_lanes, [state + _unused_lanes]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov qword [lane_data + _job_in_lane], 0
+
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse_sha1], 1
+
+ mov p, [job_rax + _auth_tag_output]
+
+ vzeroupper
+
+ ; copy 12 bytes
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+ mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3)
+
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret
+
+ ;; copy remaining 8 bytes to return 20 byte digest
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (20B), outer_block (20B) and extra_block (64B) of returned job
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+ vpxorq zmm0, zmm0
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ ;; Clear first 64 bytes of extra_block
+ vmovdqu64 [lane_data + _extra_block], zmm0
+
+ ;; Clear first 20 bytes of outer_block
+ vmovdqu64 [lane_data + _outer_block], xmm0
+ mov dword [lane_data + _outer_block + 16], 0
+%endif
+
+return:
+ DBGPRINTL "---------- exit sha1 submit -----------"
+ mov rbp, [rsp + _gpr_save + 8*0]
+ mov r12, [rsp + _gpr_save + 8*1]
+ mov r13, [rsp + _gpr_save + 8*2]
+ mov r14, [rsp + _gpr_save + 8*3]
+ mov r15, [rsp + _gpr_save + 8*4]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*5]
+ mov rdi, [rsp + _gpr_save + 8*6]
+%endif
+ mov rsp, [rsp + _rsp_save]
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/sha1_x16_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/sha1_x16_avx512.asm
new file mode 100644
index 000000000..d67046ce5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/sha1_x16_avx512.asm
@@ -0,0 +1,439 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RDX R8 R9 R10 R11 R12 R13 R14 R15
+;; Windows preserves: RBX RCX RBP RSI RDI
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RDX RSI R9 R10 R11 R12 R13 R14 R15
+;; Linux preserves: RBX RCX RBP RDI R8
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/transpose_avx512.asm"
+%include "include/reg_sizes.asm"
+
+section .data
+default rel
+align 64
+K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
+ ;ddq 0x5A8279995A8279995A8279995A827999
+ ;ddq 0x5A8279995A8279995A8279995A827999
+ ;ddq 0x5A8279995A8279995A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%endif
+
+%define state arg1
+%define SIZE arg2
+%define IDX arg3
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define KT zmm5
+%define AA zmm6
+%define BB zmm7
+%define CC zmm8
+%define DD zmm9
+%define EE zmm10
+%define TMP0 zmm11
+%define TMP1 zmm12
+%define TMP2 zmm13
+%define TMP3 zmm14
+%define TMP4 zmm15
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
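+        ; The vpternlogd immediate in %%F_IMMED encodes Ft as a boolean function of (B,C,D):
+        ;   0xCA -> Ch, 0x96 -> Parity (3-way XOR), 0xE8 -> Maj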
+
+ vmovdqa32 TMP1, B ; Copy B
+ vpaddd E, E, %%WT ; E = E + Wt
+ vpternlogd TMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ vprold TMP0, A, 5 ; TMP0 = ROTL_5(A)
+ vpaddd E, E, TMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ vprold B, B, 30 ; B = ROTL_30(B)
+ vpaddd E, E, TMP0 ; E = T
+
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp2 %2
+%define %%WTp8 %3
+%define %%WTp13 %4
+ ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
+ ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
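+        ; (0x96 is the 3-way XOR immediate, so vpternlogd + vpxord compute Wt ^ Wt+2 ^ Wt+8 ^ Wt+13 before the rotate)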
+ vpternlogd %%WT, %%WTp2, %%WTp8, 0x96
+ vpxord %%WT, %%WT, %%WTp13
+ vprold %%WT, %%WT, 1
+%endmacro
+
+
+; Note this is reading in two blocks of data from each lane,
+; in preparation for the upcoming needed transpose to build msg schedule.
+; Each register will contain 32 bytes from one lane plus 32 bytes
+; from another lane.
+; The first 8 registers will contain the first 32 bytes of all lanes,
+; where register X (0 <= X <= 7) will contain bytes 0-31 from lane X in the first half
+; and 0-31 bytes from lane X+8 in the second half.
+; The last 8 registers will contain the last 32 bytes of all lanes,
+; where register Y (8 <= Y <= 15) will contain bytes 32-63 from lane Y-8 in the first half
+; and 32-63 bytes from lane Y in the second half.
+; This method helps reduce the number of shuffles required to transpose the data.
+%macro MSG_SCHED_ROUND_00_15 6
+%define %%Wt %1 ; [out] zmm register to load the next block
+%define %%LANE_IDX %2 ; [in] lane index (0-15)
+%define %%BASE_PTR %3 ; [in] base address of the input data
+%define %%OFFSET_PTR %4 ; [in] offset to get next block of data from the lane
+%define %%TMP1 %5 ; [clobbered] temporary gp register
+%define %%TMP2 %6 ; [clobbered] temporary gp register
+%if (%%LANE_IDX < 8)
+ mov %%TMP1, [%%BASE_PTR + %%LANE_IDX*PTR_SZ]
+ mov %%TMP2, [%%BASE_PTR + (%%LANE_IDX+8)*PTR_SZ]
+ vmovups YWORD(%%Wt), [%%TMP1+%%OFFSET_PTR]
+ vinserti64x4 %%Wt, %%Wt, [%%TMP2+%%OFFSET_PTR], 0x01
+%else
+ mov %%TMP1, [%%BASE_PTR + (%%LANE_IDX-8)*PTR_SZ]
+ mov %%TMP2, [%%BASE_PTR + %%LANE_IDX*PTR_SZ]
+ vmovups YWORD(%%Wt), [%%TMP1+%%OFFSET_PTR+32]
+ vinserti64x4 %%Wt, %%Wt, [%%TMP2+%%OFFSET_PTR+32], 0x01
+%endif
+%endmacro
+
+align 64
+; sha1_x16_avx512(args, size_in_blocks)
+; arg 1 : pointer to SHA1 args structure
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+MKGLOBAL(sha1_x16_avx512,function,internal)
+sha1_x16_avx512:
+ ;; Initialize digests
+ vmovdqu32 A, [state + 0*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu32 B, [state + 1*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu32 C, [state + 2*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu32 D, [state + 3*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu32 E, [state + 4*SHA1_DIGEST_ROW_SIZE]
+ DBGPRINTL_ZMM "Sha1-AVX512 incoming transposed digest", A, B, C, D, E
+ DBGPRINTL64 "SIZE", SIZE
+
+ xor IDX, IDX
+
+ ;; Load first blocks of data into ZMM registers before
+ ;; performing a 16x16 32-bit transpose.
+ ;; To speed up the transpose, data is loaded in chunks of 32 bytes,
+ ;; interleaving data between lane X and lane X+8.
+ ;; This way, final shuffles between top half and bottom half
+ ;; of the matrix are avoided.
+ mov inp0, [state + _data_ptr_sha1 + 0*PTR_SZ]
+ mov inp1, [state + _data_ptr_sha1 + 1*PTR_SZ]
+ mov inp2, [state + _data_ptr_sha1 + 2*PTR_SZ]
+ mov inp3, [state + _data_ptr_sha1 + 3*PTR_SZ]
+ mov inp4, [state + _data_ptr_sha1 + 4*PTR_SZ]
+ mov inp5, [state + _data_ptr_sha1 + 5*PTR_SZ]
+ mov inp6, [state + _data_ptr_sha1 + 6*PTR_SZ]
+ mov inp7, [state + _data_ptr_sha1 + 7*PTR_SZ]
+
+ TRANSPOSE16_U32_LOAD_FIRST8 W0, W1, W2, W3, W4, W5, W6, W7, \
+ W8, W9, W10, W11, W12, W13, W14, W15, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX
+
+ mov inp0, [state + _data_ptr_sha1 + 8*PTR_SZ]
+ mov inp1, [state + _data_ptr_sha1 + 9*PTR_SZ]
+ mov inp2, [state + _data_ptr_sha1 +10*PTR_SZ]
+ mov inp3, [state + _data_ptr_sha1 +11*PTR_SZ]
+ mov inp4, [state + _data_ptr_sha1 +12*PTR_SZ]
+ mov inp5, [state + _data_ptr_sha1 +13*PTR_SZ]
+ mov inp6, [state + _data_ptr_sha1 +14*PTR_SZ]
+ mov inp7, [state + _data_ptr_sha1 +15*PTR_SZ]
+
+ TRANSPOSE16_U32_LOAD_LAST8 W0, W1, W2, W3, W4, W5, W6, W7, \
+ W8, W9, W10, W11, W12, W13, W14, W15, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX
+lloop:
+ vmovdqa32 TMP2, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+ add IDX, 64
+
+ TRANSPOSE16_U32 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1, TMP3, TMP4
+ DBGPRINTL_ZMM "Sha1-AVX512 incoming transposed input", W0, W1, W2, W3, W4, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15
+
+%assign I 0
+%rep 16
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; Save digests for later addition
+ vmovdqa32 AA, A
+ vmovdqa32 BB, B
+ vmovdqa32 CC, C
+ vmovdqa32 DD, D
+ vmovdqa32 EE, E
+
+ vmovdqa32 KT, [rel K00_19]
+%assign I 0xCA
+%assign J 0
+%assign K 2
+%assign L 8
+%assign M 13
+%assign N 0
+%rep 64
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %if N = 19
+ vmovdqa32 KT, [rel K20_39]
+ %assign I 0x96
+ %elif N = 39
+ vmovdqa32 KT, [rel K40_59]
+ %assign I 0xE8
+ %elif N = 59
+ vmovdqa32 KT, [rel K60_79]
+ %assign I 0x96
+ %endif
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%assign N (N+1)
+%endrep
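+        ; rounds 0-63 were processed above with the W16..W79 schedule folded in; the remaining 16 rounds
+        ; follow, either interleaved with loading the next block (below) or standalone (lastLoop)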
+
+ ; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+%assign I 0x96
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_00_15 APPEND(W,J), J, state + _data_ptr_sha1, IDX, inp0, inp1
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ jmp lloop
+
+lastLoop:
+; Need to reset argument rotation values to Round 64 values
+%xdefine TMP_ A
+%xdefine A B
+%xdefine B C
+%xdefine C D
+%xdefine D E
+%xdefine E TMP_
+
+ ; Process last 16 rounds
+%assign I 0x96
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ ; Write out digest
+ ; Do we need to untranspose digests???
+ vmovdqu32 [state + 0*SHA1_DIGEST_ROW_SIZE], A
+ vmovdqu32 [state + 1*SHA1_DIGEST_ROW_SIZE], B
+ vmovdqu32 [state + 2*SHA1_DIGEST_ROW_SIZE], C
+ vmovdqu32 [state + 3*SHA1_DIGEST_ROW_SIZE], D
+ vmovdqu32 [state + 4*SHA1_DIGEST_ROW_SIZE], E
+ DBGPRINTL_ZMM "Sha1-AVX512 outgoing transposed digest", A, B, C, D, E
+
+ ;; update input pointers
+ mov inp0, [state + _data_ptr_sha1 + 0*PTR_SZ]
+ mov inp1, [state + _data_ptr_sha1 + 1*PTR_SZ]
+ mov inp2, [state + _data_ptr_sha1 + 2*PTR_SZ]
+ mov inp3, [state + _data_ptr_sha1 + 3*PTR_SZ]
+ mov inp4, [state + _data_ptr_sha1 + 4*PTR_SZ]
+ mov inp5, [state + _data_ptr_sha1 + 5*PTR_SZ]
+ mov inp6, [state + _data_ptr_sha1 + 6*PTR_SZ]
+ mov inp7, [state + _data_ptr_sha1 + 7*PTR_SZ]
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [state + _data_ptr_sha1 + 0*PTR_SZ], inp0
+ mov [state + _data_ptr_sha1 + 1*PTR_SZ], inp1
+ mov [state + _data_ptr_sha1 + 2*PTR_SZ], inp2
+ mov [state + _data_ptr_sha1 + 3*PTR_SZ], inp3
+ mov [state + _data_ptr_sha1 + 4*PTR_SZ], inp4
+ mov [state + _data_ptr_sha1 + 5*PTR_SZ], inp5
+ mov [state + _data_ptr_sha1 + 6*PTR_SZ], inp6
+ mov [state + _data_ptr_sha1 + 7*PTR_SZ], inp7
+
+ mov inp0, [state + _data_ptr_sha1 + 8*PTR_SZ]
+ mov inp1, [state + _data_ptr_sha1 + 9*PTR_SZ]
+ mov inp2, [state + _data_ptr_sha1 + 10*PTR_SZ]
+ mov inp3, [state + _data_ptr_sha1 + 11*PTR_SZ]
+ mov inp4, [state + _data_ptr_sha1 + 12*PTR_SZ]
+ mov inp5, [state + _data_ptr_sha1 + 13*PTR_SZ]
+ mov inp6, [state + _data_ptr_sha1 + 14*PTR_SZ]
+ mov inp7, [state + _data_ptr_sha1 + 15*PTR_SZ]
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [state + _data_ptr_sha1 + 8*PTR_SZ], inp0
+ mov [state + _data_ptr_sha1 + 9*PTR_SZ], inp1
+ mov [state + _data_ptr_sha1 + 10*PTR_SZ], inp2
+ mov [state + _data_ptr_sha1 + 11*PTR_SZ], inp3
+ mov [state + _data_ptr_sha1 + 12*PTR_SZ], inp4
+ mov [state + _data_ptr_sha1 + 13*PTR_SZ], inp5
+ mov [state + _data_ptr_sha1 + 14*PTR_SZ], inp6
+ mov [state + _data_ptr_sha1 + 15*PTR_SZ], inp7
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/sha256_x16_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/sha256_x16_avx512.asm
new file mode 100644
index 000000000..cdbb61ea3
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/sha256_x16_avx512.asm
@@ -0,0 +1,758 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RDX RSI RDI R9 R10 R11 R12 R13 R14 R15
+;; Windows preserves: RCX
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX RSI R9 R10 R11 R12 R13 R14 R15
+;; Linux preserves: RDI
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/transpose_avx512.asm"
+%include "include/reg_sizes.asm"
+
+; re-use K256 from sha256_oct_avx2.asm
+extern K256
+
+;; code to compute x16 SHA256 using AVX512
+
+%define APPEND(a,b) a %+ b
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, 8*64, 64
+FIELD _rsp, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+%ifdef LINUX
+; Linux register definitions
+ %define arg1 rdi
+ %define arg2 rsi
+ %define arg3 rcx
+ %define arg4 rdx
+%else
+; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+ %define arg3 rsi
+ %define arg4 rdi
+%endif
+
+%define STATE arg1
+%define INP_SIZE arg2
+%define IDX arg3
+%define TBL arg4
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+;; CH(A, B, C) = (A&B) ^ (~A&C)
+;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G)
+;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22
+;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25
+;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3
+;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
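+;;
+;; Note on the vpternlogd immediates used below: for vpternlogd dst, s2, s3, imm8
+;; each result bit is imm8[(dst<<2) | (s2<<1) | s3], so working the truth tables:
+;;   CH(E,F,G)  = (E&F) ^ (~E&G)      -> inputs 001,011,110,111 -> 0b11001010 = 0xCA
+;;   MAJ(A,B,C) = (A&B)^(A&C)^(B&C)   -> inputs 011,101,110,111 -> 0b11101000 = 0xE8
+;;   3-way XOR (combining the rotates into SIGMA/sigma)
+;;                                    -> inputs 001,010,100,111 -> 0b10010110 = 0x96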
+
+; Main processing loop per round
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddd T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, T1, %%WT ; T1 = T1 + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpaddd D, D, T1 ; D = D + T1
+
+ vprord H, A, 2 ; ROR_2(A)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vmovdqa32 TMP0, A
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
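+
+;; Worked example of the Kt indexing above: each 32-bit round constant is
+;; replicated across all 16 lanes in TABLE below (16 * 4 = 64 bytes per round),
+;; so round R reads its constant vector at [TBL + R*64]; PROCESS_LOOP prefetches
+;; the next one at [TBL + (R+1)*64] while the first is loaded from [TBL] in lloop.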
+
+; This is supposed to be SKL optimized assuming:
+; vpternlog, vpaddd ports 5,8
+; vprord ports 1,8
+; However, vprord only executes on port 8
+;
+; Main processing loop per round
+; Compute msg schedule word Wt+16 from the current, now unnecessary, word Wt
+%macro PROCESS_LOOP_00_47 5
+%define %%WT %1
+%define %%ROUND %2
+%define %%WTp1 %3
+%define %%WTp9 %4
+%define %%WTp14 %5
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ ;; For next value in msg schedule
+ ;; Wt+16 = sigma1(Wt+14) + Wt+9 + sigma0(Wt+1) + Wt
+
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, H, %%WT ; T1 = H + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP6 ; T1 = T1 + Kt
+ vprord H, A, 2 ; ROR_2(A)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vmovdqa32 TMP0, A
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpaddd D, D, T1 ; D = D + T1
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+ ; Wt-7 + sigma0(Wt-15) +
+
+ vmovdqa32 TMP6, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_63 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+ ; Wt-7 + sigma0(Wt-15) +
+%endmacro
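+
+;; The schedule above implements the FIPS 180-4 recurrence
+;;   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
+;; in place over a 16-register circular buffer: the register holding W[t-16]
+;; is overwritten with W[t], while W[t-15], W[t-7] and W[t-2] sit in the
+;; registers offset by +1, +9 and +14 (hence %%WTp1, %%WTp9 and %%WTp14).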
+
+; Note this is reading in a block of data from each lane (in two 32-byte halves),
+; in preparation for the transpose needed to build the msg schedule.
+; Each register will contain 32 bytes from one lane plus 32 bytes
+; from another lane.
+; The first 8 registers will contain the first 32 bytes of all lanes,
+; where register X (0 <= X <= 7) will contain bytes 0-31 from lane X in the first half
+; and bytes 0-31 from lane X+8 in the second half.
+; The last 8 registers will contain the last 32 bytes of all lanes,
+; where register Y (8 <= Y <= 15) will contain bytes 32-63 from lane Y-8 in the first half
+; and bytes 32-63 from lane Y in the second half.
+; This method helps reduce the number of shuffles required to transpose the data.
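+;;
+;; Worked example for %%LANE_IDX = 0: the low 256 bits of %%Wt receive the
+;; 32 bytes at offset %%OFFSET_PTR of lane 0, and the high 256 bits (inserted
+;; with vinserti64x4 ..., 0x01) receive the same 32 bytes of lane 8;
+;; %%LANE_IDX = 8 then gathers the following 32 bytes of lanes 0 and 8 into
+;; another register.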
+%macro MSG_SCHED_ROUND_00_15 6
+%define %%Wt %1 ; [out] zmm register to load the next block
+%define %%LANE_IDX %2 ; [in] lane index (0-15)
+%define %%BASE_PTR %3 ; [in] base address of the input data
+%define %%OFFSET_PTR %4 ; [in] offset to get next block of data from the lane
+%define %%TMP1 %5 ; [clobbered] temporary gp register
+%define %%TMP2 %6 ; [clobbered] temporary gp register
+%if (%%LANE_IDX < 8)
+ mov %%TMP1, [%%BASE_PTR + %%LANE_IDX*PTR_SZ]
+ mov %%TMP2, [%%BASE_PTR + (%%LANE_IDX+8)*PTR_SZ]
+ vmovups YWORD(%%Wt), [%%TMP1+%%OFFSET_PTR]
+ vinserti64x4 %%Wt, %%Wt, [%%TMP2+%%OFFSET_PTR], 0x01
+%else
+ mov %%TMP1, [%%BASE_PTR + (%%LANE_IDX-8)*PTR_SZ]
+ mov %%TMP2, [%%BASE_PTR + %%LANE_IDX*PTR_SZ]
+ vmovups YWORD(%%Wt), [%%TMP1+%%OFFSET_PTR+32]
+ vinserti64x4 %%Wt, %%Wt, [%%TMP2+%%OFFSET_PTR+32], 0x01
+%endif
+%endmacro
+
+ section .data
+default rel
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+;; void sha256_x16_avx512(SHA256_ARGS *args, UINT64 size_in_blocks)
+;; arg 1 : pointer to SHA256 args structure (lane data pointers and digests)
+;; arg 2 : size of input in blocks ;; assumed to be >= 1
+;;         (arg 1/arg 2 arrive in rcx/rdx on Windows and rdi/rsi on Linux,
+;;          per the register definitions above)
+MKGLOBAL(sha256_x16_avx512,function,internal)
+align 64
+sha256_x16_avx512:
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _rsp], rax
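+ ;; (the caller's rsp is kept inside the aligned frame so it can be restored
+ ;;  at the end, since the 'and' above may move rsp by an unknown amount)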
+
+ ;; Initialize digests
+ vmovdqu32 A, [STATE + 0*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 B, [STATE + 1*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 C, [STATE + 2*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 D, [STATE + 3*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 E, [STATE + 4*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 F, [STATE + 5*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 G, [STATE + 6*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 H, [STATE + 7*SHA256_DIGEST_ROW_SIZE]
+
+ lea TBL, [rel TABLE]
+
+ ; Do we need to transpose digests???
+ ; SHA1 does not, but the SHA256 digests have been kept transposed
+
+ xor IDX, IDX
+
+ ;; Load first blocks of data into ZMM registers before
+ ;; performing a 16x16 32-bit transpose.
+ ;; To speed up the transpose, data is loaded in chunks of 32 bytes,
+ ;; interleaving data between lane X and lane X+8.
+ ;; This way, final shuffles between top half and bottom half
+ ;; of the matrix are avoided.
+ mov inp0, [STATE + _data_ptr_sha256 + 0*PTR_SZ]
+ mov inp1, [STATE + _data_ptr_sha256 + 1*PTR_SZ]
+ mov inp2, [STATE + _data_ptr_sha256 + 2*PTR_SZ]
+ mov inp3, [STATE + _data_ptr_sha256 + 3*PTR_SZ]
+ mov inp4, [STATE + _data_ptr_sha256 + 4*PTR_SZ]
+ mov inp5, [STATE + _data_ptr_sha256 + 5*PTR_SZ]
+ mov inp6, [STATE + _data_ptr_sha256 + 6*PTR_SZ]
+ mov inp7, [STATE + _data_ptr_sha256 + 7*PTR_SZ]
+
+ TRANSPOSE16_U32_LOAD_FIRST8 W0, W1, W2, W3, W4, W5, W6, W7, \
+ W8, W9, W10, W11, W12, W13, W14, W15, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX
+
+ mov inp0, [STATE + _data_ptr_sha256 + 8*PTR_SZ]
+ mov inp1, [STATE + _data_ptr_sha256 + 9*PTR_SZ]
+ mov inp2, [STATE + _data_ptr_sha256 +10*PTR_SZ]
+ mov inp3, [STATE + _data_ptr_sha256 +11*PTR_SZ]
+ mov inp4, [STATE + _data_ptr_sha256 +12*PTR_SZ]
+ mov inp5, [STATE + _data_ptr_sha256 +13*PTR_SZ]
+ mov inp6, [STATE + _data_ptr_sha256 +14*PTR_SZ]
+ mov inp7, [STATE + _data_ptr_sha256 +15*PTR_SZ]
+
+ TRANSPOSE16_U32_LOAD_LAST8 W0, W1, W2, W3, W4, W5, W6, W7, \
+ W8, W9, W10, W11, W12, W13, W14, W15, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX
+
+ align 32
+lloop:
+ vmovdqa32 TMP2, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+ vmovdqa32 TMP3, [TBL] ; First K
+
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 64
+
+ TRANSPOSE16_U32 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1, TMP4, TMP5
+
+%assign I 0
+%rep 16
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process first 48 rounds
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+ ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep 48
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+
+ ; Check if this is the last block
+ sub INP_SIZE, 1
+ je lastLoop
+
+ ; Process last 16 rounds
+ ; Read in next block msg data for use in first 16 words of msg sched
+%assign I 48
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_00_15 APPEND(W,J), J, STATE + _data_ptr_sha256, IDX, inp0, inp1
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ jmp lloop
+
+lastLoop:
+ ; Process last 16 rounds
+%assign I 48
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ ; Write out digest
+ ; Do we need to untranspose digests???
+ vmovdqu32 [STATE + 0*SHA256_DIGEST_ROW_SIZE], A
+ vmovdqu32 [STATE + 1*SHA256_DIGEST_ROW_SIZE], B
+ vmovdqu32 [STATE + 2*SHA256_DIGEST_ROW_SIZE], C
+ vmovdqu32 [STATE + 3*SHA256_DIGEST_ROW_SIZE], D
+ vmovdqu32 [STATE + 4*SHA256_DIGEST_ROW_SIZE], E
+ vmovdqu32 [STATE + 5*SHA256_DIGEST_ROW_SIZE], F
+ vmovdqu32 [STATE + 6*SHA256_DIGEST_ROW_SIZE], G
+ vmovdqu32 [STATE + 7*SHA256_DIGEST_ROW_SIZE], H
+
+ ; update input pointers
+%assign I 0
+%rep 16
+ add [STATE + _data_ptr_sha256 + I*PTR_SZ], IDX
+%assign I (I+1)
+%endrep
+
+%ifdef SAFE_DATA
+ ;; Clear stack frame (8*64 bytes)
+ vpxorq zmm0, zmm0
+%assign i 0
+%rep 8
+ vmovdqa64 [rsp + i*64], zmm0
+%assign i (i+1)
+%endrep
+%endif
+
+ mov rsp, [rsp + _rsp]
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/sha512_x8_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/sha512_x8_avx512.asm
new file mode 100644
index 000000000..48532c3fb
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/sha512_x8_avx512.asm
@@ -0,0 +1,595 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RDX RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; Windows preserves: RBX RCX RBP RSI
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RDX RSI R8 R9 R10 R11 R12 R13 R14 R15
+;; Linux preserves: RBX RCX RBP RDI
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+;; code to compute quad SHA512 using AVX512
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/transpose_avx512.asm"
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+; Linux register definitions
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+; Windows definitions
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rsi
+%define arg4 rdi
+%endif
+
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX arg4
+%define TBL r8
+
+;; retaining XMM_SAVE, because only the bottom (XMM) half of the YMM registers needs saving; the top half does not
+%define NUM_LANES 8
+%define XMM_SAVE (15-5)*16
+%define SZ 8
+%define SZ8 8 * SZ
+%define DIGEST_SZ 8 * SZ8
+%define DIGEST_SAVE NUM_LANES * DIGEST_SZ
+%define RSP_SAVE 1*8
+
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, NUM_LANES*8*64, 64
+FIELD _XMM_SAVE, XMM_SAVE, 16
+FIELD _RSP, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+; from FIPS 180-2 (Secure Hash Standard)
+; define rotates for Sigma function for main loop steps
+%define BIG_SIGMA_0_0 28 ; Sigma0
+%define BIG_SIGMA_0_1 34
+%define BIG_SIGMA_0_2 39
+%define BIG_SIGMA_1_0 14 ; Sigma1
+%define BIG_SIGMA_1_1 18
+%define BIG_SIGMA_1_2 41
+
+; define rotates for Sigma function for scheduling steps
+%define SMALL_SIGMA_0_0 1 ; sigma0
+%define SMALL_SIGMA_0_1 8
+%define SMALL_SIGMA_0_2 7
+%define SMALL_SIGMA_1_0 19 ; sigma1
+%define SMALL_SIGMA_1_1 61
+%define SMALL_SIGMA_1_2 6
+
+%define SHA_MAX_ROUNDS 80
+%define SHA_ROUNDS_LESS_16 (SHA_MAX_ROUNDS - 16)
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+;; CH(A, B, C) = (A&B) ^ (~A&C)
+;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G)
+;; SIGMA0 = ROR_28 ^ ROR_34 ^ ROR_39
+;; SIGMA1 = ROR_14 ^ ROR_18 ^ ROR_41
+;; sigma0 = ROR_1 ^ ROR_8 ^ SHR_7
+;; sigma1 = ROR_19 ^ ROR_61 ^ SHR_6
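+;;
+;; The vpternlogq immediates below reuse the same truth tables as the SHA-256
+;; code above (0xCA = CH, 0xE8 = MAJ, 0x96 = three-way XOR), applied to 64-bit
+;; lanes and combined with the BIG_SIGMA_*/SMALL_SIGMA_* rotate amounts defined
+;; earlier.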
+
+;; Main processing loop per round
+;; equivalent to %macro ROUND_00_15 2
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + BIG_SIGMA_1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = BIG_SIGMA_0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddq T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ ;; compute BIG_SIGMA_1(E)
+ vprorq TMP1, E, BIG_SIGMA_1_0 ; ROR_14(E)
+ vprorq TMP2, E, BIG_SIGMA_1_1 ; ROR_18(E)
+ vprorq TMP3, E, BIG_SIGMA_1_2 ; ROR_41(E)
+ vpternlogq TMP1, TMP2, TMP3, 0x96 ; TMP1 = BIG_SIGMA_1(E)
+ vpternlogq TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddq T1, T1, %%WT ; T1 = T1 + Wt
+ vpaddq T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddq T1, T1, TMP1 ; T1 = T1 + BIG_SIGMA_1(E)
+ vpaddq D, D, T1 ; D = D + T1
+ vprorq H, A, BIG_SIGMA_0_0 ;ROR_28(A)
+ vprorq TMP2, A, BIG_SIGMA_0_1 ;ROR_34(A)
+ vprorq TMP3, A, BIG_SIGMA_0_2 ;ROR_39(A)
+ vmovdqa32 TMP0, A
+ vpternlogq TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogq H, TMP2, TMP3, 0x96 ; H(T2) = BIG_SIGMA_0(A)
+ vpaddq H, H, TMP0 ; H(T2) = BIG_SIGMA_0(A) + MAJ(A,B,C)
+ vpaddq H, H, T1 ; H(A) = H(T2) + T1
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
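+
+;; As in the SHA-256 variant, each round constant occupies 64 bytes of TABLE
+;; (here 8 lanes * 8 bytes), so round R reads [TBL + R*64] and the macro above
+;; prefetches the next constant at [TBL + (R+1)*64].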
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprorq TMP4, %%WTp14, SMALL_SIGMA_1_0 ; ROR_19(Wt-2)
+ vprorq TMP5, %%WTp14, SMALL_SIGMA_1_1 ; ROR_61(Wt-2)
+ vpsrlq TMP6, %%WTp14, SMALL_SIGMA_1_2 ; SHR_6(Wt-2)
+ vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_1(Wt-2)
+
+ vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2)
+ vpaddq %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma_1(Wt-2) + Wt-7
+
+ vprorq TMP4, %%WTp1, SMALL_SIGMA_0_0 ; ROR_1(Wt-15)
+ vprorq TMP5, %%WTp1, SMALL_SIGMA_0_1 ; ROR_8(Wt-15)
+ vpsrlq TMP6, %%WTp1, SMALL_SIGMA_0_2 ; SHR_7(Wt-15)
+ vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_0(Wt-15)
+
+ vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2) +
+ ; Wt-7 + sigma_0(Wt-15) +
+%endmacro
+
+section .data
+default rel
+
+align 64
+; 80 round constants for SHA512,
+; each replicated across the 8 lanes (hence 8*80 entries)
+; to aid SIMD processing - a space-for-time tradeoff.
+; Local to this asm file, used nowhere else.
+TABLE:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+
+align 64
+; byte-swap mask: performs the big endian to little endian conversion of each quad word
+;; a byte shuffle on a ZMM operates on four independent 128-bit (XMM-sized) chunks
+PSHUFFLE_BYTE_FLIP_MASK:
+ ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ ;ddq 0x18191a1b1c1d1e1f1011121314151617
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
+ ;ddq 0x28292a2b2c2d2e2f2021222324252627
+ dq 0x2021222324252627, 0x28292a2b2c2d2e2f
+ ;ddq 0x38393a3b3c3d3e3f3031323334353637
+ dq 0x3031323334353637, 0x38393a3b3c3d3e3f
+
+section .text
+
+;; void sha512_x8_avx512(SHA512_ARGS *args, UINT64 size_in_blocks)
+;; arg 1 : pointer to SHA512 args structure
+;;         (lane data pointers and digests organized as UINT64 digest[8][num_lanes])
+;; arg 2 : size of input in message blocks (1 block = 128 bytes)
+MKGLOBAL(sha512_x8_avx512,function,internal)
+align 64
+sha512_x8_avx512:
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _RSP], rax
+
+ ;; Initialize digests ; organized uint64 digest[8][num_lanes]; no transpose required
+ ;; Digest is an array of pointers to digests
+ vmovdqu32 A, [STATE + 0*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 B, [STATE + 1*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 C, [STATE + 2*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 D, [STATE + 3*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 E, [STATE + 4*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 F, [STATE + 5*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 G, [STATE + 6*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 H, [STATE + 7*SHA512_DIGEST_ROW_SIZE]
+
+ lea TBL,[rel TABLE]
+ xor IDX, IDX
+ ;; Read in the input data addresses, saving them in registers because
+ ;; they will serve as variables, which we shall keep incrementing
+ mov inp0, [STATE + _data_ptr_sha512 + 0*PTR_SZ]
+ mov inp1, [STATE + _data_ptr_sha512 + 1*PTR_SZ]
+ mov inp2, [STATE + _data_ptr_sha512 + 2*PTR_SZ]
+ mov inp3, [STATE + _data_ptr_sha512 + 3*PTR_SZ]
+ mov inp4, [STATE + _data_ptr_sha512 + 4*PTR_SZ]
+ mov inp5, [STATE + _data_ptr_sha512 + 5*PTR_SZ]
+ mov inp6, [STATE + _data_ptr_sha512 + 6*PTR_SZ]
+ mov inp7, [STATE + _data_ptr_sha512 + 7*PTR_SZ]
+ jmp lloop
+
+align 32
+lloop:
+ ;; Load 64-byte blocks of data into ZMM registers before
+ ;; performing an 8x8 64-bit transpose.
+ ;; To speed up the transpose, data is loaded in chunks of 32 bytes,
+ ;; interleaving data between lane X and lane X+4.
+ ;; This way, final shuffles between top half and bottom half
+ ;; of the matrix are avoided.
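+ ;;
+ ;; Each TRANSPOSE8_U64_LOAD8 + TRANSPOSE8_U64 pair below leaves W0-W7 (and
+ ;; then W8-W15) holding one 64-bit message word from each of the 8 lanes;
+ ;; the two passes at IDX and IDX+SZ8 together cover the full 128-byte
+ ;; SHA-512 block for every lane.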
+ TRANSPOSE8_U64_LOAD8 W0, W1, W2, W3, W4, W5, W6, W7, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX
+
+ TRANSPOSE8_U64 W0, W1, W2, W3, W4, W5, W6, W7, TMP0, TMP1, TMP2, TMP3
+ ;; Load next 512 bytes
+ TRANSPOSE8_U64_LOAD8 W8, W9, W10, W11, W12, W13, W14, W15, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX+SZ8
+
+ TRANSPOSE8_U64 W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1, TMP2, TMP3
+
+ vmovdqa32 TMP2, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+ vmovdqa32 TMP3, [TBL] ; First K
+
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 128 ; increment by message block length in bytes
+
+%assign I 0
+%rep 16
+;;; little endian to big endian
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process the first (SHA_MAX_ROUNDS - 16) rounds
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+ ; PROCESS_LOOP_00_79 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep SHA_ROUNDS_LESS_16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+ ; Check if this is the last block
+ sub INP_SIZE, 1
+ je lastLoop
+
+ ; Process the last 16 rounds of this block
+ ; (the next block is read in and transposed at the top of lloop)
+%assign I SHA_ROUNDS_LESS_16
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+ ; Add old digest
+ vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ jmp lloop
+
+align 32
+lastLoop:
+ ; Process last 16 rounds
+%assign I SHA_ROUNDS_LESS_16
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ ; Write out digest
+ ;; results in A, B, C, D, E, F, G, H
+ vmovdqu32 [STATE + 0*SHA512_DIGEST_ROW_SIZE], A
+ vmovdqu32 [STATE + 1*SHA512_DIGEST_ROW_SIZE], B
+ vmovdqu32 [STATE + 2*SHA512_DIGEST_ROW_SIZE], C
+ vmovdqu32 [STATE + 3*SHA512_DIGEST_ROW_SIZE], D
+ vmovdqu32 [STATE + 4*SHA512_DIGEST_ROW_SIZE], E
+ vmovdqu32 [STATE + 5*SHA512_DIGEST_ROW_SIZE], F
+ vmovdqu32 [STATE + 6*SHA512_DIGEST_ROW_SIZE], G
+ vmovdqu32 [STATE + 7*SHA512_DIGEST_ROW_SIZE], H
+
+ ; update input pointers
+%assign I 0
+%rep 8
+ add [STATE + _data_ptr_sha512 + I*PTR_SZ], IDX
+%assign I (I+1)
+%endrep
+
+
+%ifdef SAFE_DATA
+ ;; Clear stack frame ((NUM_LANES*8)*64 bytes)
+ vpxorq zmm0, zmm0
+%assign i 0
+%rep (NUM_LANES*8)
+ vmovdqa64 [rsp + i*64], zmm0
+%assign i (i+1)
+%endrep
+%endif
+ mov rsp, [rsp + _RSP]
+;hash_done:
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif