Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm')
-rw-r--r--  src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm | 727
1 file changed, 727 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm
new file mode 100644
index 000000000..c4b1dd561
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm
@@ -0,0 +1,727 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routines to do 128/192/256-bit AES CBC encryption
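+;;;
+;;; Reference for the chaining implemented below - CBC encryption of one
+;;; lane follows the standard recurrence (E_K = one AES block encryption):
+;;;     C[0] = E_K(P[0] XOR IV)
+;;;     C[i] = E_K(P[i] XOR C[i-1]), i > 0
+;;; Blocks within a lane are therefore strictly serial, while the 16
+;;; independent lanes are processed in parallel in the ZMM registers.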
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+struc STACK
+_gpr_save: resq 3
+endstruc
+
+%define GPR_SAVE_AREA rsp + _gpr_save
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi
+%define arg4 rsi
+%endif
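+
+;; Note: this function takes only two arguments (ARG and LEN below);
+;; arg3/arg4 just name GP registers that are free to clobber under the
+;; respective calling convention (rsi/rdi are saved in FUNC_SAVE on
+;; Windows before use).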
+
+%define ARG arg1
+%define LEN arg2
+
+%define IA0 rax
+%define IA1 rbx
+%define IA2 arg3
+%define IA3 arg4
+%define IA4 rbp
+%define IA5 r8
+%define IA6 r9
+%define IA7 r10
+%define IA8 r11
+%define IA9 r13
+%define IA10 r14
+%define IA11 r15
+%define IA12 r12
+
+%define ZIV00_03 zmm8
+%define ZIV04_07 zmm9
+%define ZIV08_11 zmm10
+%define ZIV12_15 zmm11
+
+%define ZT0 zmm16
+%define ZT1 zmm17
+%define ZT2 zmm18
+%define ZT3 zmm19
+%define ZT4 zmm20
+%define ZT5 zmm21
+%define ZT6 zmm22
+%define ZT7 zmm23
+%define ZT8 zmm24
+%define ZT9 zmm25
+%define ZT10 zmm26
+%define ZT11 zmm27
+%define ZT12 zmm28
+%define ZT13 zmm29
+%define ZT14 zmm30
+%define ZT15 zmm31
+
+%define ZT16 zmm12
+%define ZT17 zmm13
+%define ZT18 zmm14
+%define ZT19 zmm15
+
+%define TAB_A0B0A1B1 zmm6
+%define TAB_A2B2A3B3 zmm7
+
+;; Save register state
+%macro FUNC_SAVE 0
+ sub rsp, STACK_size
+ mov [GPR_SAVE_AREA + 8*0], rbp
+%ifndef LINUX
+ mov [GPR_SAVE_AREA + 8*1], rsi
+ mov [GPR_SAVE_AREA + 8*2], rdi
+%endif
+%endmacro
+
+;; Restore register state
+%macro FUNC_RESTORE 0
+ ;; XMMs are saved at a higher level
+ mov rbp, [GPR_SAVE_AREA + 8*0]
+%ifndef LINUX
+ mov rsi, [GPR_SAVE_AREA + 8*1]
+ mov rdi, [GPR_SAVE_AREA + 8*2]
+%endif
+ add rsp, STACK_size
+ vzeroupper
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Transpose macro - executes 4x4 transpose of 4 ZMM registers
+; in: L0B0-3 out: B0L0-3
+; L1B0-3 B1L0-3
+; L2B0-3 B2L0-3
+; L3B0-3 B3L0-3
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
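+;
+; Implementation sketch: each input ZMM holds 4 sequential 16-byte blocks
+; of one lane, e.g. IN_OUT_0 = {A0,A1,A2,A3} for lane A. vpermi2q with
+; table A0B0A1B1 = {0,1,8,9,2,3,0xa,0xb} interleaves 128-bit blocks of
+; two inputs: ZTMP_0 = {A0,B0,A1,B1}, ZTMP_1 = {C0,D0,C1,D1}; table
+; A2B2A3B3 does the same for blocks 2 and 3. vshufi64x2 with imm 0x44
+; then gathers the low halves, e.g. IN_OUT_0 = {A0,B0,C0,D0} (block 0 of
+; all four lanes), and imm 0xee gathers the high halves.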
+%macro TRANSPOSE_4x4 8
+%define %%IN_OUT_0 %1
+%define %%IN_OUT_1 %2
+%define %%IN_OUT_2 %3
+%define %%IN_OUT_3 %4
+%define %%ZTMP_0 %5
+%define %%ZTMP_1 %6
+%define %%ZTMP_2 %7
+%define %%ZTMP_3 %8
+
+ vmovdqa64 %%ZTMP_0, TAB_A0B0A1B1
+ vmovdqa64 %%ZTMP_1, %%ZTMP_0
+ vmovdqa64 %%ZTMP_2, TAB_A2B2A3B3
+ vmovdqa64 %%ZTMP_3, %%ZTMP_2
+
+ vpermi2q %%ZTMP_0, %%IN_OUT_0, %%IN_OUT_1
+ vpermi2q %%ZTMP_1, %%IN_OUT_2, %%IN_OUT_3
+ vpermi2q %%ZTMP_2, %%IN_OUT_0, %%IN_OUT_1
+ vpermi2q %%ZTMP_3, %%IN_OUT_2, %%IN_OUT_3
+
+ vshufi64x2 %%IN_OUT_0, %%ZTMP_0, %%ZTMP_1, 0x44
+ vshufi64x2 %%IN_OUT_2, %%ZTMP_2, %%ZTMP_3, 0x44
+ vshufi64x2 %%IN_OUT_1, %%ZTMP_0, %%ZTMP_1, 0xee
+ vshufi64x2 %%IN_OUT_3, %%ZTMP_2, %%ZTMP_3, 0xee
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LOAD_STORE_x4 - loads/stores 1-4 blocks (16 bytes each) for 4 lanes into
+;                 ZMM registers
+; - loads/stores 4 blocks per lane by default
+; - pass the optional %%MASK_REG argument to load/store only 1-3 blocks
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro LOAD_STORE_x4 15-16
+%define %%LANE_A %1 ; [in] lane index to load/store (numerical)
+%define %%LANE_B %2 ; [in] lane index to load/store (numerical)
+%define %%LANE_C %3 ; [in] lane index to load/store (numerical)
+%define %%LANE_D %4 ; [in] lane index to load/store (numerical)
+%define %%DATA_PTR %5 ; [in] GP reg with ptr to lane input table
+%define %%OFFSET %6 ; [in] GP reg input/output buffer offset
+%define %%ZDATA0 %7 ; [in/out] ZMM reg to load/store data
+%define %%ZDATA1 %8 ; [in/out] ZMM reg to load/store data
+%define %%ZDATA2 %9 ; [in/out] ZMM reg to load/store data
+%define %%ZDATA3 %10 ; [in/out] ZMM reg to load/store data
+%define %%GP0 %11 ; [clobbered] tmp GP reg
+%define %%GP1 %12 ; [clobbered] tmp GP reg
+%define %%GP2 %13 ; [clobbered] tmp GP reg
+%define %%GP3 %14 ; [clobbered] tmp GP reg
+%define %%LOAD_STORE %15 ; [in] string value to select LOAD or STORE
+%define %%MASK_REG %16 ; [in] mask reg used for load/store mask
+%define %%NUM_ARGS %0
+
+ mov %%GP0, [%%DATA_PTR + 8*(%%LANE_A)]
+ mov %%GP1, [%%DATA_PTR + 8*(%%LANE_B)]
+ mov %%GP2, [%%DATA_PTR + 8*(%%LANE_C)]
+ mov %%GP3, [%%DATA_PTR + 8*(%%LANE_D)]
+
+%if %%NUM_ARGS <= 15 ;; %%MASK_REG not set, assume 4 block load/store
+%ifidn %%LOAD_STORE, LOAD
+ vmovdqu8 %%ZDATA0, [%%GP0 + %%OFFSET]
+ vmovdqu8 %%ZDATA1, [%%GP1 + %%OFFSET]
+ vmovdqu8 %%ZDATA2, [%%GP2 + %%OFFSET]
+ vmovdqu8 %%ZDATA3, [%%GP3 + %%OFFSET]
+%else ; STORE
+ vmovdqu8 [%%GP0 + %%OFFSET], %%ZDATA0
+ vmovdqu8 [%%GP1 + %%OFFSET], %%ZDATA1
+ vmovdqu8 [%%GP2 + %%OFFSET], %%ZDATA2
+ vmovdqu8 [%%GP3 + %%OFFSET], %%ZDATA3
+%endif
+%else ;; %%MASK_REG argument passed - 1, 2, or 3 block load/store
+%ifidn %%LOAD_STORE, LOAD
+ vmovdqu8 %%ZDATA0{%%MASK_REG}{z}, [%%GP0 + %%OFFSET]
+ vmovdqu8 %%ZDATA1{%%MASK_REG}{z}, [%%GP1 + %%OFFSET]
+ vmovdqu8 %%ZDATA2{%%MASK_REG}{z}, [%%GP2 + %%OFFSET]
+ vmovdqu8 %%ZDATA3{%%MASK_REG}{z}, [%%GP3 + %%OFFSET]
+%else ; STORE
+ vmovdqu8 [%%GP0 + %%OFFSET]{%%MASK_REG}, %%ZDATA0
+ vmovdqu8 [%%GP1 + %%OFFSET]{%%MASK_REG}, %%ZDATA1
+ vmovdqu8 [%%GP2 + %%OFFSET]{%%MASK_REG}, %%ZDATA2
+ vmovdqu8 [%%GP3 + %%OFFSET]{%%MASK_REG}, %%ZDATA3
+%endif
+%endif ;; %%NUM_ARGS
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; AESENC_ROUNDS_x16 macro
+; - 16 lanes, 1 block per lane
+; - handles the special cases of round zero (key whitening XOR) and the
+;   last round (AESENCLAST)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
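+;
+; Per 16-byte block this is the standard AES encryption flow, shown here
+; as pseudocode (%%NROUNDS is 9/11/13 for AES-128/192/256):
+;     S = P XOR K[0]                      ; round 0, key whitening
+;     for r = 1 .. %%NROUNDS:             ; middle rounds
+;         S = AESENC(S, K[r])
+;     C = AESENCLAST(S, K[%%NROUNDS + 1]) ; final round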
+%macro AESENC_ROUNDS_x16 5
+%define %%L00_03 %1 ; [in/out] ZMM with lane 0-3 blocks
+%define %%L04_07 %2 ; [in/out] ZMM with lane 4-7 blocks
+%define %%L08_11 %3 ; [in/out] ZMM with lane 8-11 blocks
+%define %%L12_15 %4 ; [in/out] ZMM with lane 12-15 blocks
+%define %%NROUNDS %5 ; [in] number of aes rounds
+
+%define %%KP ARG + _aesarg_key_tab
+%define K00_03_OFFSET 0
+%define K04_07_OFFSET 64
+%define K08_11_OFFSET 128
+%define K12_15_OFFSET 192
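+
+;; The key table is laid out round-major: each round occupies 16 lanes
+;; x 16 bytes = 256 bytes (the ROUND * (16*16) stride below); the
+;; K*_OFFSET values select a 64-byte group of 4 lanes within the round.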
+
+%assign ROUND 0
+%rep (%%NROUNDS + 2)
+
+%if ROUND < 1
+ ;; XOR with key 0 before doing aesenc
+ vpxorq %%L00_03, [%%KP + K00_03_OFFSET + ROUND * (16*16)]
+ vpxorq %%L04_07, [%%KP + K04_07_OFFSET + ROUND * (16*16)]
+ vpxorq %%L08_11, [%%KP + K08_11_OFFSET + ROUND * (16*16)]
+ vpxorq %%L12_15, [%%KP + K12_15_OFFSET + ROUND * (16*16)]
+%else
+%if ROUND <= %%NROUNDS
+
+ ;; rounds 1 to 9/11/13
+ vaesenc %%L00_03, %%L00_03, [%%KP + K00_03_OFFSET + ROUND * (16*16)]
+ vaesenc %%L04_07, %%L04_07, [%%KP + K04_07_OFFSET + ROUND * (16*16)]
+ vaesenc %%L08_11, %%L08_11, [%%KP + K08_11_OFFSET + ROUND * (16*16)]
+ vaesenc %%L12_15, %%L12_15, [%%KP + K12_15_OFFSET + ROUND * (16*16)]
+%else
+ ;; the last round
+ vaesenclast %%L00_03, %%L00_03, [%%KP + K00_03_OFFSET + ROUND * (16*16)]
+ vaesenclast %%L04_07, %%L04_07, [%%KP + K04_07_OFFSET + ROUND * (16*16)]
+ vaesenclast %%L08_11, %%L08_11, [%%KP + K08_11_OFFSET + ROUND * (16*16)]
+ vaesenclast %%L12_15, %%L12_15, [%%KP + K12_15_OFFSET + ROUND * (16*16)]
+%endif
+%endif
+
+%assign ROUND (ROUND + 1)
+%endrep
+
+%endmacro ; AESENC_ROUNDS_x16
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; ENCRYPT_16_PARALLEL - encrypts all blocks until fewer than 4 remain per lane
+; - Operation
+;   - loop encrypting %%LENGTH bytes of input data
+;   - each iteration encrypts 4 blocks in each of the 16 lanes
+;   - stops when %%LENGTH drops below 64 bytes (4 blocks)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
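+;
+; Loop sketch per iteration (while %%LENGTH >= 64):
+;     load + transpose 4 blocks for each of the 16 lanes
+;     B0 ^= IV;  B0 = AES(B0)         ; all 16 lanes at once
+;     B1 ^= B0;  B1 = AES(B1)         ; CBC chaining within the group
+;     B2 ^= B1;  B2 = AES(B2)
+;     B3 ^= B2;  B3 = AES(B3);  IV = B3
+;     transpose + store; %%LENGTH -= 64; %%IDX += 64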
+%macro ENCRYPT_16_PARALLEL 31
+%define %%ZIV00_03 %1 ;; [in] lane 0-3 IVs
+%define %%ZIV04_07 %2 ;; [in] lane 4-7 IVs
+%define %%ZIV08_11 %3 ;; [in] lane 8-11 IVs
+%define %%ZIV12_15 %4 ;; [in] lane 12-15 IVs
+%define %%LENGTH %5 ;; [in/out] GP register with length in bytes
+%define %%NROUNDS %6 ;; [in] Number of AES rounds; numerical value
+%define %%IDX %7 ;; [clobbered] GP reg to maintain idx
+%define %%B0L00_03 %8 ;; [clobbered] tmp ZMM register
+%define %%B0L04_07 %9 ;; [clobbered] tmp ZMM register
+%define %%B0L08_11 %10 ;; [clobbered] tmp ZMM register
+%define %%B0L12_15 %11 ;; [clobbered] tmp ZMM register
+%define %%B1L00_03 %12 ;; [clobbered] tmp ZMM register
+%define %%B1L04_07 %13 ;; [clobbered] tmp ZMM register
+%define %%B1L08_11 %14 ;; [clobbered] tmp ZMM register
+%define %%B1L12_15 %15 ;; [clobbered] tmp ZMM register
+%define %%B2L00_03 %16 ;; [clobbered] tmp ZMM register
+%define %%B2L04_07 %17 ;; [clobbered] tmp ZMM register
+%define %%B2L08_11 %18 ;; [clobbered] tmp ZMM register
+%define %%B2L12_15 %19 ;; [clobbered] tmp ZMM register
+%define %%B3L00_03 %20 ;; [clobbered] tmp ZMM register
+%define %%B3L04_07 %21 ;; [clobbered] tmp ZMM register
+%define %%B3L08_11 %22 ;; [clobbered] tmp ZMM register
+%define %%B3L12_15 %23 ;; [clobbered] tmp ZMM register
+%define %%ZTMP0 %24 ;; [clobbered] tmp ZMM register
+%define %%ZTMP1 %25 ;; [clobbered] tmp ZMM register
+%define %%ZTMP2 %26 ;; [clobbered] tmp ZMM register
+%define %%ZTMP3 %27 ;; [clobbered] tmp ZMM register
+%define %%TMP0 %28 ;; [clobbered] tmp GP register
+%define %%TMP1 %29 ;; [clobbered] tmp GP register
+%define %%TMP2 %30 ;; [clobbered] tmp GP register
+%define %%TMP3 %31 ;; [clobbered] tmp GP register
+
+%define %%IN ARG + _aesarg_in
+%define %%OUT ARG + _aesarg_out
+
+ ;; check for at least 4 blocks
+ cmp %%LENGTH, 64
+ jl %%encrypt_16_done
+
+ xor %%IDX, %%IDX
+ ;; skip length check on first loop
+ jmp %%encrypt_16_first
+
+%%encrypt_16_start:
+ cmp %%LENGTH, 64
+ jl %%encrypt_16_end
+
+%%encrypt_16_first:
+ ;; load 4 plaintext blocks for lanes 0-3
+ LOAD_STORE_x4 0, 1, 2, 3, %%IN, %%IDX, %%B0L00_03, %%B1L00_03, \
+ %%B2L00_03, %%B3L00_03, %%TMP0, %%TMP1, %%TMP2, %%TMP3, LOAD
+
+ TRANSPOSE_4x4 %%B0L00_03, %%B1L00_03, %%B2L00_03, %%B3L00_03, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 4-7
+ LOAD_STORE_x4 4, 5, 6, 7, %%IN, %%IDX, %%B0L04_07, %%B1L04_07, \
+ %%B2L04_07, %%B3L04_07, %%TMP0, %%TMP1, %%TMP2, %%TMP3, LOAD
+
+ TRANSPOSE_4x4 %%B0L04_07, %%B1L04_07, %%B2L04_07, %%B3L04_07, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 8-11
+ LOAD_STORE_x4 8, 9, 10, 11, %%IN, %%IDX, %%B0L08_11, %%B1L08_11, \
+ %%B2L08_11, %%B3L08_11, %%TMP0, %%TMP1, %%TMP2, %%TMP3, LOAD
+
+ TRANSPOSE_4x4 %%B0L08_11, %%B1L08_11, %%B2L08_11, %%B3L08_11, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 12-15
+ LOAD_STORE_x4 12, 13, 14, 15, %%IN, %%IDX, %%B0L12_15, %%B1L12_15, \
+ %%B2L12_15, %%B3L12_15, %%TMP0, %%TMP1, %%TMP2, %%TMP3, LOAD
+
+ TRANSPOSE_4x4 %%B0L12_15, %%B1L12_15, %%B2L12_15, %%B3L12_15, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; xor first plaintext block with IV
+ vpxorq %%B0L00_03, %%ZIV00_03
+ vpxorq %%B0L04_07, %%ZIV04_07
+ vpxorq %%B0L08_11, %%ZIV08_11
+ vpxorq %%B0L12_15, %%ZIV12_15
+
+ ;; encrypt block 0 lanes
+ AESENC_ROUNDS_x16 %%B0L00_03, %%B0L04_07, %%B0L08_11, %%B0L12_15, %%NROUNDS
+
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B1L00_03, %%B0L00_03
+ vpxorq %%B1L04_07, %%B0L04_07
+ vpxorq %%B1L08_11, %%B0L08_11
+ vpxorq %%B1L12_15, %%B0L12_15
+
+ ;; encrypt block 1 lanes
+ AESENC_ROUNDS_x16 %%B1L00_03, %%B1L04_07, %%B1L08_11, %%B1L12_15, %%NROUNDS
+
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B2L00_03, %%B1L00_03
+ vpxorq %%B2L04_07, %%B1L04_07
+ vpxorq %%B2L08_11, %%B1L08_11
+ vpxorq %%B2L12_15, %%B1L12_15
+
+ ;; encrypt block 2 lanes
+ AESENC_ROUNDS_x16 %%B2L00_03, %%B2L04_07, %%B2L08_11, %%B2L12_15, %%NROUNDS
+
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B3L00_03, %%B2L00_03
+ vpxorq %%B3L04_07, %%B2L04_07
+ vpxorq %%B3L08_11, %%B2L08_11
+ vpxorq %%B3L12_15, %%B2L12_15
+
+ ;; encrypt block 3 lanes
+ AESENC_ROUNDS_x16 %%B3L00_03, %%B3L04_07, %%B3L08_11, %%B3L12_15, %%NROUNDS
+
+ ;; keep last cipher block as IV for the next iteration
+ vmovdqa64 %%ZIV00_03, %%B3L00_03
+ vmovdqa64 %%ZIV04_07, %%B3L04_07
+ vmovdqa64 %%ZIV08_11, %%B3L08_11
+ vmovdqa64 %%ZIV12_15, %%B3L12_15
+
+ ;; write back cipher text for lanes 0-3
+ TRANSPOSE_4x4 %%B0L00_03, %%B1L00_03, %%B2L00_03, %%B3L00_03, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 0, 1, 2, 3, %%OUT, %%IDX, %%B0L00_03, %%B1L00_03, \
+ %%B2L00_03, %%B3L00_03, %%TMP0, %%TMP1, %%TMP2, %%TMP3, STORE
+
+ ;; write back cipher text for lanes 4-7
+ TRANSPOSE_4x4 %%B0L04_07, %%B1L04_07, %%B2L04_07, %%B3L04_07, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 4, 5, 6, 7, %%OUT, %%IDX, %%B0L04_07, %%B1L04_07, \
+ %%B2L04_07, %%B3L04_07, %%TMP0, %%TMP1, %%TMP2, %%TMP3, STORE
+
+ ;; write back cipher text for lanes 8-11
+ TRANSPOSE_4x4 %%B0L08_11, %%B1L08_11, %%B2L08_11, %%B3L08_11, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 8, 9, 10, 11, %%OUT, %%IDX, %%B0L08_11, %%B1L08_11, \
+ %%B2L08_11, %%B3L08_11, %%TMP0, %%TMP1, %%TMP2, %%TMP3, STORE
+
+ ;; write back cipher text for lanes 12-15
+ TRANSPOSE_4x4 %%B0L12_15, %%B1L12_15, %%B2L12_15, %%B3L12_15, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 12, 13, 14, 15, %%OUT, %%IDX, %%B0L12_15, %%B1L12_15, \
+ %%B2L12_15, %%B3L12_15, %%TMP0, %%TMP1, %%TMP2, %%TMP3, STORE
+
+ sub %%LENGTH, 64
+ add %%IDX, 64
+ jmp %%encrypt_16_start
+
+%%encrypt_16_end:
+ ;; update in/out pointers
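+ ;; _aesarg_in and _aesarg_out each hold 16 lane pointers (2 x 64 bytes),
+ ;; so broadcasting %%IDX and adding it with two vpaddq's advances all
+ ;; 16 input (and then output) pointers in one step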
+ vpbroadcastq %%ZTMP2, %%IDX
+ vpaddq %%ZTMP0, %%ZTMP2, [%%IN]
+ vpaddq %%ZTMP1, %%ZTMP2, [%%IN + 64]
+ vmovdqa64 [%%IN], %%ZTMP0
+ vmovdqa64 [%%IN + 64], %%ZTMP1
+
+ vpaddq %%ZTMP0, %%ZTMP2, [%%OUT]
+ vpaddq %%ZTMP1, %%ZTMP2, [%%OUT + 64]
+ vmovdqa64 [%%OUT], %%ZTMP0
+ vmovdqa64 [%%OUT + 64], %%ZTMP1
+
+%%encrypt_16_done:
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; ENCRYPT_16_FINAL - encrypts the final 1-3 blocks across 16 lanes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ENCRYPT_16_FINAL 31
+%define %%ZIV00_03 %1 ;; [in] lane 0-3 IVs
+%define %%ZIV04_07 %2 ;; [in] lane 4-7 IVs
+%define %%ZIV08_11 %3 ;; [in] lane 8-11 IVs
+%define %%ZIV12_15 %4 ;; [in] lane 12-15 IVs
+%define %%NROUNDS %5 ;; [in] Number of AES rounds; numerical value
+%define %%IDX %6 ;; [clobbered] GP reg to maintain idx
+%define %%B0L00_03 %7 ;; [clobbered] tmp ZMM register
+%define %%B0L04_07 %8 ;; [clobbered] tmp ZMM register
+%define %%B0L08_11 %9 ;; [clobbered] tmp ZMM register
+%define %%B0L12_15 %10 ;; [clobbered] tmp ZMM register
+%define %%B1L00_03 %11 ;; [clobbered] tmp ZMM register
+%define %%B1L04_07 %12 ;; [clobbered] tmp ZMM register
+%define %%B1L08_11 %13 ;; [clobbered] tmp ZMM register
+%define %%B1L12_15 %14 ;; [clobbered] tmp ZMM register
+%define %%B2L00_03 %15 ;; [clobbered] tmp ZMM register
+%define %%B2L04_07 %16 ;; [clobbered] tmp ZMM register
+%define %%B2L08_11 %17 ;; [clobbered] tmp ZMM register
+%define %%B2L12_15 %18 ;; [clobbered] tmp ZMM register
+%define %%B3L00_03 %19 ;; [clobbered] tmp ZMM register
+%define %%B3L04_07 %20 ;; [clobbered] tmp ZMM register
+%define %%B3L08_11 %21 ;; [clobbered] tmp ZMM register
+%define %%B3L12_15 %22 ;; [clobbered] tmp ZMM register
+%define %%ZTMP0 %23 ;; [clobbered] tmp ZMM register
+%define %%ZTMP1 %24 ;; [clobbered] tmp ZMM register
+%define %%ZTMP2 %25 ;; [clobbered] tmp ZMM register
+%define %%ZTMP3 %26 ;; [clobbered] tmp ZMM register
+%define %%TMP0 %27 ;; [clobbered] tmp GP register
+%define %%TMP1 %28 ;; [clobbered] tmp GP register
+%define %%TMP2 %29 ;; [clobbered] tmp GP register
+%define %%TMP3 %30 ;; [clobbered] tmp GP register
+%define %%NUM_BLKS %31 ;; [in] number of blocks (numerical value)
+
+%define %%IN ARG + _aesarg_in
+%define %%OUT ARG + _aesarg_out
+
+%if %%NUM_BLKS == 1
+ mov %%TMP0, 0x0000_0000_0000_ffff
+ kmovq k1, %%TMP0
+%elif %%NUM_BLKS == 2
+ mov %%TMP0, 0x0000_0000_ffff_ffff
+ kmovq k1, %%TMP0
+%elif %%NUM_BLKS == 3
+ mov %%TMP0, 0x0000_ffff_ffff_ffff
+ kmovq k1, %%TMP0
+%endif
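+ ;; Before the transpose each ZMM holds up to 4 blocks (64 bytes) of one
+ ;; lane, and vmovdqu8 masking is per byte, so %%NUM_BLKS blocks need a
+ ;; mask with the low %%NUM_BLKS * 16 bits set (e.g. 0xffff = 1 block)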
+ xor %%IDX, %%IDX
+
+ ;; load 4 plaintext blocks for lanes 0-3
+ LOAD_STORE_x4 0, 1, 2, 3, %%IN, %%IDX, %%B0L00_03, %%B1L00_03, \
+ %%B2L00_03, %%B3L00_03, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, LOAD, k1
+
+ TRANSPOSE_4x4 %%B0L00_03, %%B1L00_03, %%B2L00_03, %%B3L00_03, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 4-7
+ LOAD_STORE_x4 4, 5, 6, 7, %%IN, %%IDX, %%B0L04_07, %%B1L04_07, \
+ %%B2L04_07, %%B3L04_07, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, LOAD, k1
+
+ TRANSPOSE_4x4 %%B0L04_07, %%B1L04_07, %%B2L04_07, %%B3L04_07, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 8-11
+ LOAD_STORE_x4 8, 9, 10, 11, %%IN, %%IDX, %%B0L08_11, %%B1L08_11, \
+ %%B2L08_11, %%B3L08_11, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, LOAD, k1
+
+ TRANSPOSE_4x4 %%B0L08_11, %%B1L08_11, %%B2L08_11, %%B3L08_11, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 12-15
+ LOAD_STORE_x4 12, 13, 14, 15, %%IN, %%IDX, %%B0L12_15, %%B1L12_15, \
+ %%B2L12_15, %%B3L12_15, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, LOAD, k1
+
+ TRANSPOSE_4x4 %%B0L12_15, %%B1L12_15, %%B2L12_15, %%B3L12_15, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; xor plaintext block with IV
+ vpxorq %%B0L00_03, %%ZIV00_03
+ vpxorq %%B0L04_07, %%ZIV04_07
+ vpxorq %%B0L08_11, %%ZIV08_11
+ vpxorq %%B0L12_15, %%ZIV12_15
+
+ ;; encrypt block 0 lanes
+ AESENC_ROUNDS_x16 %%B0L00_03, %%B0L04_07, %%B0L08_11, %%B0L12_15, %%NROUNDS
+
+%if %%NUM_BLKS == 1
+ ;; store last cipher block
+ vmovdqa64 %%ZIV00_03, %%B0L00_03
+ vmovdqa64 %%ZIV04_07, %%B0L04_07
+ vmovdqa64 %%ZIV08_11, %%B0L08_11
+ vmovdqa64 %%ZIV12_15, %%B0L12_15
+%endif
+
+%if %%NUM_BLKS >= 2
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B1L00_03, %%B0L00_03
+ vpxorq %%B1L04_07, %%B0L04_07
+ vpxorq %%B1L08_11, %%B0L08_11
+ vpxorq %%B1L12_15, %%B0L12_15
+
+ ;; encrypt block 1 lanes
+ AESENC_ROUNDS_x16 %%B1L00_03, %%B1L04_07, %%B1L08_11, %%B1L12_15, %%NROUNDS
+%endif
+%if %%NUM_BLKS == 2
+ ;; store last cipher block
+ vmovdqa64 %%ZIV00_03, %%B1L00_03
+ vmovdqa64 %%ZIV04_07, %%B1L04_07
+ vmovdqa64 %%ZIV08_11, %%B1L08_11
+ vmovdqa64 %%ZIV12_15, %%B1L12_15
+%endif
+
+%if %%NUM_BLKS >= 3
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B2L00_03, %%B1L00_03
+ vpxorq %%B2L04_07, %%B1L04_07
+ vpxorq %%B2L08_11, %%B1L08_11
+ vpxorq %%B2L12_15, %%B1L12_15
+
+ ;; encrypt block 2 lanes
+ AESENC_ROUNDS_x16 %%B2L00_03, %%B2L04_07, %%B2L08_11, %%B2L12_15, %%NROUNDS
+%endif
+%if %%NUM_BLKS == 3
+ ;; store last cipher block
+ vmovdqa64 %%ZIV00_03, %%B2L00_03
+ vmovdqa64 %%ZIV04_07, %%B2L04_07
+ vmovdqa64 %%ZIV08_11, %%B2L08_11
+ vmovdqa64 %%ZIV12_15, %%B2L12_15
+%endif
+ ;; write back cipher text for lanes 0-3
+ TRANSPOSE_4x4 %%B0L00_03, %%B1L00_03, %%B2L00_03, %%B3L00_03, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 0, 1, 2, 3, %%OUT, %%IDX, %%B0L00_03, %%B1L00_03, \
+ %%B2L00_03, %%B3L00_03, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, STORE, k1
+
+ ;; write back cipher text for lanes 4-7
+ TRANSPOSE_4x4 %%B0L04_07, %%B1L04_07, %%B2L04_07, %%B3L04_07, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 4, 5, 6, 7, %%OUT, %%IDX, %%B0L04_07, %%B1L04_07, \
+ %%B2L04_07, %%B3L04_07, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, STORE, k1
+
+ ;; write back cipher text for lanes 8-11
+ TRANSPOSE_4x4 %%B0L08_11, %%B1L08_11, %%B2L08_11, %%B3L08_11, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 8, 9, 10, 11, %%OUT, %%IDX, %%B0L08_11, %%B1L08_11, \
+ %%B2L08_11, %%B3L08_11, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, STORE, k1
+
+ ;; write back cipher text for lanes 12-15
+ TRANSPOSE_4x4 %%B0L12_15, %%B1L12_15, %%B2L12_15, %%B3L12_15, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 12, 13, 14, 15, %%OUT, %%IDX, %%B0L12_15, %%B1L12_15, \
+ %%B2L12_15, %%B3L12_15, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, STORE, k1
+
+ ;; update in/out pointers
+ mov %%IDX, %%NUM_BLKS
+ shl %%IDX, 4
+ vpbroadcastq %%ZTMP2, %%IDX
+ vpaddq %%ZTMP0, %%ZTMP2, [%%IN]
+ vpaddq %%ZTMP1, %%ZTMP2, [%%IN + 64]
+ vmovdqa64 [%%IN], %%ZTMP0
+ vmovdqa64 [%%IN + 64], %%ZTMP1
+
+ vpaddq %%ZTMP0, %%ZTMP2, [%%OUT]
+ vpaddq %%ZTMP1, %%ZTMP2, [%%OUT + 64]
+ vmovdqa64 [%%OUT], %%ZTMP0
+ vmovdqa64 [%%OUT + 64], %%ZTMP1
+
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CBC_ENC - encrypts the given data
+; - requires the input data to be at least 1 block (16 bytes) long
+; - input: number of AES rounds
+;
+; - first encrypts blocks until fewer than 4 remain per lane
+; - then encrypts the final blocks (fewer than 4)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
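+;
+; e.g. with LEN = 96 bytes per lane: the parallel loop encrypts 64 bytes
+; (4 blocks) and exits with LEN = 32; (LEN >> 4) & 3 = 2, so
+; ENCRYPT_16_FINAL runs with NUM_BLKS = 2 for the remaining 2 blocks.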
+%macro CBC_ENC 1
+%define %%ROUNDS %1
+
+ ;; load transpose tables
+ vmovdqa64 TAB_A0B0A1B1, [rel A0B0A1B1]
+ vmovdqa64 TAB_A2B2A3B3, [rel A2B2A3B3]
+
+ ;; load IVs per lane
+ vmovdqa64 ZIV00_03, [ARG + _aesarg_IV + 16*0]
+ vmovdqa64 ZIV04_07, [ARG + _aesarg_IV + 16*4]
+ vmovdqa64 ZIV08_11, [ARG + _aesarg_IV + 16*8]
+ vmovdqa64 ZIV12_15, [ARG + _aesarg_IV + 16*12]
+
+ ENCRYPT_16_PARALLEL ZIV00_03, ZIV04_07, ZIV08_11, ZIV12_15, \
+ LEN, %%ROUNDS, IA12, ZT0, ZT1, ZT2, ZT3, ZT4, ZT5, \
+ ZT6, ZT7, ZT8, ZT9, ZT10, ZT11, ZT12, ZT13, ZT14, \
+ ZT15, ZT16, ZT17, ZT18, ZT19, IA2, IA3, IA4, IA5
+
+ ;; get number of remaining blocks (0-3)
+ shr LEN, 4
+ and LEN, 3
+ je %%_cbc_enc_done
+ cmp LEN, 1
+ je %%_final_blocks_1
+ cmp LEN, 2
+ je %%_final_blocks_2
+
+%%_final_blocks_3:
+ ENCRYPT_16_FINAL ZIV00_03, ZIV04_07, ZIV08_11, ZIV12_15, \
+ %%ROUNDS, IA12, ZT0, ZT1, ZT2, ZT3, ZT4, ZT5, ZT6, ZT7, \
+ ZT8, ZT9, ZT10, ZT11, ZT12, ZT13, ZT14, ZT15, ZT16, ZT17, \
+ ZT18, ZT19, IA2, IA3, IA4, IA5, 3
+ jmp %%_cbc_enc_done
+%%_final_blocks_1:
+ ENCRYPT_16_FINAL ZIV00_03, ZIV04_07, ZIV08_11, ZIV12_15, \
+ %%ROUNDS, IA12, ZT0, ZT1, ZT2, ZT3, ZT4, ZT5, ZT6, ZT7, \
+ ZT8, ZT9, ZT10, ZT11, ZT12, ZT13, ZT14, ZT15, ZT16, ZT17, \
+ ZT18, ZT19, IA2, IA3, IA4, IA5, 1
+ jmp %%_cbc_enc_done
+%%_final_blocks_2:
+ ENCRYPT_16_FINAL ZIV00_03, ZIV04_07, ZIV08_11, ZIV12_15, \
+ %%ROUNDS, IA12, ZT0, ZT1, ZT2, ZT3, ZT4, ZT5, ZT6, ZT7, \
+ ZT8, ZT9, ZT10, ZT11, ZT12, ZT13, ZT14, ZT15, ZT16, ZT17, \
+ ZT18, ZT19, IA2, IA3, IA4, IA5, 2
+%%_cbc_enc_done:
+ ;; store IVs per lane
+ vmovdqa64 [ARG + _aesarg_IV + 16*0], ZIV00_03
+ vmovdqa64 [ARG + _aesarg_IV + 16*4], ZIV04_07
+ vmovdqa64 [ARG + _aesarg_IV + 16*8], ZIV08_11
+ vmovdqa64 [ARG + _aesarg_IV + 16*12], ZIV12_15
+%endmacro
+
+
+section .data
+;;;;;;;;;;;;;;;;;;
+; Transpose tables
+;;;;;;;;;;;;;;;;;;
+default rel
+
+align 64
+A0B0A1B1:
+ dq 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb
+
+align 64
+A2B2A3B3:
+ dq 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, 0xf
+
+
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_128_vaes_avx512(AES_ARGS *args, uint64_t len_in_bytes);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_enc_128_vaes_avx512,function,internal)
+aes_cbc_enc_128_vaes_avx512:
+ FUNC_SAVE
+ CBC_ENC 9
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_192_vaes_avx512(AES_ARGS *args, uint64_t len_in_bytes);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_enc_192_vaes_avx512,function,internal)
+aes_cbc_enc_192_vaes_avx512:
+ FUNC_SAVE
+ CBC_ENC 11
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_256_vaes_avx512(AES_ARGS *args, uint64_t len_in_bytes);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_enc_256_vaes_avx512,function,internal)
+aes_cbc_enc_256_vaes_avx512:
+ FUNC_SAVE
+ CBC_ENC 13
+ FUNC_RESTORE
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif