Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm')
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm | 537
1 file changed, 537 insertions(+), 0 deletions(-)
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm
new file mode 100644
index 000000000..9d132ec5f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm
@@ -0,0 +1,537 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+%include "include/memcpy.asm"
+
+%ifndef AES128_CBC_MAC
+
+%define AES128_CBC_MAC aes128_cbc_mac_x8
+%define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_avx
+%define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_avx
+
+%endif
+
+extern AES128_CBC_MAC
+
+section .data
+default rel
+
+align 16
+len_mask:
+ dq 0xFFFFFFFFFFFFFFF0
+align 16
+len_masks:
+ dq 0x000000000000FFFF, 0x0000000000000000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ dq 0x0000000000000000, 0xFFFF000000000000
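+;; OR-ing len_masks[I] into the length vector sets lane I's 16-bit length
+;; to 0xFFFF, so vphminposuw never selects that lane as the minimum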
+dupw:
+ dq 0x0100010001000100, 0x0100010001000100
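+;; vpshufb control that broadcasts word 0 (the minimum length) to all 8 words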
+counter_mask:
+ dq 0xFFFFFFFFFFFFFF07, 0x0000FFFFFFFFFFFF
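+;; AND-ing the saved Block 0 with counter_mask yields CCM counter block A_0:
+;; byte 0 keeps only the L' bits (0x07), bytes 1-13 keep the nonce and
+;; bytes 14-15 are cleared so the block counter starts at 0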
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%define NROUNDS 9 ; AES-CCM-128
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+%define tmp4 rax
+%define auth_len_aad rax
+
+%define min_idx rbp
+%define flags rbp
+
+%define lane r8
+
+%define iv_len r9
+%define auth_len r9
+
+%define aad_len r10
+%define init_block_addr r11
+
+%define unused_lanes rbx
+%define r rbx
+
+%define tmp r12
+%define tmp2 r13
+%define tmp3 r14
+
+%define good_lane r15
+%define min_job r15
+
+%define init_block0 xmm0
+%define ccm_lens xmm1
+%define min_len_idx xmm2
+%define xtmp0 xmm3
+%define xtmp1 xmm4
+%define xtmp2 xmm5
+%define xtmp3 xmm6
+
+; STACK size needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i]
+%endmacro
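+;; (plain AES-128 encryption of one block: whitening XOR, NROUNDS vaesenc
+;; rounds and a final vaesenclast, using the expanded keys at GDATA)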
+
+;;; ===========================================================================
+;;; AES CCM auth job submit & flush
+;;; ===========================================================================
+;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
+%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX 1
+%define %%SUBMIT_FLUSH %1
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ;; Find free lane
+ mov unused_lanes, [state + _aes_ccm_unused_lanes]
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+
+ mov lane, unused_lanes
+ and lane, 15
+ shr unused_lanes, 4
+ mov [state + _aes_ccm_unused_lanes], unused_lanes
+
+ ;; Copy job info into lane
+ mov [state + _aes_ccm_job_in_lane + lane*8], job
+ ;; Copy keys into lane args
+ mov tmp, [job + _aes_enc_key_expanded]
+ mov [state + _aes_ccm_args_keys + lane*8], tmp
+ ;; init_done = 0
+ mov word [state + _aes_ccm_init_done + lane*2], 0
+ lea tmp, [lane * 8]
+
+ vpxor init_block0, init_block0
+ vmovdqa [state + _aes_ccm_args_IV + tmp*2], init_block0
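+ ;; The lane's IV slot doubles as the running CBC-MAC value;
+ ;; CBC-MAC starts from an all-zero IV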
+
+ ;; Prepare initial Block 0 for CBC-MAC-128
+
+ ;; Byte 0: flags with L' and M' (AAD later)
+ ;; Calculate L' = 15 - IV length - 1 = 14 - IV length
+ mov flags, 14
+ mov iv_len, [job + _iv_len_in_bytes]
+ sub flags, iv_len
+ ;; Calculate M' = (Digest length - 2) / 2
+ mov tmp, [job + _auth_tag_output_len_in_bytes]
+ sub tmp, 2
+
+ shl tmp, 2 ; M' << 3 (the >> 1 to halve and the << 3 merge into a single << 2)
+ or flags, tmp
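+ ;; e.g. 12-byte IV, 16-byte tag: L' = 14 - 12 = 2, M' = (16 - 2)/2 = 7,
+ ;; flags = (7 << 3) | 2 = 0x3a (the Adata bit is OR-ed in later if AAD > 0)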
+
+ ;; Bytes 1 - 13: Nonce (7 - 13 bytes long)
+
+ ;; Bytes 1 - 7 are always copied (first 7 bytes)
+ mov tmp, [job + _iv]
+ vpinsrb init_block0, [tmp], 1
+ vpinsrw init_block0, [tmp + 1], 1
+ vpinsrd init_block0, [tmp + 3], 1
+
+ cmp iv_len, 7
+ je %%_finish_nonce_move
+
+ cmp iv_len, 8
+ je %%_iv_length_8
+ cmp iv_len, 9
+ je %%_iv_length_9
+ cmp iv_len, 10
+ je %%_iv_length_10
+ cmp iv_len, 11
+ je %%_iv_length_11
+ cmp iv_len, 12
+ je %%_iv_length_12
+
+ ;; Bytes 8 - 13
+%%_iv_length_13:
+ vpinsrb init_block0, [tmp + 12], 13
+%%_iv_length_12:
+ vpinsrb init_block0, [tmp + 11], 12
+%%_iv_length_11:
+ vpinsrd init_block0, [tmp + 7], 2
+ jmp %%_finish_nonce_move
+%%_iv_length_10:
+ vpinsrb init_block0, [tmp + 9], 10
+%%_iv_length_9:
+ vpinsrb init_block0, [tmp + 8], 9
+%%_iv_length_8:
+ vpinsrb init_block0, [tmp + 7], 8
+
+%%_finish_nonce_move:
+
+ ;; Bytes 14 & 15 (message length), in Big Endian
+ mov ax, [job + _msg_len_to_hash_in_bytes]
+ xchg al, ah
+ vpinsrw init_block0, ax, 7
+
+ mov aad_len, [job + _cbcmac_aad_len]
+ ;; Initial length to authenticate (Block 0)
+ mov auth_len, 16
+ ;; Length to authenticate (Block 0 + 2-byte AAD length + AAD, padded
+ ;; so the total is a multiple of 16B)
+ lea auth_len_aad, [aad_len + (2 + 15) + 16]
+ and auth_len_aad, -16
+
+ or aad_len, aad_len
+ cmovne auth_len, auth_len_aad
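+ ;; e.g. aad_len = 20: (20 + 33) & -16 = 48 = 16 (Block 0) + 32 (2B length + 20B AAD, padded)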
+ ;; Update lengths to authenticate and find min length
+ vmovdqa ccm_lens, [state + _aes_ccm_lens]
+ XVPINSRW ccm_lens, xtmp0, tmp2, lane, auth_len, scale_x16
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+ vphminposuw min_len_idx, ccm_lens
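+ ;; _aes_ccm_lens keeps one 16-bit length per lane; vphminposuw returns the
+ ;; smallest length in word 0 of min_len_idx and its lane index in word 1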
+
+ mov tmp, lane
+ shl tmp, 6
+ lea init_block_addr, [state + _aes_ccm_init_blocks + tmp]
+ or aad_len, aad_len
+ je %%_aad_complete
+
+ or flags, (1 << 6) ; Set Adata bit in flags
+
+ ;; Copy AAD
+ ;; Set all 0s in last block (padding)
+ lea tmp, [init_block_addr + auth_len]
+ sub tmp, 16
+ vpxor xtmp0, xtmp0
+ vmovdqa [tmp], xtmp0
+
+ ;; Start copying from second block
+ lea tmp, [init_block_addr+16]
+ mov rax, aad_len
+ xchg al, ah
+ mov [tmp], ax
+ add tmp, 2
+ mov tmp2, [job + _cbcmac_aad]
+ memcpy_avx_64_1 tmp, tmp2, aad_len, tmp3, tmp4, xtmp0, xtmp1, xtmp2, xtmp3
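+ ;; The lane's 64-byte init block area now holds Block 0, a 2-byte big-endian
+ ;; AAD length, the AAD itself and zero padding up to a 16-byte boundary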
+
+%%_aad_complete:
+
+ ;; Finish Block 0 with Byte 0
+ vpinsrb init_block0, BYTE(flags), 0
+ vmovdqa [init_block_addr], init_block0
+
+ mov [state + _aes_ccm_args_in + lane * 8], init_block_addr
+
+ cmp byte [state + _aes_ccm_unused_lanes], 0xf
+ jne %%_return_null
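+ ;; Jobs are only kicked off once every lane is occupied; until then the
+ ;; job stays queued in its lane and NULL is returned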
+
+%else ; end SUBMIT
+
+ ;; Check at least one job
+ bt unused_lanes, 35
+ jc %%_return_null
+
+ ;; Find a lane with a non-null job
+ xor good_lane, good_lane
+ cmp QWORD [state + _aes_ccm_job_in_lane + 1*8], 0
+ cmovne good_lane, [rel one]
+ cmp QWORD [state + _aes_ccm_job_in_lane + 2*8], 0
+ cmovne good_lane, [rel two]
+ cmp QWORD [state + _aes_ccm_job_in_lane + 3*8], 0
+ cmovne good_lane, [rel three]
+ cmp qword [state + _aes_ccm_job_in_lane + 4*8], 0
+ cmovne good_lane, [rel four]
+ cmp qword [state + _aes_ccm_job_in_lane + 5*8], 0
+ cmovne good_lane, [rel five]
+ cmp qword [state + _aes_ccm_job_in_lane + 6*8], 0
+ cmovne good_lane, [rel six]
+ cmp qword [state + _aes_ccm_job_in_lane + 7*8], 0
+ cmovne good_lane, [rel seven]
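+ ;; (cmovne cannot take an immediate operand, hence the one..seven qword
+ ;; constants in the data section)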
+
+ ; Copy good_lane to empty lanes
+ movzx tmp, word [state + _aes_ccm_init_done + good_lane*2]
+ mov tmp2, [state + _aes_ccm_args_in + good_lane*8]
+ mov tmp3, [state + _aes_ccm_args_keys + good_lane*8]
+ shl good_lane, 4 ; multiply by 16
+ vmovdqa xtmp0, [state + _aes_ccm_args_IV + good_lane]
+ vmovdqa ccm_lens, [state + _aes_ccm_lens]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_ccm_job_in_lane + I*8], 0
+ jne APPEND(skip_,I)
+ vpor ccm_lens, [rel len_masks + 16*I]
+ mov [state + _aes_ccm_init_done + I*2], WORD(tmp)
+ mov [state + _aes_ccm_args_in + I*8], tmp2
+ mov [state + _aes_ccm_args_keys + I*8], tmp3
+ vmovdqa [state + _aes_ccm_args_IV + I*16], xtmp0
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+ ;; Find min length
+ vphminposuw min_len_idx, ccm_lens
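+ ;; Idle lanes now point at the good lane's buffers with length 0xFFFF, so
+ ;; they are never selected as the minimum and their output is discarded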
+
+%endif ; end FLUSH
+
+%%_ccm_round:
+ vpextrw len2, min_len_idx, 0 ; min value
+ vpextrw min_idx, min_len_idx, 1 ; min index (0...7)
+
+ mov min_job, [state + _aes_ccm_job_in_lane + min_idx*8]
+
+ or len2, len2
+ je %%_len_is_0
+ ;; subtract min length from all lengths
+ vpshufb min_len_idx, min_len_idx, [rel dupw] ; broadcast min length
+ vpsubw ccm_lens, min_len_idx
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+
+ ; "state" and "args" are the same address, arg1
+ ; len2 is arg2
+ call AES128_CBC_MAC
+ ; state and min_idx are intact
+
+%%_len_is_0:
+
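+ ;; init_done tracks the per-lane CBC-MAC stage: 0 = Block 0 + AAD hashed,
+ ;; 1 = full payload blocks hashed, 2 = final partial block hashed;
+ ;; stage 2 falls through to the digest encryption below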
+ movzx tmp, WORD [state + _aes_ccm_init_done + min_idx*2]
+ cmp WORD(tmp), 0
+ je %%_prepare_full_blocks_to_auth
+ cmp WORD(tmp), 1
+ je %%_prepare_partial_block_to_auth
+
+%%_encrypt_digest:
+
+ ;; Set counter block 0 (reusing previous initial block 0)
+ mov tmp, min_idx
+ shl tmp, 3
+ vmovdqa init_block0, [state + _aes_ccm_init_blocks + tmp * 8]
+
+ vpand init_block0, [rel counter_mask]
+
+ mov tmp2, [state + _aes_ccm_args_keys + tmp]
+ ENCRYPT_SINGLE_BLOCK tmp2, init_block0
+ vpxor init_block0, [state + _aes_ccm_args_IV + tmp * 2]
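+ ;; init_block0 now holds the tag T = E_K(A_0) XOR CBC-MAC digest
+ ;; (the digest was accumulated in the lane's IV slot)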
+
+ ;; Copy Mlen bytes into auth_tag_output (Mlen = 4,6,8,10,12,14,16)
+ mov min_job, [state + _aes_ccm_job_in_lane + tmp]
+ mov tmp3, [min_job + _auth_tag_output_len_in_bytes]
+ mov tmp2, [min_job + _auth_tag_output]
+
+ simd_store_avx tmp2, init_block0, tmp3, tmp, tmp4
+%%_update_lanes:
+ ; Update unused lanes
+ mov unused_lanes, [state + _aes_ccm_unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, min_idx
+ mov [state + _aes_ccm_unused_lanes], unused_lanes
+
+ ; Set return job
+ mov job_rax, min_job
+
+ mov qword [state + _aes_ccm_job_in_lane + min_idx*8], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+
+%ifdef SAFE_DATA
+ vpxor xtmp0, xtmp0
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+ shl min_idx, 3
+ ;; Clear digest (in memory for CBC IV), counter block 0 and AAD of returned job
+ vmovdqa [state + _aes_ccm_args_IV + min_idx * 2], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 16], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 32], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 48], xtmp0
+ mov qword [state + _aes_ccm_args_keys + min_idx], 0
+%else
+ ;; Clear digest (in memory for CBC IV), counter block 0 and AAD
+ ;; of returned job and "NULL lanes"
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_ccm_job_in_lane + I*8], 0
+ jne APPEND(skip_clear_,I)
+ vmovdqa [state + _aes_ccm_args_IV + I*16], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + I*64], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + I*64 + 16], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + I*64 + 32], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + I*64 + 48], xtmp0
+ mov qword [state + _aes_ccm_args_keys + I*8], 0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SUBMIT
+%endif ;; SAFE_DATA
+
+%%_return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%%_return_null:
+ xor job_rax, job_rax
+ jmp %%_return
+
+%%_prepare_full_blocks_to_auth:
+
+ cmp dword [min_job + _cipher_direction], 2 ; DECRYPT
+ je %%_decrypt
+
+%%_encrypt:
+ mov tmp, [min_job + _src]
+ add tmp, [min_job + _hash_start_src_offset_in_bytes]
+ jmp %%_set_init_done_1
+
+%%_decrypt:
+ mov tmp, [min_job + _dst]
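+ ;; For decryption the CBC-MAC is computed over the decrypted plaintext at _dst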
+
+%%_set_init_done_1:
+ mov [state + _aes_ccm_args_in + min_idx*8], tmp
+ mov word [state + _aes_ccm_init_done + min_idx*2], 1
+
+ ; Check if there are full blocks to hash
+ mov tmp, [min_job + _msg_len_to_hash_in_bytes]
+ and tmp, -16
+ je %%_prepare_partial_block_to_auth
+
+ ;; Update lengths to authenticate and find min length
+ vmovdqa ccm_lens, [state + _aes_ccm_lens]
+ XVPINSRW ccm_lens, xtmp0, tmp2, min_idx, tmp, scale_x16
+ vphminposuw min_len_idx, ccm_lens
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+
+ jmp %%_ccm_round
+
+%%_prepare_partial_block_to_auth:
+ ; Check if partial block needs to be hashed
+ mov auth_len, [min_job + _msg_len_to_hash_in_bytes]
+ and auth_len, 15
+ je %%_encrypt_digest
+
+ mov word [state + _aes_ccm_init_done + min_idx * 2], 2
+ ;; Update lengths to authenticate and find min length
+ vmovdqa ccm_lens, [state + _aes_ccm_lens]
+ XVPINSRW ccm_lens, xtmp0, tmp2, min_idx, 16, scale_x16
+ vphminposuw min_len_idx, ccm_lens
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+
+ mov tmp2, min_idx
+ shl tmp2, 6
+ add tmp2, 16 ; pb[AES_BLOCK_SIZE]
+ lea init_block_addr, [state + _aes_ccm_init_blocks + tmp2]
+ mov tmp2, [state + _aes_ccm_args_in + min_idx * 8]
+
+ simd_load_avx_15_1 xtmp0, tmp2, auth_len
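+ ;; Load the 1-15 trailing payload bytes; the remainder of the 16-byte block
+ ;; stays zero, forming the zero-padded final CBC-MAC block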
+
+%%_finish_partial_block_copy:
+ vmovdqa [init_block_addr], xtmp0
+ mov [state + _aes_ccm_args_in + min_idx * 8], init_block_addr
+
+ jmp %%_ccm_round
+%endmacro
+
+
+align 64
+; JOB_AES_HMAC * submit_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(SUBMIT_JOB_AES_CCM_AUTH,function,internal)
+SUBMIT_JOB_AES_CCM_AUTH:
+ GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX SUBMIT
+
+; JOB_AES_HMAC * flush_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state)
+; arg 1 : state
+MKGLOBAL(FLUSH_JOB_AES_CCM_AUTH,function,internal)
+FLUSH_JOB_AES_CCM_AUTH:
+ GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX FLUSH
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif