Diffstat (limited to 'src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_cmac_submit_flush_sse.asm')
-rw-r--r-- src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_cmac_submit_flush_sse.asm | 502
1 file changed, 502 insertions(+), 0 deletions(-)
diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_cmac_submit_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_cmac_submit_flush_sse.asm
new file mode 100644
index 000000000..01c6315bd
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_cmac_submit_flush_sse.asm
@@ -0,0 +1,502 @@
+;;
+;; Copyright (c) 2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+%ifndef AES128_CBC_MAC
+
+%define AES128_CBC_MAC aes128_cbc_mac_x4
+%define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_sse
+%define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_sse
+
+%endif
+
+extern AES128_CBC_MAC
+
+section .data
+default rel
+
+align 16
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+; idx needs to be in rbp
+%define len rbp
+%define idx rbp
+%define tmp rbp
+
+%define lane r8
+
+%define iv r9
+%define m_last r10
+%define n r11
+
+%define unused_lanes rbx
+%define r rbx
+
+%define tmp3 r12
+%define tmp4 r13
+%define tmp2 r14
+
+%define good_lane r15
+%define rbits r15
+
+; STACK_size needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+;;; ===========================================================================
+;;; AES CMAC job submit & flush
+;;; ===========================================================================
+;;; SUBMIT_FLUSH [in] - job operation selection: SUBMIT or FLUSH
+%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE 1
+%define %%SUBMIT_FLUSH %1
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ;; Find free lane
+ mov unused_lanes, [state + _aes_cmac_unused_lanes]
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ mov [state + _aes_cmac_unused_lanes], unused_lanes
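+ ;; unused_lanes is a nibble stack of free lane indices (0xF3210 when
+ ;; all four lanes are free); the AND/SHR above pop the lowest nibble
+ ;; as the lane to use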
+
+ ;; Copy job info into lane
+ mov [state + _aes_cmac_job_in_lane + lane*8], job
+ ;; Copy keys into lane args
+ mov tmp, [job + _key_expanded]
+ mov [state + _aes_cmac_args_keys + lane*8], tmp
+ mov tmp, lane
+ shl tmp, 4 ; lane*16
+
+ ;; Zero IV to store digest
+ pxor xmm0, xmm0
+ movdqa [state + _aes_cmac_args_IV + tmp], xmm0
+
+ lea m_last, [state + _aes_cmac_scratch + tmp]
+
+ ;; calculate len in bytes
+ ;; (the job carries the CMAC message length in bits)
+ mov len, [job + _msg_len_to_hash_in_bits]
+ mov rbits, len
+ add len, 7 ; inc len if there are remainder bits
+ shr len, 3
+ and rbits, 7
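+ ;; e.g. a 65-bit message: len = (65 + 7) >> 3 = 9 bytes, rbits = 65 & 7 = 1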
+
+ ;; Compute the number of 16-byte blocks: n = ceil(len / 16)
+ mov n, len
+ add n, 0xf
+ shr n, 4
+
+ ;; r = len mod 16 (bytes in a final partial block)
+ mov r, len
+ and r, 0xf
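+ ;; e.g. len = 25 bytes: n = (25 + 15) >> 4 = 2 blocks, r = 25 & 15 = 9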
+
+ or n, n ; zero-length message (n == 0)?
+ jz %%_lt_one_block
+
+ ;; One or more blocks, potentially partial
+ mov word [state + _aes_cmac_init_done + lane*2], 0
+
+ mov tmp2, [job + _src]
+ add tmp2, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _aes_cmac_args_in + lane*8], tmp2
+
+ ;; len = (n-1)*16
+ lea tmp2, [n - 1]
+ shl tmp2, 4
+ movdqa xmm0, [state + _aes_cmac_lens]
+ XPINSRW xmm0, xmm1, tmp, lane, tmp2, scale_x16
+ movdqa [state + _aes_cmac_lens], xmm0
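+ ;; _aes_cmac_lens holds a 16-bit byte count per lane; XPINSRW (from
+ ;; const.inc) writes tmp2 into word 'lane'. Only the first n-1 blocks
+ ;; are scheduled here; M_last is ciphered in a second pass.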
+
+ ;; check remainder bits
+ or rbits, rbits
+ jnz %%_not_complete_block_3gpp
+
+ ;; check if complete block
+ or r, r
+ jz %%_complete_block
+
+%%_not_complete_block:
+ ;; M_last = padding(M_n) XOR K2
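+ ;; padding_0x80_tab16 (const.inc) is 16 x 0x00, 0x80, 15 x 0x00: a
+ ;; 16-byte load from (table + 16 - r) yields 0x80 at byte r and zeros
+ ;; above it; the memcpy below fills bytes 0..r-1 with message data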
+ lea tmp, [rel padding_0x80_tab16 + 16]
+ sub tmp, r
+ movdqu xmm0, [tmp]
+ movdqa [m_last], xmm0
+
+ mov tmp, [job + _src]
+ add tmp, [job + _hash_start_src_offset_in_bytes]
+ lea tmp3, [n - 1]
+ shl tmp3, 4
+ add tmp, tmp3
+
+ memcpy_sse_16 m_last, tmp, r, tmp4, iv
+
+ ;; XOR padded block with subkey K2 (skey2)
+ mov tmp3, [job + _skey2]
+ movdqa xmm1, [m_last]
+ movdqu xmm0, [tmp3]
+ pxor xmm0, xmm1
+ movdqa [m_last], xmm0
+
+%%_step_5:
+ ;; Find min length
+ movdqa xmm0, [state + _aes_cmac_lens]
+ phminposuw xmm1, xmm0
+
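+ ;; Ciphering only starts once all four lanes are busy: the nibble
+ ;; stack then holds just the 0xF sentinel, so its low byte is 0x0f.
+ ;; Otherwise return NULL and wait for more submits (or a flush).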
+ cmp byte [state + _aes_cmac_unused_lanes], 0xf
+ jne %%_return_null
+
+%else ; end SUBMIT
+
+ ;; Check at least one job
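+ ;; bit 19 is the top bit of the 0xF sentinel nibble, which sits at
+ ;; bits 16-19 only while all four lane indices are stacked (no jobs)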
+ bt unused_lanes, 19
+ jc %%_return_null
+
+ ;; Find a lane with a non-null job
+ xor good_lane, good_lane
+ cmp qword [state + _aes_cmac_job_in_lane + 1*8], 0
+ cmovne good_lane, [rel one]
+ cmp qword [state + _aes_cmac_job_in_lane + 2*8], 0
+ cmovne good_lane, [rel two]
+ cmp qword [state + _aes_cmac_job_in_lane + 3*8], 0
+ cmovne good_lane, [rel three]
+
+ ; Copy good_lane to empty lanes
+ mov tmp2, [state + _aes_cmac_args_in + good_lane*8]
+ mov tmp3, [state + _aes_cmac_args_keys + good_lane*8]
+ shl good_lane, 4 ; multiply by 16
+ movdqa xmm2, [state + _aes_cmac_args_IV + good_lane]
+ movdqa xmm0, [state + _aes_cmac_lens]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _aes_cmac_job_in_lane + I*8], 0
+ jne APPEND(skip_,I)
+ mov [state + _aes_cmac_args_in + I*8], tmp2
+ mov [state + _aes_cmac_args_keys + I*8], tmp3
+ movdqa [state + _aes_cmac_args_IV + I*16], xmm2
+ por xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
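+ ;; Null lanes now mirror the good lane's src/key pointers and IV, and
+ ;; len_masks forces their length words to 0xFFFF so phminposuw below
+ ;; never selects them; they just cipher redundant data alongside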
+ ;; Find min length
+ phminposuw xmm1, xmm0
+
+%endif ; end FLUSH
+
+%%_cmac_round:
+ pextrw len2, xmm1, 0 ; min value
+ pextrw idx, xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je %%_len_is_0
+ pshuflw xmm1, xmm1, 0
+ psubw xmm0, xmm1
+ movdqa [state + _aes_cmac_lens], xmm0
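+ ;; every lane's count has been reduced by the common minimum; the x4
+ ;; core below ciphers len2 bytes on all four lanes in lock step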
+
+ ; "state" and "args" are the same address, arg1
+ ; len2 is arg2
+ call AES128_CBC_MAC
+ ; state and idx are intact
+
+ movdqa xmm0, [state + _aes_cmac_lens] ; preload lens
+%%_len_is_0:
+ ; Check if job complete
+ test word [state + _aes_cmac_init_done + idx*2], 0xffff
+ jnz %%_copy_complete_digest
+
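+ ; init_done == 0 means the lane has only ciphered the first n-1
+ ; blocks: give it a length of 16 and point it at the scratch M_last
+ ; so the final padded block is processed on the next round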
+ ; Finish step 6
+ mov word [state + _aes_cmac_init_done + idx*2], 1
+
+ XPINSRW xmm0, xmm1, tmp3, idx, 16, scale_x16
+ movdqa [state + _aes_cmac_lens], xmm0
+
+ phminposuw xmm1, xmm0 ; find min length
+
+ mov tmp3, idx
+ shl tmp3, 4 ; idx*16
+ lea m_last, [state + _aes_cmac_scratch + tmp3]
+ mov [state + _aes_cmac_args_in + idx*8], m_last
+
+ jmp %%_cmac_round
+
+%%_copy_complete_digest:
+ ; Job complete, copy digest to auth tag output
+ mov job_rax, [state + _aes_cmac_job_in_lane + idx*8]
+
+ mov tmp4, idx
+ shl tmp4, 4
+ lea tmp3, [state + _aes_cmac_args_IV + tmp4]
+ mov tmp4, [job_rax + _auth_tag_output_len_in_bytes]
+ mov tmp2, [job_rax + _auth_tag_output]
+
+ cmp tmp4, 16
+ jne %%_ne_16_copy
+
+ ;; 16 byte AT copy
+ movdqu xmm0, [tmp3]
+ movdqu [tmp2], xmm0
+ jmp %%_update_lanes
+
+%%_ne_16_copy:
+ memcpy_sse_16 tmp2, tmp3, tmp4, lane, iv
+
+%%_update_lanes:
+ ; Update unused lanes
+ mov unused_lanes, [state + _aes_cmac_unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _aes_cmac_unused_lanes], unused_lanes
+
+ ; Set return job
+ mov job_rax, [state + _aes_cmac_job_in_lane + idx*8]
+
+ mov qword [state + _aes_cmac_job_in_lane + idx*8], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+
+%ifdef SAFE_DATA
+ pxor xmm0, xmm0
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+ ;; Clear digest (in memory for IV) and scratch memory of returned job
+ movdqa [tmp3], xmm0
+
+ shl idx, 4
+ movdqa [state + _aes_cmac_scratch + idx], xmm0
+
+%else
+ ;; Clear digest and scratch memory of returned job and "NULL lanes"
+%assign I 0
+%rep 4
+ cmp qword [state + _aes_cmac_job_in_lane + I*8], 0
+ jne APPEND(skip_clear_,I)
+ movdqa [state + _aes_cmac_args_IV + I*16], xmm0
+ movdqa [state + _aes_cmac_scratch + I*16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif ;; SUBMIT
+
+%endif ;; SAFE_DATA
+
+%%_return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%%_return_null:
+ xor job_rax, job_rax
+ jmp %%_return
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+%%_complete_block:
+
+ ;; Block size aligned
+ mov tmp2, [job + _src]
+ add tmp2, [job + _hash_start_src_offset_in_bytes]
+ lea tmp3, [n - 1]
+ shl tmp3, 4
+ add tmp2, tmp3
+
+ ;; M_last = M_n XOR K1
+ mov tmp3, [job + _skey1]
+ movdqu xmm0, [tmp3]
+ movdqu xmm1, [tmp2]
+ pxor xmm0, xmm1
+ movdqa [m_last], xmm0
+
+ jmp %%_step_5
+
+%%_lt_one_block:
+ ;; Zero-length message: M_last is a single padded block
+ mov word [state + _aes_cmac_init_done + lane*2], 1
+ mov [state + _aes_cmac_args_in + lane*8], m_last
+
+ movdqa xmm0, [state + _aes_cmac_lens]
+ XPINSRW xmm0, xmm1, tmp2, lane, 16, scale_x16
+ movdqa [state + _aes_cmac_lens], xmm0
+
+ mov n, 1
+ jmp %%_not_complete_block
+
+%%_not_complete_block_3gpp:
+ ;; bit pad last block
+ ;; xor with skey2
+ ;; copy to m_last
+
+ ;; load pointer to src
+ mov tmp, [job + _src]
+ add tmp, [job + _hash_start_src_offset_in_bytes]
+ lea tmp3, [n - 1]
+ shl tmp3, 4
+ add tmp, tmp3
+
+ ;; check if partial block
+ or r, r
+ jz %%_load_full_block_3gpp
+
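+ ;; simd_load_sse_15_1 (memcpy.asm) loads the 1-15 remaining bytes
+ ;; without reading past the end of the buffer; 'dec r' then converts
+ ;; the byte count into the index of the final, partial byte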
+ simd_load_sse_15_1 xmm0, tmp, r
+ dec r
+
+%%_update_mlast_3gpp:
+ ;; build the padding mask for the last byte and shift it
+ ;; into the correct byte position of the xmm register
+
+ ;; on Windows rcx holds 'state' (arg1): save it, as cl is needed for the shift
+%ifndef LINUX
+ mov tmp, rcx
+%endif
+ mov rcx, rbits
+ mov tmp3, 0xff
+ shr tmp3, cl
+ movq xmm2, tmp3
+ XPSLLB xmm2, r, xmm1, tmp2
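+ ;; XPSLLB (const.inc) shifts xmm2 left by r whole bytes using a
+ ;; pshufb shift table built in xmm1, aligning the AND mask with the
+ ;; final message byte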
+
+ ;; pad final byte
+ pandn xmm2, xmm0
+%ifndef LINUX
+ mov rcx, tmp
+%endif
+ ;; set OR mask to pad final bit
+ mov tmp2, tmp3
+ shr tmp2, 1
+ xor tmp2, tmp3 ; XOR to get OR mask
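+ ;; e.g. rbits = 3: 0xff >> 3 = 0x1f and 0x1f ^ (0x1f >> 1) = 0x10,
+ ;; the single '1' pad bit right after the 3 message bits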
+ movq xmm3, tmp2
+ ;; xmm1 contains shift table from previous shift
+ pshufb xmm3, xmm1
+
+ ;; load skey2 address
+ mov tmp3, [job + _skey2]
+ movdqu xmm1, [tmp3]
+
+ ;; set final padding bit
+ por xmm2, xmm3
+
+ ;; XOR last partial block with skey2
+ ;; update mlast
+ pxor xmm2, xmm1
+ movdqa [m_last], xmm2
+
+ jmp %%_step_5
+
+%%_load_full_block_3gpp:
+ movdqu xmm0, [tmp]
+ mov r, 0xf
+ jmp %%_update_mlast_3gpp
+%endif
+%endmacro
+
+
+align 64
+; JOB_AES_HMAC * submit_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(SUBMIT_JOB_AES_CMAC_AUTH,function,internal)
+SUBMIT_JOB_AES_CMAC_AUTH:
+ GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE SUBMIT
+
+; JOB_AES_HMAC * flush_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state)
+; arg 1 : state
+MKGLOBAL(FLUSH_JOB_AES_CMAC_AUTH,function,internal)
+FLUSH_JOB_AES_CMAC_AUTH:
+ GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE FLUSH
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif