diff options
Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm')
-rw-r--r-- | src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm | 495 |
1 files changed, 495 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm new file mode 100644 index 00000000..26e2de46 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm @@ -0,0 +1,495 @@ +;; +;; Copyright (c) 2017-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; In System V AMD64 ABI +;; calle saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 + +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RAX R8 R9 R10 R11 +;; Windows preserves: RBX RCX RDX RBP RSI RDI R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RAX RCX RDX R10 R11 +;; Linux preserves: RBX RBP RSI RDI R8 R9 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Clobbers ZMM0-31, K1-7 (K1-2 and K4-6 here but DES underneath clobbers K1-7). + +%include "os.asm" +%include "reg_sizes.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "constants.asm" +;%define DO_DBGPRINT +%include "dbgprint.asm" + +extern docsis_des_x16_enc_avx512 +extern docsis_des_x16_dec_avx512 +extern des_x16_cbc_enc_avx512 +extern des_x16_cbc_dec_avx512 +extern des3_x16_cbc_enc_avx512 +extern des3_x16_cbc_dec_avx512 + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%endif + +%define STATE arg1 +%define JOB arg2 + +%define IA0 arg3 +%define IA1 arg4 +%define IA2 r10 + +%define MIN_IDX r11 +%define MIN_LEN rax +%define LANE r11 + +%define AVX512_NUM_DES_LANES 16 + +%define ZTMP0 zmm0 +%define ZTMP1 zmm1 +%define ZTMP2 zmm2 +%define ZTMP3 zmm3 +%define ZTMP4 zmm4 +%define ZTMP5 zmm5 +%define ZTMP6 zmm6 +%define ZTMP7 zmm7 +%define ZTMP8 zmm8 +%define ZTMP9 zmm9 + +;;; =========================================================================== +;;; =========================================================================== +;;; MACROS +;;; =========================================================================== +;;; =========================================================================== + +;;; =========================================================================== +;;; DES/DOCSIS DES job submit +;;; =========================================================================== +;;; DES_DOCSIS [in] - DES, DOCSIS or 3DES cipher selection +;;; ENC_DEC [in] - ENCrypt or DECrypt seection +%macro GENERIC_DES_SUBMIT 2 +%define %%DES_DOCSIS %1 +%define %%ENC_DEC %2 + + ;; get unsued lane and increment number of lanes in use + mov IA0, [STATE + _des_unused_lanes] + mov LANE, IA0 + and LANE, 0xF ;; just a nibble + shr IA0, 4 + mov [STATE + _des_unused_lanes], IA0 + add qword [STATE + _des_lanes_in_use], 1 + + ;; store job info in OOO structure + ;; - job pointer + mov [STATE + _des_job_in_lane + LANE*8], JOB + ;; - key schedule +%ifidn %%ENC_DEC, ENC + mov IA2, [JOB + _aes_enc_key_expanded] +%else + mov IA2, [JOB + _aes_dec_key_expanded] +%endif + mov [STATE + _des_args_keys + LANE*8], IA2 + ;; - IV + mov IA2, [JOB + _iv] + mov DWORD(IA0), [IA2] + mov DWORD(IA1), [IA2 + 4] + mov [STATE + _des_args_IV + LANE*4], DWORD(IA0) + mov [STATE + _des_args_IV + LANE*4 + (AVX512_NUM_DES_LANES*4)], DWORD(IA1) + ;; - src pointer + mov IA0, [JOB + _src] + add IA0, [JOB + _cipher_start_src_offset_in_bytes] + mov [STATE + _des_args_in + LANE*8], IA0 + ;; - destination pointer + mov IA1, [JOB + _dst] + mov [STATE + _des_args_out + LANE*8], IA1 + ;; - length in bytes (block aligned) + mov IA2, [JOB + _msg_len_to_cipher_in_bytes] + and IA2, -8 + mov [STATE + _des_lens + LANE*2], WORD(IA2) +%ifidn %%DES_DOCSIS, DOCSIS + ;; - block length + mov [STATE + _des_args_BLen + LANE*4], DWORD(IA2) + ;; - last in + add IA0, IA2 + mov [STATE + _des_args_LIn + LANE*8], IA0 + ;; - last out + add IA1, IA2 + mov [STATE + _des_args_LOut + LANE*8], IA1 + ;; - partial length + mov IA2, [JOB + _msg_len_to_cipher_in_bytes] + and IA2, 7 + mov [STATE + _des_args_PLen + LANE*4], DWORD(IA2) +%endif ; DOCSIS + ;; is there enough jobs to process them in parallel? + cmp qword [STATE + _des_lanes_in_use], AVX512_NUM_DES_LANES + jb %%_des_submit_null_end + ;; schedule the processing + ;; - find min job size + vmovdqa XWORD(ZTMP0), [STATE + _des_lens + 2*0] + vphminposuw XWORD(ZTMP2), XWORD(ZTMP0) + vpextrw DWORD(MIN_LEN), XWORD(ZTMP2), 0 ; min value + vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index + vmovdqa XWORD(ZTMP1), [STATE + _des_lens + 2*8] + vphminposuw XWORD(ZTMP2), XWORD(ZTMP1) + vpextrw DWORD(IA2), XWORD(ZTMP2), 0 ; min value + cmp DWORD(MIN_LEN), DWORD(IA2) + jle %%_use_min + vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index + add DWORD(MIN_IDX), 8 ; but index +8 + mov MIN_LEN, IA2 ; min len +%%_use_min: + cmp MIN_LEN, 0 + je %%_len_is_0 + + vpbroadcastw XWORD(ZTMP3), WORD(MIN_LEN) + vpsubw XWORD(ZTMP0), XWORD(ZTMP0), XWORD(ZTMP3) + vmovdqa [STATE + _des_lens + 2*0], XWORD(ZTMP0) + vpsubw XWORD(ZTMP1), XWORD(ZTMP1), XWORD(ZTMP3) + vmovdqa [STATE + _des_lens + 2*8], XWORD(ZTMP1) + + push MIN_IDX + mov arg2, MIN_LEN +%ifidn %%ENC_DEC, ENC + ;; encrypt +%ifidn %%DES_DOCSIS, DOCSIS + call docsis_des_x16_enc_avx512 +%endif +%ifidn %%DES_DOCSIS, DES + call des_x16_cbc_enc_avx512 +%endif +%ifidn %%DES_DOCSIS, 3DES + call des3_x16_cbc_enc_avx512 +%endif +%else ; ENC + ;; decrypt +%ifidn %%DES_DOCSIS, DOCSIS + call docsis_des_x16_dec_avx512 +%endif +%ifidn %%DES_DOCSIS, DES + call des_x16_cbc_dec_avx512 +%endif +%ifidn %%DES_DOCSIS, 3DES + call des3_x16_cbc_dec_avx512 +%endif +%endif ; DEC + pop MIN_IDX + jmp %%_des_submit_end + +%%_des_submit_null_end: + xor rax, rax + jmp %%_des_submit_return + +%%_len_is_0: +%ifidn %%DES_DOCSIS, DOCSIS + cmp dword [STATE + _des_args_PLen + MIN_IDX*4], 0 + jz %%_des_submit_end + push MIN_IDX + xor arg2, arg2 ; len is 0 +%ifidn %%ENC_DEC, ENC + call docsis_des_x16_enc_avx512 +%else ; ENC + call docsis_des_x16_dec_avx512 +%endif ; DEC + pop MIN_IDX +%endif ; DOCSIS + ;; fall trough +%%_des_submit_end: + ;; return a job + ;; - decrement number of jobs in use + sub qword [STATE + _des_lanes_in_use], 1 + ;; - put the lane back to free lanes pool + mov IA0, [STATE + _des_unused_lanes] + shl IA0, 4 + or IA0, MIN_IDX + mov [STATE + _des_unused_lanes], IA0 + ;; - mark job as complete + ;; - clear job pointer + mov rax, [STATE + _des_job_in_lane + MIN_IDX*8] + mov qword [STATE + _des_job_in_lane + MIN_IDX*8], 0 + or dword [rax + _status], STS_COMPLETED_AES + vzeroupper +%%_des_submit_return: +%endmacro + +;;; =========================================================================== +;;; DES/DOCSIS DES flush +;;; =========================================================================== +;;; DES_DOCSIS [in] - DES, DOCSIS or 3DES cipher selection +;;; ENC_DEC [in] - ENCrypt or DECrypt selection +;;; +;;; Clobbers k1, k2, k4, k5 and k6 +%macro GENERIC_DES_FLUSH 2 +%define %%DES_DOCSIS %1 +%define %%ENC_DEC %2 + + cmp qword [STATE + _des_lanes_in_use], 0 + je %%_des_flush_null_end + + ;; find non-null job + vpxord ZTMP0, ZTMP0, ZTMP0 + vmovdqu64 ZTMP1, [STATE + _des_job_in_lane + (0*PTR_SZ)] + vmovdqu64 ZTMP2, [STATE + _des_job_in_lane + (8*PTR_SZ)] + vpcmpq k1, ZTMP1, ZTMP0, 4 ; NEQ + vpcmpq k2, ZTMP2, ZTMP0, 4 ; NEQ + xor IA0, IA0 + xor IA1, IA1 + kmovw DWORD(IA0), k1 + kmovw DWORD(IA1), k2 + mov DWORD(IA2), DWORD(IA1) + shl DWORD(IA2), 8 + or DWORD(IA2), DWORD(IA0) ; mask of non-null jobs in IA2 + not BYTE(IA0) + kmovw k4, DWORD(IA0) + not BYTE(IA1) + kmovw k5, DWORD(IA1) + mov DWORD(IA0), DWORD(IA2) + not WORD(IA0) + kmovw k6, DWORD(IA0) ; mask of NULL jobs in k4, k5 and k6 + mov DWORD(IA0), DWORD(IA2) + xor IA2, IA2 + bsf WORD(IA2), WORD(IA0) ; index of the 1st set bit in IA2 + + ;; copy good lane data into NULL lanes + ;; - k1(L8)/k2(H8) - masks of non-null jobs + ;; - k4(L8)/k5(H8)/k6 - masks of NULL jobs + ;; - IA2 index of 1st non-null job + + ;; - in pointer + mov IA0, [STATE + _des_args_in + IA2*8] + vpbroadcastq ZTMP1, IA0 + vmovdqu64 [STATE + _des_args_in + (0*PTR_SZ)]{k4}, ZTMP1 + vmovdqu64 [STATE + _des_args_in + (8*PTR_SZ)]{k5}, ZTMP1 + ;; - out pointer + mov IA0, [STATE + _des_args_out + IA2*8] + vpbroadcastq ZTMP1, IA0 + vmovdqu64 [STATE + _des_args_out + (0*PTR_SZ)]{k4}, ZTMP1 + vmovdqu64 [STATE + _des_args_out + (8*PTR_SZ)]{k5}, ZTMP1 + ;; - key schedule + mov IA0, [STATE + _des_args_keys + IA2*8] + vpbroadcastq ZTMP1, IA0 + vmovdqu64 [STATE + _des_args_keys + (0*PTR_SZ)]{k4}, ZTMP1 + vmovdqu64 [STATE + _des_args_keys + (8*PTR_SZ)]{k5}, ZTMP1 + ;; - zero partial len + vmovdqu32 [STATE + _des_args_PLen]{k6}, ZTMP0 + ;; - set len to UINT16_MAX + mov WORD(IA0), 0xffff + vpbroadcastw ZTMP1, WORD(IA0) + vmovdqu16 [STATE + _des_lens]{k6}, ZTMP1 + + ;; - IV + mov DWORD(IA0), [STATE + _des_args_IV + IA2*4] + mov DWORD(IA1), [STATE + _des_args_IV + IA2*4 + (16*4)] + vpbroadcastd ZTMP1, DWORD(IA0) + vpbroadcastd ZTMP2, DWORD(IA1) + vmovdqu32 [STATE + _des_args_IV]{k6}, ZTMP1 + vmovdqu32 [STATE + _des_args_IV + (16*4)]{k6}, ZTMP2 + + ;; schedule the processing + ;; - find min job size + vmovdqa XWORD(ZTMP0), [STATE + _des_lens + 2*0] + vphminposuw XWORD(ZTMP2), XWORD(ZTMP0) + vpextrw DWORD(MIN_LEN), XWORD(ZTMP2), 0 ; min value + vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index + vmovdqa XWORD(ZTMP1), [STATE + _des_lens + 2*8] + vphminposuw XWORD(ZTMP2), XWORD(ZTMP1) + vpextrw DWORD(IA2), XWORD(ZTMP2), 0 ; min value + cmp DWORD(MIN_LEN), DWORD(IA2) + jle %%_use_min + vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index + add DWORD(MIN_IDX), 8 ; but index +8 + mov MIN_LEN, IA2 ; min len +%%_use_min: + vpbroadcastw XWORD(ZTMP3), WORD(MIN_LEN) + vpsubw XWORD(ZTMP0), XWORD(ZTMP0), XWORD(ZTMP3) + vmovdqa [STATE + _des_lens + 2*0], XWORD(ZTMP0) + vpsubw XWORD(ZTMP1), XWORD(ZTMP1), XWORD(ZTMP3) + vmovdqa [STATE + _des_lens + 2*8], XWORD(ZTMP1) + + push MIN_IDX + mov arg2, MIN_LEN +%ifidn %%ENC_DEC, ENC + ;; encrypt +%ifidn %%DES_DOCSIS, DOCSIS + call docsis_des_x16_enc_avx512 +%endif +%ifidn %%DES_DOCSIS, DES + call des_x16_cbc_enc_avx512 +%endif +%ifidn %%DES_DOCSIS, 3DES + call des3_x16_cbc_enc_avx512 +%endif +%else ; ENC + ;; decrypt +%ifidn %%DES_DOCSIS, DOCSIS + call docsis_des_x16_dec_avx512 +%endif +%ifidn %%DES_DOCSIS, DES + call des_x16_cbc_dec_avx512 +%endif +%ifidn %%DES_DOCSIS, 3DES + call des3_x16_cbc_dec_avx512 +%endif +%endif ; DEC + pop MIN_IDX + jmp %%_des_flush_end + +%%_des_flush_null_end: + xor rax, rax + jmp %%_des_flush_return +%%_des_flush_end: + ;; return a job + ;; - decrement number of jobs in use + sub qword [STATE + _des_lanes_in_use], 1 + ;; - put the lane back to free lanes pool + mov IA0, [STATE + _des_unused_lanes] + shl IA0, 4 + or IA0, MIN_IDX + mov [STATE + _des_unused_lanes], IA0 + ;; - mark job as complete + mov rax, [STATE + _des_job_in_lane + MIN_IDX*8] + or dword [rax + _status], STS_COMPLETED_AES + ;; - clear job pointer + mov qword [STATE + _des_job_in_lane + MIN_IDX*8], 0 +%%_des_flush_return: + vzeroupper +%endmacro + +;;; ======================================================== +;;; DATA + +section .data +default rel + +;;; ======================================================== +;;; CODE +section .text + +;;; arg 1 : pointer to DES OOO structure +;;; arg 2 : job +align 64 +MKGLOBAL(submit_job_des_cbc_enc_avx512,function,internal) +submit_job_des_cbc_enc_avx512: + GENERIC_DES_SUBMIT DES, ENC + ret + +;;; arg 1 : pointer to DES OOO structure +;;; arg 2 : job +align 64 +MKGLOBAL(submit_job_des_cbc_dec_avx512,function,internal) +submit_job_des_cbc_dec_avx512: + GENERIC_DES_SUBMIT DES, DEC + ret + +;;; arg 1 : pointer to DES OOO structure +;;; arg 2 : job +align 64 +MKGLOBAL(submit_job_docsis_des_enc_avx512,function,internal) +submit_job_docsis_des_enc_avx512: + GENERIC_DES_SUBMIT DOCSIS, ENC + ret + +;;; arg 1 : pointer to DES OOO structure +;;; arg 2 : job +align 64 +MKGLOBAL(submit_job_docsis_des_dec_avx512,function,internal) +submit_job_docsis_des_dec_avx512: + GENERIC_DES_SUBMIT DOCSIS, DEC + ret + +;;; arg 1 : pointer to DES OOO structure +;;; arg 2 : job +align 64 +MKGLOBAL(submit_job_3des_cbc_enc_avx512,function,internal) +submit_job_3des_cbc_enc_avx512: + GENERIC_DES_SUBMIT 3DES, ENC + ret + +;;; arg 1 : pointer to DES OOO structure +;;; arg 2 : job +align 64 +MKGLOBAL(submit_job_3des_cbc_dec_avx512,function,internal) +submit_job_3des_cbc_dec_avx512: + GENERIC_DES_SUBMIT 3DES, DEC + ret + +;;; arg 1 : pointer to DES OOO structure +align 64 +MKGLOBAL(flush_job_des_cbc_enc_avx512,function,internal) +flush_job_des_cbc_enc_avx512: + GENERIC_DES_FLUSH DES, ENC + ret + +;;; arg 1 : pointer to DES OOO structure +align 64 +MKGLOBAL(flush_job_des_cbc_dec_avx512,function,internal) +flush_job_des_cbc_dec_avx512: + GENERIC_DES_FLUSH DES, DEC + ret + +;;; arg 1 : pointer to DES OOO structure +align 64 +MKGLOBAL(flush_job_docsis_des_enc_avx512,function,internal) +flush_job_docsis_des_enc_avx512: + GENERIC_DES_FLUSH DOCSIS, ENC + ret + +;;; arg 1 : pointer to DES OOO structure +align 64 +MKGLOBAL(flush_job_docsis_des_dec_avx512,function,internal) +flush_job_docsis_des_dec_avx512: + GENERIC_DES_FLUSH DOCSIS, DEC + ret + +;;; arg 1 : pointer to DES OOO structure +align 64 +MKGLOBAL(flush_job_3des_cbc_enc_avx512,function,internal) +flush_job_3des_cbc_enc_avx512: + GENERIC_DES_FLUSH 3DES, ENC + ret + +;;; arg 1 : pointer to DES OOO structure +align 64 +MKGLOBAL(flush_job_3des_cbc_dec_avx512,function,internal) +flush_job_3des_cbc_dec_avx512: + GENERIC_DES_FLUSH 3DES, DEC + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif |