diff options
Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm')
-rw-r--r-- | src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm | 466 |
1 files changed, 466 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm new file mode 100644 index 000000000..d614e1b0e --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm @@ -0,0 +1,466 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; Stack must be aligned to 32 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rbx rcx rsi rdi rbp +;; +;; Linux clobbers: rax rdx rsi r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rbx rcx rdi rbp r8 +;; +;; clobbers ymm0-15 + +%include "include/os.asm" +;%define DO_DBGPRINT +%include "include/dbgprint.asm" +%include "mb_mgr_datastruct.asm" +%include "include/transpose_avx2.asm" + +section .data +default rel +align 32 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 + ;ddq 0x5A8279995A8279995A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + +section .text + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 r8 +%endif + +%define state arg1 +%define num_blks arg2 + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 reg3 + +%define IDX rax + +; ymm0 A +; ymm1 B +; ymm2 C +; ymm3 D +; ymm4 E +; ymm5 F AA +; ymm6 T0 BB +; ymm7 T1 CC +; ymm8 T2 DD +; ymm9 T3 EE +; ymm10 T4 TMP +; ymm11 T5 FUN +; ymm12 T6 K +; ymm13 T7 W14 +; ymm14 T8 W15 +; ymm15 T9 W16 + +%define A ymm0 +%define B ymm1 +%define C ymm2 +%define D ymm3 +%define E ymm4 + +%define F ymm5 +%define T0 ymm6 +%define T1 ymm7 +%define T2 ymm8 +%define T3 ymm9 +%define T4 ymm10 +%define T5 ymm11 +%define T6 ymm12 +%define T7 ymm13 +%define T8 ymm14 +%define T9 ymm15 + +%define AA ymm5 +%define BB ymm6 +%define CC ymm7 +%define DD ymm8 +%define EE ymm9 +%define TMP ymm10 +%define FUN ymm11 +%define K ymm12 +%define W14 ymm13 +%define W15 ymm14 +%define W16 ymm15 + + +;; Assume stack aligned to 32 bytes before call +;; Therefore FRAMESIZE mod 32 must be 32-8 = 24 +%define FRAMESZ 32*16 + 24 + +%define VMOVPS vmovups + +;; +;; Magic functions defined in FIPS 180-1 +;; +;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + ;vmovdqa %%regF,%%regC + vpxor %%regF, %%regC,%%regD + vpand %%regF, %%regF,%%regB + vpxor %%regF, %%regF,%%regD +%endmacro + +;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + ;vmovdqa %%regF,%%regD + vpxor %%regF,%%regD,%%regC + vpxor %%regF,%%regF,%%regB +%endmacro + + + +;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + ;vmovdqa %%regF,%%regB + ;vmovdqa %%regT,%%regB + vpor %%regF,%%regB,%%regC + vpand %%regT,%%regB,%%regC + vpand %%regF,%%regF,%%regD + vpor %%regF,%%regF,%%regT +%endmacro + +;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + ;vmovdqa %%tmp, %%reg + vpsrld %%tmp, %%reg, (32-%%imm) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + ;vmovdqa %%tmp, %%reg + vpsrld %%tmp, %%src, (32-%%imm) + vpslld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + vpaddd %%regE, %%regE,%%immCNT + vpaddd %%regE, %%regE,[rsp + (%%memW * 32)] + ;vmovdqa %%regT,%%regA + PROLD_nd %%regT,5, %%regF,%%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE, %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 32] + vpxor W16, W16, W14 + vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 32] + vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 32] + + ;vmovdqa %%regF, W16 + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [rsp + ((%%memW - 0) & 15) * 32],%%regF + vpaddd %%regE, %%regE,%%regF + + ;vmovdqa %%regT,%%regA + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE,%%regE,%%regF +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + +align 32 + +; void sha1_x8_avx2(void *state, int num_blks) +; arg 1 : rcx : pointer to array[4] of pointer to input data +; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1 +MKGLOBAL(sha1_x8_avx2,function,internal) +sha1_x8_avx2: + sub rsp, FRAMESZ + + ;; Initialize digests + vmovdqu A, [state + 0*SHA1_DIGEST_ROW_SIZE] + vmovdqu B, [state + 1*SHA1_DIGEST_ROW_SIZE] + vmovdqu C, [state + 2*SHA1_DIGEST_ROW_SIZE] + vmovdqu D, [state + 3*SHA1_DIGEST_ROW_SIZE] + vmovdqu E, [state + 4*SHA1_DIGEST_ROW_SIZE] + DBGPRINTL_YMM "Sha1-AVX2 incoming transposed digest", A, B, C, D, E + + ;; transpose input onto stack + mov inp0,[state+_data_ptr_sha1+0*PTR_SZ] + mov inp1,[state+_data_ptr_sha1+1*PTR_SZ] + mov inp2,[state+_data_ptr_sha1+2*PTR_SZ] + mov inp3,[state+_data_ptr_sha1+3*PTR_SZ] + mov inp4,[state+_data_ptr_sha1+4*PTR_SZ] + mov inp5,[state+_data_ptr_sha1+5*PTR_SZ] + mov inp6,[state+_data_ptr_sha1+6*PTR_SZ] + mov inp7,[state+_data_ptr_sha1+7*PTR_SZ] + + xor IDX, IDX +lloop: + vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK] +%assign I 0 +%rep 2 + TRANSPOSE8_U32_LOAD8 T0, T1, T2, T3, T4, T5, T6, T7, \ + inp0, inp1, inp2, inp3, inp4, inp5, \ + inp6, inp7, IDX + + TRANSPOSE8_U32 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 + DBGPRINTL_YMM "Sha1-AVX2 incoming transposed input", T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 + vpshufb T0, T0, F + vmovdqa [rsp+(I*8+0)*32],T0 + vpshufb T1, T1, F + vmovdqa [rsp+(I*8+1)*32],T1 + vpshufb T2, T2, F + vmovdqa [rsp+(I*8+2)*32],T2 + vpshufb T3, T3, F + vmovdqa [rsp+(I*8+3)*32],T3 + vpshufb T4, T4, F + vmovdqa [rsp+(I*8+4)*32],T4 + vpshufb T5, T5, F + vmovdqa [rsp+(I*8+5)*32],T5 + vpshufb T6, T6, F + vmovdqa [rsp+(I*8+6)*32],T6 + vpshufb T7, T7, F + vmovdqa [rsp+(I*8+7)*32],T7 + add IDX, 32 +%assign I (I+1) +%endrep + + + ; save old digests + vmovdqa AA, A + vmovdqa BB, B + vmovdqa CC, C + vmovdqa DD, D + vmovdqa EE, E + +;; +;; perform 0-79 steps +;; + vmovdqa K, [rel K00_19] +;; do rounds 0...15 +%assign I 0 +%rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 16...19 + vmovdqa W16, [rsp + ((16 - 16) & 15) * 32] + vmovdqa W15, [rsp + ((16 - 15) & 15) * 32] +%rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 20...39 + vmovdqa K, [rel K20_39] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 40...59 + vmovdqa K, [rel K40_59] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 60...79 + vmovdqa K, [rel K60_79] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 + ROTATE_ARGS +%assign I (I+1) +%endrep + + vpaddd A,A,AA + vpaddd B,B,BB + vpaddd C,C,CC + vpaddd D,D,DD + vpaddd E,E,EE + + sub num_blks, 1 + jne lloop + + ; write out digests + vmovdqu [state + 0*SHA1_DIGEST_ROW_SIZE], A + vmovdqu [state + 1*SHA1_DIGEST_ROW_SIZE], B + vmovdqu [state + 2*SHA1_DIGEST_ROW_SIZE], C + vmovdqu [state + 3*SHA1_DIGEST_ROW_SIZE], D + vmovdqu [state + 4*SHA1_DIGEST_ROW_SIZE], E + DBGPRINTL_YMM "Sha1-AVX2 outgoing transposed digest", A, B, C, D, E + ;; update input pointers + add inp0, IDX + add inp1, IDX + add inp2, IDX + add inp3, IDX + add inp4, IDX + add inp5, IDX + add inp6, IDX + add inp7, IDX + mov [state+_data_ptr_sha1+0*PTR_SZ], inp0 + mov [state+_data_ptr_sha1+1*PTR_SZ], inp1 + mov [state+_data_ptr_sha1+2*PTR_SZ], inp2 + mov [state+_data_ptr_sha1+3*PTR_SZ], inp3 + mov [state+_data_ptr_sha1+4*PTR_SZ], inp4 + mov [state+_data_ptr_sha1+5*PTR_SZ], inp5 + mov [state+_data_ptr_sha1+6*PTR_SZ], inp6 + mov [state+_data_ptr_sha1+7*PTR_SZ], inp7 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + ;; Clear stack frame (16*32 bytes) +%ifdef SAFE_DATA + vpxor ymm0, ymm0 +%assign i 0 +%rep 16 + vmovdqa [rsp + i*32], ymm0 +%assign i (i+1) +%endrep +%endif + + add rsp, FRAMESZ + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif |