Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm')
 src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm | 466 ++++++++++++++++++++++++
 1 file changed, 466 insertions(+), 0 deletions(-)
diff --git a/src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm
new file mode 100644
index 000000000..d614e1b0e
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm
@@ -0,0 +1,466 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rbx rcx rsi rdi rbp
+;;
+;; Linux clobbers: rax rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rbx rcx rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/transpose_avx2.asm"
+
+section .data
+default rel
+align 32
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
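+;; The mask above is a vpshufb control that byte-swaps each 32-bit word,
+;; converting the big-endian SHA-1 message words to little-endian lane order.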
+K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
+ ;ddq 0x5A8279995A8279995A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
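+;; K00_19..K60_79 are the four SHA-1 round constants (rounds 0-19, 20-39,
+;; 40-59, 60-79), each replicated across all eight 32-bit lanes of a ymm.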
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 r8
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 reg3
+
+%define IDX rax
+
+; ymm0 A
+; ymm1 B
+; ymm2 C
+; ymm3 D
+; ymm4 E
+; ymm5 F AA
+; ymm6 T0 BB
+; ymm7 T1 CC
+; ymm8 T2 DD
+; ymm9 T3 EE
+; ymm10 T4 TMP
+; ymm11 T5 FUN
+; ymm12 T6 K
+; ymm13 T7 W14
+; ymm14 T8 W15
+; ymm15 T9 W16
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+
+%define F ymm5
+%define T0 ymm6
+%define T1 ymm7
+%define T2 ymm8
+%define T3 ymm9
+%define T4 ymm10
+%define T5 ymm11
+%define T6 ymm12
+%define T7 ymm13
+%define T8 ymm14
+%define T9 ymm15
+
+%define AA ymm5
+%define BB ymm6
+%define CC ymm7
+%define DD ymm8
+%define EE ymm9
+%define TMP ymm10
+%define FUN ymm11
+%define K ymm12
+%define W14 ymm13
+%define W15 ymm14
+%define W16 ymm15
+
+
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESZ mod 32 must be 32-8 = 24
+%define FRAMESZ 32*16 + 24
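+;; The call instruction pushes an 8-byte return address, leaving rsp = 24 mod 32;
+;; subtracting FRAMESZ (= 24 mod 32) restores 32-byte alignment for the aligned
+;; vmovdqa accesses to the 16*32-byte W[] area below rsp.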
+
+%define VMOVPS vmovups
+
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ ;vmovdqa %%regF,%%regC
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
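+;; MAGIC_F0 is the SHA-1 "Ch" function for rounds 0-19: (B & C) | (~B & D),
+;; computed as D ^ (B & (C ^ D)) to avoid a separate NOT (3 instructions).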
+
+;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ ;vmovdqa %%regF,%%regD
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
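+;; MAGIC_F1 is the parity function B ^ C ^ D, used in rounds 20-39 and,
+;; via MAGIC_F3, in rounds 60-79.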
+
+
+
+;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ ;vmovdqa %%regF,%%regB
+ ;vmovdqa %%regT,%%regB
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
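+;; MAGIC_F2 is the SHA-1 "Maj" function (B & C) | (B & D) | (C & D),
+;; computed as ((B | C) & D) | (B & C) in 4 instructions.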
+
+;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ ;vmovdqa %%tmp, %%reg
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
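+;; AVX2 has no vector rotate instruction, so PROLD synthesizes a 32-bit
+;; rotate-left by %%imm from two shifts and an OR.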
+
+; PROLD_nd reg, imm, tmp, src (non-destructive: reads src, writes reg)
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ ;vmovdqa %%tmp, %%reg
+ vpsrld %%tmp, %%src, (32-%%imm)
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[rsp + (%%memW * 32)]
+ ;vmovdqa %%regT,%%regA
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
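+;; One SHA-1 round for rounds 0-15 (W[t] comes straight from the message block):
+;;   E += K + W[t] + ROTL5(A) + MAGIC_F(B,C,D);  B = ROTL30(B)
+;; The A/B/C/D/E rotation is done afterwards by ROTATE_ARGS renaming, not by moves.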
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 32]
+
+ ;vmovdqa %%regF, W16
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [rsp + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ ;vmovdqa %%regT,%%regA
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
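+;; Rounds 16-79 additionally compute the message schedule on the fly:
+;;   W[t] = ROTL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16])
+;; using a 16-entry circular buffer on the stack; the W14/W15/W16 registers
+;; cache the oldest words of the window and are renamed by ROTATE_W each round.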
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
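+;; ROTATE_ARGS and ROTATE_W rotate the variable-to-register assignments purely
+;; through %xdefine renaming at assembly time, so no register moves are emitted.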
+
+align 32
+
+; void sha1_x8_avx2(void *state, int num_blks)
+; arg 1 : rcx (Windows) / rdi (Linux) : pointer to state: transposed digests plus array[8] of input data pointers
+; arg 2 : rdx (Windows) / rsi (Linux) : size (in blocks) ;; assumed to be >= 1
+MKGLOBAL(sha1_x8_avx2,function,internal)
+sha1_x8_avx2:
+ sub rsp, FRAMESZ
+
+ ;; Initialize digests
+ vmovdqu A, [state + 0*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu B, [state + 1*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu C, [state + 2*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu D, [state + 3*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu E, [state + 4*SHA1_DIGEST_ROW_SIZE]
+ DBGPRINTL_YMM "Sha1-AVX2 incoming transposed digest", A, B, C, D, E
+
+ ;; transpose input onto stack
+ mov inp0,[state+_data_ptr_sha1+0*PTR_SZ]
+ mov inp1,[state+_data_ptr_sha1+1*PTR_SZ]
+ mov inp2,[state+_data_ptr_sha1+2*PTR_SZ]
+ mov inp3,[state+_data_ptr_sha1+3*PTR_SZ]
+ mov inp4,[state+_data_ptr_sha1+4*PTR_SZ]
+ mov inp5,[state+_data_ptr_sha1+5*PTR_SZ]
+ mov inp6,[state+_data_ptr_sha1+6*PTR_SZ]
+ mov inp7,[state+_data_ptr_sha1+7*PTR_SZ]
+
+ xor IDX, IDX
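+ ;; Main loop: each iteration consumes one 64-byte block from every lane;
+ ;; IDX is the running byte offset into each lane's input buffer.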
+lloop:
+ vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 2
+ TRANSPOSE8_U32_LOAD8 T0, T1, T2, T3, T4, T5, T6, T7, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX
+
+ TRANSPOSE8_U32 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ DBGPRINTL_YMM "Sha1-AVX2 incoming transposed input", T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ vpshufb T0, T0, F
+ vmovdqa [rsp+(I*8+0)*32],T0
+ vpshufb T1, T1, F
+ vmovdqa [rsp+(I*8+1)*32],T1
+ vpshufb T2, T2, F
+ vmovdqa [rsp+(I*8+2)*32],T2
+ vpshufb T3, T3, F
+ vmovdqa [rsp+(I*8+3)*32],T3
+ vpshufb T4, T4, F
+ vmovdqa [rsp+(I*8+4)*32],T4
+ vpshufb T5, T5, F
+ vmovdqa [rsp+(I*8+5)*32],T5
+ vpshufb T6, T6, F
+ vmovdqa [rsp+(I*8+6)*32],T6
+ vpshufb T7, T7, F
+ vmovdqa [rsp+(I*8+7)*32],T7
+ add IDX, 32
+%assign I (I+1)
+%endrep
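+ ;; The stack now holds the current block with lanes interleaved:
+ ;; [rsp + t*32] contains message word W[t] (t = 0..15) for lanes 0..7,
+ ;; already byte-swapped to little-endian.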
+
+
+ ; save old digests
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [rel K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [rsp + ((16 - 16) & 15) * 32]
+ vmovdqa W15, [rsp + ((16 - 15) & 15) * 32]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ vmovdqa K, [rel K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ vmovdqa K, [rel K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ vmovdqa K, [rel K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
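+ ;; Feed-forward: add the digest values saved before this block (AA..EE)
+ ;; back into the working variables.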
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ sub num_blks, 1
+ jne lloop
+
+ ; write out digests
+ vmovdqu [state + 0*SHA1_DIGEST_ROW_SIZE], A
+ vmovdqu [state + 1*SHA1_DIGEST_ROW_SIZE], B
+ vmovdqu [state + 2*SHA1_DIGEST_ROW_SIZE], C
+ vmovdqu [state + 3*SHA1_DIGEST_ROW_SIZE], D
+ vmovdqu [state + 4*SHA1_DIGEST_ROW_SIZE], E
+ DBGPRINTL_YMM "Sha1-AVX2 outgoing transposed digest", A, B, C, D, E
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [state+_data_ptr_sha1+0*PTR_SZ], inp0
+ mov [state+_data_ptr_sha1+1*PTR_SZ], inp1
+ mov [state+_data_ptr_sha1+2*PTR_SZ], inp2
+ mov [state+_data_ptr_sha1+3*PTR_SZ], inp3
+ mov [state+_data_ptr_sha1+4*PTR_SZ], inp4
+ mov [state+_data_ptr_sha1+5*PTR_SZ], inp5
+ mov [state+_data_ptr_sha1+6*PTR_SZ], inp6
+ mov [state+_data_ptr_sha1+7*PTR_SZ], inp7
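+ ;; IDX now equals 64 * (blocks processed), so each lane pointer has been
+ ;; advanced past the data just consumed.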
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ ;; Clear stack frame (16*32 bytes)
+%ifdef SAFE_DATA
+ vpxor ymm0, ymm0
+%assign i 0
+%rep 16
+ vmovdqa [rsp + i*32], ymm0
+%assign i (i+1)
+%endrep
+%endif
+
+ add rsp, FRAMESZ
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif