Diffstat (limited to '')
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm  485
1 files changed, 485 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm
new file mode 100644
index 000000000..aeb00a008
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm
@@ -0,0 +1,485 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; STACK_SPACE must be an odd multiple of 8 so rsp stays 16-byte aligned after
+;; the sub (the call pushed 8 bytes); the movdqa stores into WK() rely on this
+_GPR_SAVE_SIZE equ 8*9 ;rbx, rdx, rbp, (rdi, rsi), r12~r15
+_WK_SAVE_SIZE equ 16*4
+
+_WK_SAVE equ 0
+_GPR_SAVE equ _WK_SAVE + _WK_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE
+
+; arg indices start from 0 here, while mgr_flush/submit count from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+; rax~rdx, rsi, rdi, rbp are used for RR
+%define N_MGR r8
+%define IDX r9 ; local variable -- consistent with caller
+%define K_BASE r11
+%define BUFFER_PTR r12
+%define BUFFER_END r13
+%define TMP r14 ; local variable -- assistant to address digest
+
+%xdefine W_TMP xmm0
+%xdefine W_TMP2 xmm9
+
+%xdefine W0 xmm1
+%xdefine W4 xmm2
+%xdefine W8 xmm3
+%xdefine W12 xmm4
+%xdefine W16 xmm5
+%xdefine W20 xmm6
+%xdefine W24 xmm7
+%xdefine W28 xmm8
+
+%xdefine XMM_SHUFB_BSWAP xmm10
+
+;; we keep a 64-byte (16-entry) window of w[i]+K pre-calculated values in a circular buffer
+%xdefine WK(t) (rsp + (t & 15)*4)
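+;; e.g. WK(0) and WK(16) both resolve to [rsp + 0], WK(5) and WK(21) to
+;; [rsp + 20]: only 16 dword slots are live at a time, matching the
+;; 16-round pre-compute distance W_PRECALC_AHEAD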
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Constants
+
+%xdefine K1 0x5a827999
+%xdefine K2 0x6ed9eba1
+%xdefine K3 0x8f1bbcdc
+%xdefine K4 0xca62c1d6
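+;; SHA-1 round constants: K1 covers rounds 0-19, K2 20-39, K3 40-59,
+;; K4 60-79 (each is replicated 4-wide in K_XMM_AR at the end of the file)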
+
+%xdefine W_PRECALC_AHEAD 16
+%xdefine W_NO_TAIL_PRECALC 0
+
+; Rounds macros
+
+%macro REGALLOC 0
+ %xdefine A ecx
+ %xdefine B esi
+ %xdefine C edi
+ %xdefine D ebp
+ %xdefine E edx
+
+ %xdefine T1 eax
+ %xdefine T2 ebx
+%endmacro
+
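+;; F1 (rounds 0-19) is the SHA-1 Ch function (b & c) | (~b & d),
+;; computed without a separate NOT via the identity ((c ^ d) & b) ^ d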
+%macro F1 3
+ mov T1,%2
+ xor T1,%3
+ and T1,%1
+ xor T1,%3
+%endmacro
+
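+;; F2 (rounds 20-39, reused as F4 for rounds 60-79) is the parity
+;; function b ^ c ^ d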
+%macro F2 3
+ mov T1,%3
+ xor T1,%2
+ xor T1,%1
+%endmacro
+
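+;; F3 (rounds 40-59) is the majority function Maj(b,c,d) =
+;; (b & c) | (b & d) | (c & d), computed here as ((b | c) & d) | (b & c)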
+%macro F3 3
+ mov T1,%2
+ mov T2,%1
+ or T1,%1
+ and T2,%2
+ and T1,%3
+ or T1,T2
+%endmacro
+
+%define F4 F2
+
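+;; add the working variable %2 into the saved digest word %1 (in memory);
+;; the register keeps the updated value as well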
+%macro UPDATE_HASH 2
+ add %2, %1
+ mov %1, %2
+%endmacro
+
+
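+;; W_PRECALC i : perform one quarter of the vectorized message-schedule step
+;; for the group of rounds (i & ~3)..(i & ~3)+3.  The RR round macro calls it
+;; with i+16, so the w[]+K values are always ready 16 rounds ahead; for i in
+;; 80..95 it starts pre-computing w[0..15] of the next block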
+%macro W_PRECALC 1
+ %xdefine i (%1)
+
+ %if (i < 20)
+ %xdefine K_XMM 0
+ %elif (i < 40)
+ %xdefine K_XMM 16
+ %elif (i < 60)
+ %xdefine K_XMM 32
+ %else
+ %xdefine K_XMM 48
+ %endif
+
+ %if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD)))
+
+ %if (W_NO_TAIL_PRECALC == 0)
+
+ %xdefine i ((%1) % 80) ;; pre-compute for the next iteration
+
+ %if (i == 0)
+ W_PRECALC_RESET
+ %endif
+
+
+ W_PRECALC_00_15
+ %endif
+
+ %elif (i < 32)
+ W_PRECALC_16_31
+ %elif (i < 80) ;; rounds 32-79
+ W_PRECALC_32_79
+ %endif
+%endmacro
+
+%macro W_PRECALC_RESET 0
+ %xdefine W W0
+ %xdefine W_minus_04 W4
+ %xdefine W_minus_08 W8
+ %xdefine W_minus_12 W12
+ %xdefine W_minus_16 W16
+ %xdefine W_minus_20 W20
+ %xdefine W_minus_24 W24
+ %xdefine W_minus_28 W28
+ %xdefine W_minus_32 W
+%endmacro
+
+%macro W_PRECALC_ROTATE 0
+ %xdefine W_minus_32 W_minus_28
+ %xdefine W_minus_28 W_minus_24
+ %xdefine W_minus_24 W_minus_20
+ %xdefine W_minus_20 W_minus_16
+ %xdefine W_minus_16 W_minus_12
+ %xdefine W_minus_12 W_minus_08
+ %xdefine W_minus_08 W_minus_04
+ %xdefine W_minus_04 W
+ %xdefine W W_minus_32
+%endmacro
+
+%macro W_PRECALC_00_15 0
+ ;; message scheduling pre-compute for rounds 0-15
+ %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqu W_TMP, [BUFFER_PTR + (i * 4)]
+ %elif ((i & 3) == 1)
+ pshufb W_TMP, XMM_SHUFB_BSWAP
+ movdqa W, W_TMP
+ %elif ((i & 3) == 2)
+ paddd W_TMP, [K_BASE]
+ %elif ((i & 3) == 3)
+ movdqa [WK(i&~3)], W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+%macro W_PRECALC_16_31 0
+ ;; message scheduling pre-compute for rounds 16-31
+ ;; calculating last 32 w[i] values in 8 XMM registers
+ ;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction
+ ;;
+ ;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency
+ ;;
+ %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqa W, W_minus_12
+ palignr W, W_minus_16, 8 ;; w[i-14]
+ movdqa W_TMP, W_minus_04
+ psrldq W_TMP, 4 ;; w[i-3]
+ pxor W, W_minus_08
+ %elif ((i & 3) == 1)
+ pxor W_TMP, W_minus_16
+ pxor W, W_TMP
+ movdqa W_TMP2, W
+ movdqa W_TMP, W
+ pslldq W_TMP2, 12
+ %elif ((i & 3) == 2)
+ psrld W, 31
+ pslld W_TMP, 1
+ por W_TMP, W
+ movdqa W, W_TMP2
+ psrld W_TMP2, 30
+ pslld W, 2
+ %elif ((i & 3) == 3)
+ pxor W_TMP, W
+ pxor W_TMP, W_TMP2
+ movdqa W, W_TMP
+ paddd W_TMP, [K_BASE + K_XMM]
+ movdqa [WK(i&~3)],W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+%macro W_PRECALC_32_79 0
+	;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+	;; instead we compute the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+	;; which allows more efficient vectorization since the w[i] -> w[i-3] dependency is broken
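+	;; (the second form follows from expanding each of the four terms of the
+	;;  first recurrence one more step: the duplicated terms cancel under xor,
+	;;  leaving the same four-term shape with every offset and the rotate
+	;;  amount doubled; this holds once i >= 32)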
+ ;;
+ %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqa W_TMP, W_minus_04
+ pxor W, W_minus_28 ;; W is W_minus_32 before xor
+ palignr W_TMP, W_minus_08, 8
+ %elif ((i & 3) == 1)
+ pxor W, W_minus_16
+ pxor W, W_TMP
+ movdqa W_TMP, W
+ %elif ((i & 3) == 2)
+ psrld W, 30
+ pslld W_TMP, 2
+ por W_TMP, W
+ %elif ((i & 3) == 3)
+ movdqa W, W_TMP
+ paddd W_TMP, [K_BASE + K_XMM]
+ movdqa [WK(i&~3)],W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+%macro RR 6 ;; RR does two rounds of SHA-1 back to back with W pre-calculation
+
+	;; TEMP = A
+	;; A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i)
+	;; E = D
+	;; D = C
+	;; C = ROTATE_LEFT( B, 30 )
+	;; B = TEMP
+
+ W_PRECALC (%6 + W_PRECALC_AHEAD)
+ F %2, %3, %4 ;; F returns result in T1
+ add %5, [WK(%6)]
+ rol %2, 30
+ mov T2, %1
+ add %4, [WK(%6 + 1)]
+ rol T2, 5
+ add %5, T1
+
+ W_PRECALC (%6 + W_PRECALC_AHEAD + 1)
+ add T2, %5
+ mov %5, T2
+ rol T2, 5
+ add %4, T2
+ F %1, %2, %3 ;; F returns result in T1
+ add %4, T1
+ rol %1, 30
+
+;; write: %1, %2
+;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3
+%endmacro
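+;; note: the five state words are never moved between registers; their roles
+;; rotate by two places per RR call (one call = two rounds), so the unrolled
+;; sequence below simply permutes the argument order:
+;; A,B,C,D,E -> D,E,A,B,C -> B,C,D,E,A -> ...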
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; void sha1_opt_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : which lane to hash (packed in the low byte of r10)
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch, used to locate the digest words
+;		 (passed in the upper bits of r10; sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: all general regs (except r15), xmm0-xmm10
+; {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack}
+;
+mk_global sha1_opt_x1, function, internal
+sha1_opt_x1:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ mov [rsp + _GPR_SAVE + 8*3], rsi
+ ; caller has already stored XMM6~10
+%endif
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+ mov [rsp + _GPR_SAVE + 8*8], rdx
+
+
+	shl	NBLK, 6			; convert the block count into bytes (64 bytes per block)
+ jz .lend
+	; split the lane index (IDX, packed in the low byte) out of NLANX4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+	;; let the sha1_opt single-buffer (sb) code take over r8~r11
+ ;; Load input pointers
+ mov N_MGR, MGR
+ mov BUFFER_PTR, [MGR + _data_ptr + IDX*8]
+ ;; nblk is used to indicate data end
+ add NBLK, BUFFER_PTR
+ mov BUFFER_END, NBLK
+
+ lea K_BASE, [K_XMM_AR]
+ movdqu XMM_SHUFB_BSWAP, [bswap_shufb_ctl]
+
+ REGALLOC
+
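+	;; digest words are laid out interleaved across lanes: word j of lane
+	;; IDX lives at [MGR + 4*IDX + j*NLANX4], as the loads below reflect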
+ lea TMP, [N_MGR + 4*IDX]
+ ;; Initialize digest
+ mov A, [TMP + 0*NLANX4]
+ mov B, [TMP + 1*NLANX4]
+ mov C, [TMP + 2*NLANX4]
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ mov D, [TMP + 1*NLANX4]
+ mov E, [TMP + 2*NLANX4]
+
+ %assign i 0
+ %rep W_PRECALC_AHEAD
+ W_PRECALC i
+ %assign i i+1
+ %endrep
+
+ %xdefine F F1
+
+.lloop:
+	cmp	BUFFER_PTR, K_BASE	;; we use the K_BASE value as a sentinel for the last block,
+	jne	.lbegin			;; it is set below by: cmovae BUFFER_PTR, K_BASE
+ jmp .lend
+
+.lbegin:
+ RR A,B,C,D,E,0
+ RR D,E,A,B,C,2
+ RR B,C,D,E,A,4
+ RR E,A,B,C,D,6
+ RR C,D,E,A,B,8
+
+ RR A,B,C,D,E,10
+ RR D,E,A,B,C,12
+ RR B,C,D,E,A,14
+ RR E,A,B,C,D,16
+ RR C,D,E,A,B,18
+
+ %xdefine F F2
+
+ RR A,B,C,D,E,20
+ RR D,E,A,B,C,22
+ RR B,C,D,E,A,24
+ RR E,A,B,C,D,26
+ RR C,D,E,A,B,28
+
+ RR A,B,C,D,E,30
+ RR D,E,A,B,C,32
+ RR B,C,D,E,A,34
+ RR E,A,B,C,D,36
+ RR C,D,E,A,B,38
+
+ %xdefine F F3
+
+ RR A,B,C,D,E,40
+ RR D,E,A,B,C,42
+ RR B,C,D,E,A,44
+ RR E,A,B,C,D,46
+ RR C,D,E,A,B,48
+
+ RR A,B,C,D,E,50
+ RR D,E,A,B,C,52
+ RR B,C,D,E,A,54
+ RR E,A,B,C,D,56
+ RR C,D,E,A,B,58
+
+ %xdefine F F4
+
+ add BUFFER_PTR, 64 ;; move to next 64-byte block
+ cmp BUFFER_PTR, BUFFER_END ;; check if current block is the last one
+ cmovae BUFFER_PTR, K_BASE ;; smart way to signal the last iteration
+
+ RR A,B,C,D,E,60
+ RR D,E,A,B,C,62
+ RR B,C,D,E,A,64
+ RR E,A,B,C,D,66
+ RR C,D,E,A,B,68
+
+ RR A,B,C,D,E,70
+ RR D,E,A,B,C,72
+ RR B,C,D,E,A,74
+ RR E,A,B,C,D,76
+ RR C,D,E,A,B,78
+
+ lea TMP, [N_MGR + 4*IDX]
+ UPDATE_HASH [TMP + 0*NLANX4],A
+ UPDATE_HASH [TMP + 1*NLANX4],B
+ UPDATE_HASH [TMP + 2*NLANX4],C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ UPDATE_HASH [TMP + 1*NLANX4],D
+ UPDATE_HASH [TMP + 2*NLANX4],E
+
+ jmp .lloop
+
+.lend:
+ mov MGR, N_MGR
+
+ mov rdx, [rsp + _GPR_SAVE + 8*8]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rsi, [rsp + _GPR_SAVE + 8*3]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbp, [rsp + _GPR_SAVE + 8*1]
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+
+;;----------------------
+section .data align=64
+
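+;; each round constant is replicated into four dwords so a single paddd adds
+;; K to four schedule words at once; K_BASE + K_XMM (0/16/32/48) selects the
+;; block of rounds, and the address K_XMM_AR also doubles as the end-of-data
+;; sentinel tested in .lloop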
+align 128
+K_XMM_AR:
+ DD K1, K1, K1, K1
+ DD K2, K2, K2, K2
+ DD K3, K3, K3, K3
+ DD K4, K4, K4, K4
+
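+;; pshufb control that reverses the byte order within each 32-bit word,
+;; converting the big-endian message words to host order in W_PRECALC_00_15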
+align 16
+bswap_shufb_ctl:
+ DD 00010203h
+ DD 04050607h
+ DD 08090a0bh
+ DD 0c0d0e0fh