;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2011-2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "sha1_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

[bits 64]
default rel
section .text

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Argument registers.
;; NOTE(review): the argument registers are selected by an `elf64` test,
;; while the rdi/rsi save/restore in sha1_opt_x1 is selected by a `win64`
;; test.  Any other output format gets rcx/rdx here without the rdi/rsi
;; save -- confirm this matches what the callers do.
%ifidn __OUTPUT_FORMAT__, elf64
 ; Linux
 %define arg0  rdi
 %define arg1  rsi
%else
 ; Windows
 %define arg0  rcx
 %define arg1  rdx
%endif

;; Stack frame layout (offsets from rsp after the prologue):
;;   [_WK_SAVE  .. +_WK_SAVE_SIZE)   circular buffer of pre-computed w[i]+K
;;   [_GPR_SAVE .. +_GPR_SAVE_SIZE)  saved general purpose registers
;; FRAMESZ plus pushes must be an odd multiple of 8
_GPR_SAVE_SIZE  equ 8*9         ; rbx, rdx, rbp, (rdi, rsi), r12~r15
_WK_SAVE_SIZE   equ 16*4

_WK_SAVE        equ 0
_GPR_SAVE       equ _WK_SAVE + _WK_SAVE_SIZE
STACK_SPACE     equ _GPR_SAVE + _GPR_SAVE_SIZE

; arg indices start from 0 here, while mgr_flush/submit count from 1
%define MGR     arg0
%define NBLK    arg1
%define NLANX4  r10     ; consistent with caller
; rax~rdx, rsi, rdi, rbp are used for RR
%define N_MGR   r8      ; copy of MGR that survives the rounds
%define IDX     r9      ; local variable -- consistent with caller
%define K_BASE  r11
%define BUFFER_PTR r12
%define BUFFER_END r13
%define TMP     r14     ; local variable -- assists in addressing the digest

%xdefine W_TMP  xmm0
%xdefine W_TMP2 xmm9

%xdefine W0     xmm1
%xdefine W4     xmm2
%xdefine W8     xmm3
%xdefine W12    xmm4
%xdefine W16    xmm5
%xdefine W20    xmm6
%xdefine W24    xmm7
%xdefine W28    xmm8

%xdefine XMM_SHUFB_BSWAP xmm10

;; Pre-calculated w[i]+K values live in a circular buffer at rsp:
;; WK(t) addresses the 4-byte slot (t mod 16), i.e. 16 slots / 64 bytes.
%xdefine WK(t)  (rsp + (t & 15)*4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Constants

%xdefine K1 0x5a827999
%xdefine K2 0x6ed9eba1
%xdefine K3 0x8f1bbcdc
%xdefine K4 0xca62c1d6

%xdefine W_PRECALC_AHEAD   16
%xdefine W_NO_TAIL_PRECALC 0

; Rounds macros

;; REGALLOC: bind the five working variables and the two temporaries
;; to 32-bit general purpose registers.
%macro REGALLOC 0
        %xdefine A      ecx
        %xdefine B      esi
        %xdefine C      edi
        %xdefine D      ebp
        %xdefine E      edx

        %xdefine T1     eax
        %xdefine T2     ebx
%endmacro

;; F1 b, c, d : T1 = ((c ^ d) & b) ^ d
%macro F1 3
        mov     T1, %2
        xor     T1, %3
        and     T1, %1
        xor     T1, %3
%endmacro

;; F2 b, c, d : T1 = d ^ c ^ b
%macro F2 3
        mov     T1, %3
        xor     T1, %2
        xor     T1, %1
%endmacro

;; F3 b, c, d : T1 = ((c | b) & d) | (b & c)
;; Clobbers T2.
%macro F3 3
        mov     T1, %2
        mov     T2, %1
        or      T1, %1
        and     T2, %2
        and     T1, %3
        or      T1, T2
%endmacro

%define F4 F2

;; UPDATE_HASH mem, reg : reg += mem ; mem = reg
%macro UPDATE_HASH 2
        add     %2, %1
        mov     %1, %2
%endmacro

;; W_PRECALC idx
;; Emits one quarter-step of message-schedule pre-computation for schedule
;; index `idx`.  K_XMM is set to the byte offset (from K_BASE) of the round
;; constant group matching idx.  Indices 80..(80+W_PRECALC_AHEAD-1) wrap
;; around to 0..15 so that the next block is pre-computed during the last
;; rounds of the current one.
%macro W_PRECALC 1
        %xdefine i (%1)

        %if (i < 20)
                %xdefine K_XMM  0
        %elif (i < 40)
                %xdefine K_XMM  16
        %elif (i < 60)
                %xdefine K_XMM  32
        %else
                %xdefine K_XMM  48
        %endif

        %if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD)))

                %if (W_NO_TAIL_PRECALC == 0)

                        %xdefine i ((%1) % 80)        ;; pre-compute for the next iteration

                        %if (i == 0)
                                W_PRECALC_RESET
                        %endif

                        W_PRECALC_00_15
                %endif

        %elif (i < 32)
                W_PRECALC_16_31
        %elif (i < 80)   ;; rounds 32-79
                W_PRECALC_32_79
        %endif
%endmacro

;; W_PRECALC_RESET: put the symbolic W / W_minus_NN names back to their
;; initial xmm register assignment.
%macro W_PRECALC_RESET 0
        %xdefine W             W0
        %xdefine W_minus_04    W4
        %xdefine W_minus_08    W8
        %xdefine W_minus_12    W12
        %xdefine W_minus_16    W16
        %xdefine W_minus_20    W20
        %xdefine W_minus_24    W24
        %xdefine W_minus_28    W28
        %xdefine W_minus_32    W
%endmacro

;; W_PRECALC_ROTATE: shift every symbolic name by one 4-word group; the
;; register that held W_minus_32 is reused as the new W.  Assembly-time
;; renaming only, no instructions are emitted.
%macro W_PRECALC_ROTATE 0
        %xdefine W_minus_32    W_minus_28
        %xdefine W_minus_28    W_minus_24
        %xdefine W_minus_24    W_minus_20
        %xdefine W_minus_20    W_minus_16
        %xdefine W_minus_16    W_minus_12
        %xdefine W_minus_12    W_minus_08
        %xdefine W_minus_08    W_minus_04
        %xdefine W_minus_04    W
        %xdefine W             W_minus_32
%endmacro

%macro W_PRECALC_00_15 0
        ;; message scheduling pre-compute for rounds 0-15
        %if ((i & 3) == 0)       ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
                movdqu  W_TMP, [BUFFER_PTR + (i * 4)]   ;; load 4 message words
        %elif ((i & 3) == 1)
                pshufb  W_TMP, XMM_SHUFB_BSWAP          ;; byte-swap each dword
                movdqa  W, W_TMP
        %elif ((i & 3) == 2)
                paddd   W_TMP, [K_BASE]                 ;; add first constant group
        %elif ((i & 3) == 3)
                movdqa  [WK(i&~3)], W_TMP               ;; store 4 w[i]+K values

                W_PRECALC_ROTATE
        %endif
%endmacro

%macro W_PRECALC_16_31 0
        ;; message scheduling pre-compute for rounds 16-31
        ;; calculating last 32 w[i] values in 8 XMM registers
        ;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction
        ;;
        ;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency
        ;;
        %if ((i & 3) == 0)       ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
                movdqa  W, W_minus_12
                palignr W, W_minus_16, 8        ;; w[i-14]
                movdqa  W_TMP, W_minus_04
                psrldq  W_TMP, 4                ;; w[i-3]
                pxor    W, W_minus_08
        %elif ((i & 3) == 1)
                pxor    W_TMP, W_minus_16
                pxor    W, W_TMP
                movdqa  W_TMP2, W
                movdqa  W_TMP, W
                pslldq  W_TMP2, 12
        %elif ((i & 3) == 2)
                psrld   W, 31
                pslld   W_TMP, 1
                por     W_TMP, W
                movdqa  W, W_TMP2
                psrld   W_TMP2, 30
                pslld   W, 2
        %elif ((i & 3) == 3)
                pxor    W_TMP, W
                pxor    W_TMP, W_TMP2
                movdqa  W, W_TMP
                paddd   W_TMP, [K_BASE + K_XMM]
                movdqa  [WK(i&~3)], W_TMP

                W_PRECALC_ROTATE
        %endif
%endmacro

%macro W_PRECALC_32_79 0
        ;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
        ;; instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
        ;; allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
        ;;
        %if ((i & 3) == 0)       ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
                movdqa  W_TMP, W_minus_04
                pxor    W, W_minus_28           ;; W is W_minus_32 before xor
                palignr W_TMP, W_minus_08, 8
        %elif ((i & 3) == 1)
                pxor    W, W_minus_16
                pxor    W, W_TMP
                movdqa  W_TMP, W
        %elif ((i & 3) == 2)
                psrld   W, 30
                pslld   W_TMP, 2
                por     W_TMP, W
        %elif ((i & 3) == 3)
                movdqa  W, W_TMP
                paddd   W_TMP, [K_BASE + K_XMM]
                movdqa  [WK(i&~3)], W_TMP

                W_PRECALC_ROTATE
        %endif
%endmacro

;; RR a, b, c, d, e, round
%macro RR 6
        ;; RR does two rounds of SHA-1 back to back with W pre-calculation

        ;;     TEMP = A
        ;;     A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i)
        ;;     C = ROTATE_LEFT( B, 30 )
        ;;     D = C
        ;;     E = D
        ;;     B = TEMP
        ;;
        ;; Uses T1 (result of F) and T2 as scratch.  F is whichever of
        ;; F1..F4 the caller has currently bound to the name F.

        W_PRECALC (%6 + W_PRECALC_AHEAD)
        F       %2, %3, %4              ;; F returns result in T1
        add     %5, [WK(%6)]
        rol     %2, 30
        mov     T2, %1
        add     %4, [WK(%6 + 1)]
        rol     T2, 5
        add     %5, T1

        W_PRECALC (%6 + W_PRECALC_AHEAD + 1)
        add     T2, %5
        mov     %5, T2
        rol     T2, 5
        add     %4, T2
        F       %1, %2, %3              ;; F returns result in T1
        add     %4, T1
        rol     %1, 30

        ;; write:  %1, %2
        ;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; void sha1_opt_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
; invisible arg 2 : IDX : hash on which lane
; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
;               (sse/avx is 4, avx2 is 8, avx512 is 16)
;
; The two invisible args arrive packed in r10: the low 8 bits are IDX and
; the remaining bits (r10 >> 8) are NLANX4.
; Digest word j of the selected lane is at [MGR + 4*IDX + j*NLANX4]; the
; input data pointer is read from [MGR + _data_ptr + 8*IDX].
;
; Clobbers registers: all general regs (except r15), xmm0-xmm10
; {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack}
;
; If NBLK is 0 the function returns without touching the digest and
; without modifying MGR.
;
mk_global sha1_opt_x1, function, internal
sha1_opt_x1:
        endbranch

        sub     rsp, STACK_SPACE

        mov     [rsp + _GPR_SAVE + 8*0], rbx
        mov     [rsp + _GPR_SAVE + 8*1], rbp
%ifidn __OUTPUT_FORMAT__, win64
        mov     [rsp + _GPR_SAVE + 8*2], rdi
        mov     [rsp + _GPR_SAVE + 8*3], rsi
        ; caller has already stored XMM6~10
%endif
        mov     [rsp + _GPR_SAVE + 8*4], r12
        mov     [rsp + _GPR_SAVE + 8*5], r13
        mov     [rsp + _GPR_SAVE + 8*6], r14
        mov     [rsp + _GPR_SAVE + 8*7], r15
        mov     [rsp + _GPR_SAVE + 8*8], rdx

        shl     NBLK, 6                 ; transform block count into bytes
        ; Nothing to hash: leave through .lrestore, not .lend.  N_MGR has
        ; not been loaded yet, so the "mov MGR, N_MGR" at .lend would
        ; overwrite MGR with whatever r8 held on entry.
        jz      .lrestore

        ; detach idx from nlanx4
        mov     IDX, NLANX4
        shr     NLANX4, 8
        and     IDX, 0xff

        ;; let sha1_opt sb take over r8~r11
        ;; Load input pointers
        mov     N_MGR, MGR              ; MGR's register is reused by the rounds
        mov     BUFFER_PTR, [MGR + _data_ptr + IDX*8]
        ;; nblk is used to indicate data end
        add     NBLK, BUFFER_PTR
        mov     BUFFER_END, NBLK

        lea     K_BASE, [K_XMM_AR]
        movdqu  XMM_SHUFB_BSWAP, [bswap_shufb_ctl]

        REGALLOC

        lea     TMP, [N_MGR + 4*IDX]
        ;; Initialize digest
        mov     A, [TMP + 0*NLANX4]
        mov     B, [TMP + 1*NLANX4]
        mov     C, [TMP + 2*NLANX4]
        lea     TMP, [TMP + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
        mov     D, [TMP + 1*NLANX4]
        mov     E, [TMP + 2*NLANX4]

        ;; Pre-compute w[0..15]+K of the first block before entering the loop
        %assign i 0
        %rep    W_PRECALC_AHEAD
                W_PRECALC i
                %assign i i+1
        %endrep

        %xdefine F F1

.lloop:
        cmp     BUFFER_PTR, K_BASE      ;; we use K_BASE value as a signal of a last block,
        jne     .lbegin                 ;; it is set below by: cmovae BUFFER_PTR, K_BASE
        jmp     .lend

.lbegin:
        RR      A,B,C,D,E,0
        RR      D,E,A,B,C,2
        RR      B,C,D,E,A,4
        RR      E,A,B,C,D,6
        RR      C,D,E,A,B,8

        RR      A,B,C,D,E,10
        RR      D,E,A,B,C,12
        RR      B,C,D,E,A,14
        RR      E,A,B,C,D,16
        RR      C,D,E,A,B,18

        %xdefine F F2

        RR      A,B,C,D,E,20
        RR      D,E,A,B,C,22
        RR      B,C,D,E,A,24
        RR      E,A,B,C,D,26
        RR      C,D,E,A,B,28

        RR      A,B,C,D,E,30
        RR      D,E,A,B,C,32
        RR      B,C,D,E,A,34
        RR      E,A,B,C,D,36
        RR      C,D,E,A,B,38

        %xdefine F F3

        RR      A,B,C,D,E,40
        RR      D,E,A,B,C,42
        RR      B,C,D,E,A,44
        RR      E,A,B,C,D,46
        RR      C,D,E,A,B,48

        RR      A,B,C,D,E,50
        RR      D,E,A,B,C,52
        RR      B,C,D,E,A,54
        RR      E,A,B,C,D,56
        RR      C,D,E,A,B,58

        %xdefine F F4

        ;; The pointer must be advanced before the RR calls with round >= 64:
        ;; those issue W_PRECALC for indices >= 80, which load the next
        ;; block through BUFFER_PTR.  After the last block BUFFER_PTR is
        ;; K_BASE, so those loads read from the constant table instead.
        add     BUFFER_PTR, 64          ;; move to next 64-byte block
        cmp     BUFFER_PTR, BUFFER_END  ;; check if current block is the last one
        cmovae  BUFFER_PTR, K_BASE      ;; smart way to signal the last iteration

        RR      A,B,C,D,E,60
        RR      D,E,A,B,C,62
        RR      B,C,D,E,A,64
        RR      E,A,B,C,D,66
        RR      C,D,E,A,B,68

        RR      A,B,C,D,E,70
        RR      D,E,A,B,C,72
        RR      B,C,D,E,A,74
        RR      E,A,B,C,D,76
        RR      C,D,E,A,B,78

        ;; Add the working variables into the digest and keep the sums in
        ;; A..E as the starting state of the next block.
        lea     TMP, [N_MGR + 4*IDX]
        UPDATE_HASH [TMP + 0*NLANX4],A
        UPDATE_HASH [TMP + 1*NLANX4],B
        UPDATE_HASH [TMP + 2*NLANX4],C
        lea     TMP, [TMP + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
        UPDATE_HASH [TMP + 1*NLANX4],D
        UPDATE_HASH [TMP + 2*NLANX4],E

        jmp     .lloop

.lend:
        mov     MGR, N_MGR              ; give MGR back its original value

.lrestore:
        mov     rdx, [rsp + _GPR_SAVE + 8*8]
        mov     r15, [rsp + _GPR_SAVE + 8*7]
        mov     r14, [rsp + _GPR_SAVE + 8*6]
        mov     r13, [rsp + _GPR_SAVE + 8*5]
        mov     r12, [rsp + _GPR_SAVE + 8*4]
%ifidn __OUTPUT_FORMAT__, win64
        mov     rsi, [rsp + _GPR_SAVE + 8*3]
        mov     rdi, [rsp + _GPR_SAVE + 8*2]
%endif
        mov     rbp, [rsp + _GPR_SAVE + 8*1]
        mov     rbx, [rsp + _GPR_SAVE + 8*0]

        add     rsp, STACK_SPACE

        ret


;;----------------------
section .data align=64

;; Round constants, one group of four identical dwords per 20-round stage;
;; W_PRECALC addresses them as [K_BASE + K_XMM].
align 128
K_XMM_AR:
        DD K1, K1, K1, K1
        DD K2, K2, K2, K2
        DD K3, K3, K3, K3
        DD K4, K4, K4, K4

;; pshufb control mask that reverses the byte order inside each dword
align 16
bswap_shufb_ctl:
        DD 00010203h
        DD 04050607h
        DD 08090a0bh
        DD 0c0d0e0fh