;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

; SHA1 code, hybrid, rolled, interleaved
; Uses SSE instructions

%include "os.asm"

section .data
default rel
align 16

PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
K00_19:                  ;ddq 0x5A8279995A8279995A8279995A827999
        dq 0x5A8279995A827999, 0x5A8279995A827999
K20_39:                  ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
        dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59:                  ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
        dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79:                  ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
        dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6

section .text

%define MOVDQ movdqu            ;; assume buffers not aligned

%ifdef LINUX
%define INP     rdi             ; 1st arg
%define CTX     rsi             ; 2nd arg
%define REG3    ecx
%define REG4    edx
%else
%define INP     rcx             ; 1st arg
%define CTX     rdx             ; 2nd arg
%define REG3    edi
%define REG4    esi
%endif

%define FRAMESZ 3*16 + 1*8
%define _RSP    FRAMESZ-1*8 + rsp

%define a   eax
%define b   ebx
%define c   REG3
%define d   REG4
%define e   r8d
%define T1  r9d
%define f   r10d
%define RND r11d
%define g   r12d
%define h   r13d

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XK    xmm2

%xdefine X0 xmm3
%xdefine X1 xmm4
%xdefine X2 xmm5
%xdefine X3 xmm6
%xdefine X4 xmm7

%define XFER xmm8

%define SZ 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X4
%xdefine X4 X_
%endmacro

%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

;; Magic functions defined in FIPS 180-1
;;
; macro MAGIC_F0 F,B,C,D,T   ;; F = (D ^ (B & (C ^ D)))
%macro MAGIC_F0 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        mov  %%regF,%%regC
        xor  %%regF,%%regD
        and  %%regF,%%regB
        xor  %%regF,%%regD
%endmacro

; macro MAGIC_F1 F,B,C,D,T   ;; F = (B ^ C ^ D)
%macro MAGIC_F1 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        mov  %%regF,%%regD
        xor  %%regF,%%regC
        xor  %%regF,%%regB
%endmacro
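;; Note (FIPS 180-1): MAGIC_F0 is the rearranged form of the choice
;; function (B & C) | (~B & D), which saves a NOT; MAGIC_F1 and MAGIC_F3
;; are the parity function B ^ C ^ D, and MAGIC_F2 is the majority
;; function. Rounds 00-19 pair MAGIC_F0 with K00_19, rounds 20-39 pair
;; MAGIC_F1 with K20_39, rounds 40-59 pair MAGIC_F2 with K40_59 and
;; rounds 60-79 pair MAGIC_F3 with K60_79.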
; macro MAGIC_F2 F,B,C,D,T   ;; F = ((B & C) | (B & D) | (C & D))
%macro MAGIC_F2 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        mov  %%regF,%%regB
        mov  %%regT,%%regB
        or   %%regF,%%regC
        and  %%regT,%%regC
        and  %%regF,%%regD
        or   %%regF,%%regT
%endmacro

; macro MAGIC_F3 F,B,C,D,T   ;; F = (B ^ C ^ D)
%macro MAGIC_F3 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
%endmacro

;; input is T1
%macro ROUND 1
%define %%MAGIC %1
        add  e,T1
        mov  T1,a
        rol  T1,5
        add  e,T1
        %%MAGIC h,b,c,d,T1      ;; FUN = MAGIC_Fi(B,C,D)
        rol  b,30
        add  h,e
        ROTATE_ARGS
%endmacro

%macro do_4i 1
        movdqa  XFER, XK
        paddd   XFER, X0
        pextrd  T1, XFER, 0
        ;ROUND %1
        add     e,T1
        ;SCHEDULE_4
        movdqa  XTMP0, X1
        palignr XTMP0, X0, 8    ; XTMP0 = W[-14]
        mov     T1,a
        movdqa  XTMP1, X2
        rol     T1,5
        pxor    XTMP1, X0       ; XTMP1 = W[-8] ^ W[-16]
        add     e,T1
        pxor    XTMP0, XTMP1    ; XTMP0 = W[-8] ^ W[-14] ^ W[-16]
        %1 h,b,c,d,T1           ;; FUN = MAGIC_Fi(B,C,D)

        ;; Finish low half
        movdqa  X4, X3
        rol     b,30
        psrldq  X4, 4           ; X4 = W[-3] {xxBA}
        add     h,e
        ROTATE_ARGS
        pextrd  T1, XFER, 1
        ;ROUND %1
        add     e,T1
        pxor    X4, XTMP0
        mov     T1,a
        movdqa  XTMP1, X4
        rol     T1,5
        ;; rotate X4 left 1
        psrld   XTMP1, (32-1)
        add     e,T1
        pslld   X4, 1
        %1 h,b,c,d,T1           ;; FUN = MAGIC_Fi(B,C,D)
        pxor    X4, XTMP1       ; X4 = W[0] {xxBA}
        rol     b,30
        add     h,e
        ROTATE_ARGS
        pextrd  T1, XFER, 2
        ;ROUND %1
        add     e,T1
        movdqa  XTMP1, X4
        mov     T1,a

        ;; Finish high half
        palignr XTMP1, X3, 4    ; XTMP1 = W[-3] {DCxx}
        rol     T1,5
        add     e,T1
        pxor    XTMP0, XTMP1
        %1 h,b,c,d,T1           ;; FUN = MAGIC_Fi(B,C,D)
        ;; rotate XTMP0 left 1
        movdqa  XTMP1, XTMP0
        psrld   XTMP1, (32-1)
        rol     b,30
        add     h,e
        ROTATE_ARGS
        pextrd  T1, XFER, 3
        ;ROUND %1
        add     e,T1
        mov     T1,a
        pslld   XTMP0, 1
        rol     T1,5
        add     e,T1
        pxor    XTMP0, XTMP1    ; XTMP0 = W[0] {DCxx}
        %1 h,b,c,d,T1           ;; FUN = MAGIC_Fi(B,C,D)
        ;; COMBINE HALVES
        shufps  X4, XTMP0, 11100100b    ; X4 = X[0] {DCBA}
        rol     b,30
        add     h,e
        rotate_Xs
        ROTATE_ARGS
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; void sha1_one_block_sse(void *input_data, UINT32 digest[5])
;; arg 1 : rcx : pointer to input data
;; arg 2 : rdx : pointer to digest
MKGLOBAL(sha1_one_block_sse,function,)
align 32
sha1_one_block_sse:
        push rbx
        push rsi
        push rdi
        push r12
        push r13

        ;; byte swap first 16 dwords
        movdqa  XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK]
        mov     rax,rsp                 ; copy rsp
        MOVDQ   X0, [INP + 0*16]
        sub     rsp,FRAMESZ
        MOVDQ   X1, [INP + 1*16]
        and     rsp,-64                 ; align stack frame
        movdqa  [rsp + 0 * 16], xmm6
        movdqa  [rsp + 1 * 16], xmm7
        movdqa  [rsp + 2 * 16], xmm8
        MOVDQ   X2, [INP + 2*16]
        mov     [_RSP],rax              ; save copy of rsp
        MOVDQ   X3, [INP + 3*16]

        ;; load initial digest
        mov     a,0x67452301
        pshufb  X0, XTMP0
        mov     b,0xefcdab89
        pshufb  X1, XTMP0
        mov     c,0x98badcfe
        pshufb  X2, XTMP0
        mov     d,0x10325476
        pshufb  X3, XTMP0
        mov     e,0xc3d2e1f0

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; do rounds 00-19
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        movdqa  XK, [rel K00_19]
        mov     RND, 3
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        jmp     loop1_5
align 16
loop1:
        do_4i   MAGIC_F0
loop1_5:
        do_4i   MAGIC_F0
        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        movdqa  X0, X2
        movdqa  X2, X4
        movdqa  X4, X1
        movdqa  X1, X3

        sub     RND, 1
        jne     loop1

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; end rounds 00-19
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
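;; Note: each 20-round block above and below calls do_4i five times
;; (once on entry at loopN_5, then twice on each of the two remaining
;; passes as RND counts down from 3). Every call processes four rounds
;; and expands four message-schedule words with
;; W[i] = ROL1(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]), computed as a low
;; half {xxBA} and a high half {DCxx} that shufps recombines into X4.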
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; do rounds 20-39
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        movdqa  XK, [rel K20_39]
        mov     RND, 3
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        jmp     loop2_5
align 16
loop2:
        do_4i   MAGIC_F1
loop2_5:
        do_4i   MAGIC_F1
        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        movdqa  X0, X2
        movdqa  X2, X4
        movdqa  X4, X1
        movdqa  X1, X3

        sub     RND, 1
        jne     loop2

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; end rounds 20-39
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; do rounds 40-59
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        movdqa  XK, [rel K40_59]
        mov     RND, 3
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        ROTATE_ARGS
        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        jmp     loop3_5
align 16
loop3:
        do_4i   MAGIC_F2
loop3_5:
        do_4i   MAGIC_F2
        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
        movdqa  X0, X2
        movdqa  X2, X4
        movdqa  X4, X1
        movdqa  X1, X3

        sub     RND, 1
        jne     loop3

        rotate_Xs
        rotate_Xs
        rotate_Xs
        rotate_Xs
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; end rounds 40-59
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;; do rounds 60-79
        movdqa  XK, [rel K60_79]

        do_4i   MAGIC_F3

        movdqa  XFER, XK
        paddd   XFER, X0
        pextrd  T1, XFER, 0
        ROUND   MAGIC_F3
        pextrd  T1, XFER, 1
        ROUND   MAGIC_F3
        pextrd  T1, XFER, 2
        ROUND   MAGIC_F3
        pextrd  T1, XFER, 3
        ROUND   MAGIC_F3

        movdqa  XFER, XK
        paddd   XFER, X1
        pextrd  T1, XFER, 0
        ROUND   MAGIC_F3
        pextrd  T1, XFER, 1
        ROUND   MAGIC_F3
        pextrd  T1, XFER, 2
        ROUND   MAGIC_F3
        pextrd  T1, XFER, 3
        ROUND   MAGIC_F3

        movdqa  XFER, XK
        paddd   XFER, X2
        pextrd  T1, XFER, 0
        ROUND   MAGIC_F3
        pextrd  T1, XFER, 1
        ROUND   MAGIC_F3
        pextrd  T1, XFER, 2
        ROUND   MAGIC_F3
        pextrd  T1, XFER, 3
        ROUND   MAGIC_F3

        movdqa  XFER, XK
        paddd   XFER, X3
        pextrd  T1, XFER, 0
        ROUND   MAGIC_F3
        pextrd  T1, XFER, 1
        ROUND   MAGIC_F3
        pextrd  T1, XFER, 2
        ROUND   MAGIC_F3
        pextrd  T1, XFER, 3
        ROUND   MAGIC_F3

        ;; add the initial digest values and store the result
        add     a,0x67452301
        mov     [SZ*0 + CTX], a
        add     b,0xefcdab89
        mov     [SZ*1 + CTX], b
        add     c,0x98badcfe
        mov     [SZ*2 + CTX], c
        add     d,0x10325476
        mov     [SZ*3 + CTX], d
        add     e,0xc3d2e1f0
        mov     [SZ*4 + CTX], e

        ;; restore callee-saved xmm registers and the stack pointer
        movdqa  xmm8, [rsp + 2 * 16]
        movdqa  xmm7, [rsp + 1 * 16]
        movdqa  xmm6, [rsp + 0 * 16]

        mov     rsp,[_RSP]

        pop     r13
        pop     r12
        pop     rdi
        pop     rsi
        pop     rbx

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif