Diffstat (limited to 'src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm')
-rw-r--r--  src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm  507
1 file changed, 507 insertions(+), 0 deletions(-)
diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm
new file mode 100644
index 00000000..aaa5a58f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm
@@ -0,0 +1,507 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; SHA1 code, hybrid, rolled, interleaved
+; Uses SSE instructions
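+;
+; Processes exactly one 64-byte block: 80 rounds in four groups of 20,
+; using the round constants K00_19..K60_79 defined below. The message
+; schedule is kept in X0-X4 and expanded four words at a time with SSE
+; (see the do_4i macro).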
+%include "os.asm"
+
+section .data
+default rel
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+section .text
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+
+%ifdef LINUX
+%define INP rdi ; 1st arg
+%define CTX rsi ; 2nd arg
+%define REG3 ecx
+%define REG4 edx
+%else
+%define INP rcx ; 1st arg
+%define CTX rdx ; 2nd arg
+%define REG3 edi
+%define REG4 esi
+%endif
+
+%define FRAMESZ 3*16 + 1*8 ; 3 xmm save slots + 1 qword for the saved rsp
+%define _RSP FRAMESZ-1*8 + rsp ; location of the saved rsp within the frame
+
+%define a eax
+%define b ebx
+%define c REG3
+%define d REG4
+%define e r8d
+%define T1 r9d
+%define f r10d
+%define RND r11d
+%define g r12d
+%define h r13d
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XK xmm2
+
+%xdefine X0 xmm3
+%xdefine X1 xmm4
+%xdefine X2 xmm5
+%xdefine X3 xmm6
+%xdefine X4 xmm7
+
+%define XFER xmm8
+
+%define SZ 4
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X4
+%xdefine X4 X_
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
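+;   (equivalent to the FIPS Ch() function (B & C) | (~B & D); the form
+;    used here avoids a NOT instruction)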
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ mov %%regF,%%regC
+ xor %%regF,%%regD
+ and %%regF,%%regB
+ xor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ mov %%regF,%%regD
+ xor %%regF,%%regC
+ xor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
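+;   (the majority function Maj(); computed below as (B & C) | ((B | C) & D),
+;    which is equivalent)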
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ mov %%regF,%%regB
+ mov %%regT,%%regB
+ or %%regF,%%regC
+ and %%regT,%%regC
+ and %%regF,%%regD
+ or %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+;; input is T1
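+;; On entry T1 = K[i] + W[i] (one dword extracted from XFER).
+;; Computes h = e + ROL(a,5) + F(b,c,d) + T1, i.e. the next A value,
+;; and rotates b left by 30 (the next C); ROTATE_ARGS then renames the
+;; working registers for the following round.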
+%macro ROUND 1
+%define %%MAGIC %1
+ add e,T1
+ mov T1,a
+ rol T1,5
+ add e,T1
+ %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ rol b,30
+ add h,e
+ROTATE_ARGS
+%endmacro
+
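+;; do_4i: four rounds of the given MAGIC function, interleaved with the
+;; SSE computation of the next four message schedule words
+;;   W[i] = ROL(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
+;; The four new words are assembled in X4 and the schedule registers are
+;; then renamed with rotate_Xs.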
+%macro do_4i 1
+ movdqa XFER, XK
+ paddd XFER, X0
+ pextrd T1, XFER, 0
+ ;ROUND %1
+ add e,T1
+ ;SCHEDULE_4
+ movdqa XTMP0, X1
+ palignr XTMP0, X0, 8 ; XTMP0 = W[-14]
+ mov T1,a
+ movdqa XTMP1, X2
+ rol T1,5
+ pxor XTMP1, X0 ; XTMP1 = W[-8] ^ W[-16]
+ add e,T1
+ pxor XTMP0, XTMP1 ; XTMP0 = W[-8] ^ W[-14] ^ W[-16]
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+
+ ;; Finish low half
+ movdqa X4, X3
+ rol b,30
+ psrldq X4, 4 ; X4 = W[-3] {xxBA}
+ add h,e
+ROTATE_ARGS
+ pextrd T1, XFER, 1
+ ;ROUND %1
+ add e,T1
+ pxor X4, XTMP0 ; X4 = W[-3] ^ W[-8] ^ W[-14] ^ W[-16] {xxBA}
+ mov T1,a
+ movdqa XTMP1, X4
+ rol T1,5
+ ;; rotate X4 left 1
+ psrld XTMP1, (32-1)
+ add e,T1
+ pslld X4, 1
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ pxor X4, XTMP1 ; X4 = W[0] {xxBA}
+ rol b,30
+ add h,e
+ROTATE_ARGS
+ pextrd T1, XFER, 2
+ ;ROUND %1
+ add e,T1
+ movdqa XTMP1, X4
+ mov T1,a
+
+ ;; Finish high half
+ palignr XTMP1, X3, 4 ; XTMP1 = W[-3] {DCxx}
+ rol T1,5
+ add e,T1
+ pxor XTMP0, XTMP1
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ ;; rotate XTMP0 left 1
+ movdqa XTMP1, XTMP0
+ psrld XTMP1, (32-1)
+ rol b,30
+ add h,e
+ROTATE_ARGS
+ pextrd T1, XFER, 3
+ ;ROUND %1
+ add e,T1
+ mov T1,a
+ pslld XTMP0, 1
+ rol T1,5
+ add e,T1
+ pxor XTMP0, XTMP1 ; XTMP0 = W[0] {DCxx}
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ ;; COMBINE HALVES
+ shufps X4, XTMP0, 11100100b ; X4 = W[0] {DCBA}
+ rol b,30
+ add h,e
+
+ rotate_Xs
+ROTATE_ARGS
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha1_one_block_sse(void *input_data, UINT32 digest[5])
+;; arg 1 : rcx (rdi on Linux) : pointer to input data
+;; arg 2 : rdx (rsi on Linux) : pointer to digest
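+;;
+;; Hashes exactly one 64-byte block starting from the standard SHA-1
+;; initial values (any message padding is the caller's responsibility);
+;; the digest argument is output-only and receives the 5-dword result.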
+MKGLOBAL(sha1_one_block_sse,function,)
+align 32
+sha1_one_block_sse:
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+
+ ;; byte swap first 16 dwords
+ movdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ mov rax,rsp ; copy rsp
+ MOVDQ X0, [INP + 0*16]
+ sub rsp,FRAMESZ
+ MOVDQ X1, [INP + 1*16]
+ and rsp,-64 ; align stack frame
+
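+ ;; xmm6-xmm8 are callee-saved in the Windows x64 ABI, so they are
+ ;; preserved in the aligned stack frame and restored before returning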
+ movdqa [rsp + 0 * 16], xmm6
+ movdqa [rsp + 1 * 16], xmm7
+ movdqa [rsp + 2 * 16], xmm8
+
+ MOVDQ X2, [INP + 2*16]
+ mov [_RSP],rax ; save copy of rsp
+ MOVDQ X3, [INP + 3*16]
+ ;; load initial digest
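+ ;; (the FIPS 180-1 initial hash values H0-H4)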
+ mov a,0x67452301
+ pshufb X0, XTMP0
+ mov b,0xefcdab89
+ pshufb X1, XTMP0
+ mov c,0x98badcfe
+ pshufb X2, XTMP0
+ mov d,0x10325476
+ pshufb X3, XTMP0
+ mov e,0xc3d2e1f0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 00-19
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa XK, [rel K00_19]
+ mov RND, 3
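+ ;; Each 20-round group invokes do_4i five times (once on entry via
+ ;; loop1_5, then twice per remaining pass of the RND loop), giving
+ ;; 5 x 4 = 20 rounds. The extra ROTATE_ARGS/rotate_Xs around the loop
+ ;; keep the rotating register names aligned from one group to the next.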
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ jmp loop1_5
+align 16
+loop1:
+
+ do_4i MAGIC_F0
+
+loop1_5:
+ do_4i MAGIC_F0
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ movdqa X0, X2
+ movdqa X2, X4
+ movdqa X4, X1
+ movdqa X1, X3
+
+ sub RND, 1
+ jne loop1
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; end rounds 00-19
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 20-39
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa XK, [rel K20_39]
+ mov RND, 3
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ jmp loop2_5
+align 16
+loop2:
+
+ do_4i MAGIC_F1
+
+loop2_5:
+ do_4i MAGIC_F1
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ movdqa X0, X2
+ movdqa X2, X4
+ movdqa X4, X1
+ movdqa X1, X3
+
+ sub RND, 1
+ jne loop2
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; end rounds 20-39
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 40-59
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa XK, [rel K40_59]
+ mov RND, 3
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ jmp loop3_5
+align 16
+loop3:
+
+ do_4i MAGIC_F2
+
+loop3_5:
+ do_4i MAGIC_F2
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ movdqa X0, X2
+ movdqa X2, X4
+ movdqa X4, X1
+ movdqa X1, X3
+
+ sub RND, 1
+ jne loop3
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; end rounds 40-59
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+ ;; do rounds 60-79
+ movdqa XK, [rel K60_79]
+
+ do_4i MAGIC_F3
+
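+ ;; The remaining 16 rounds need no further message expansion; the
+ ;; schedule words they consume are already held in X0-X3, so the plain
+ ;; ROUND macro is used from here on.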
+ movdqa XFER, XK
+ paddd XFER, X0
+ pextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ movdqa XFER, XK
+ paddd XFER, X1
+ pextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ movdqa XFER, XK
+ paddd XFER, X2
+ pextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ movdqa XFER, XK
+ paddd XFER, X3
+ pextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ pextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
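+ ;; fold the initial hash values back in (H[i] += working variable)
+ ;; and store the 5-dword digest to CTX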
+ add a,0x67452301
+ mov [SZ*0 + CTX], a
+ add b,0xefcdab89
+ mov [SZ*1 + CTX], b
+ add c,0x98badcfe
+ mov [SZ*2 + CTX], c
+ add d,0x10325476
+ mov [SZ*3 + CTX], d
+ add e,0xc3d2e1f0
+ mov [SZ*4 + CTX], e
+
+ movdqa xmm8, [rsp + 2 * 16]
+ movdqa xmm7, [rsp + 1 * 16]
+ movdqa xmm6, [rsp + 0 * 16]
+
+ mov rsp,[_RSP]
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif