Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm')
-rw-r--r-- | src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm | 496
1 file changed, 496 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm
new file mode 100644
index 00000000..e1c21e20
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm
@@ -0,0 +1,496 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; SHA1 code, hybrid, rolled, interleaved
+; Uses AVX instructions
+%include "os.asm"
+
+section .data
+default rel
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19:                  ;ddq 0x5A8279995A8279995A8279995A827999
+        dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39:                  ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+        dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59:                  ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+        dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79:                  ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+        dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+section .text
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+%ifdef LINUX
+%define INP   rdi ; 1st arg
+%define CTX   rsi ; 2nd arg
+%define REG3  ecx
+%define REG4  edx
+%else
+%define INP   rcx ; 1st arg
+%define CTX   rdx ; 2nd arg
+%define REG3  edi
+%define REG4  esi
+%endif
+
+%define FRAMESZ 3*16 + 1*8
+%define _RSP    FRAMESZ-1*8 + rsp
+
+%define a   eax
+%define b   ebx
+%define c   REG3
+%define d   REG4
+%define e   r8d
+%define T1  r9d
+%define f   r10d
+%define RND r11d
+%define g   r12d
+%define h   r13d
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XK    xmm2
+
+%xdefine X0 xmm3
+%xdefine X1 xmm4
+%xdefine X2 xmm5
+%xdefine X3 xmm6
+%xdefine X4 xmm7
+
+%define XFER xmm8
+
+%define SZ 4
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X4
+%xdefine X4 X_
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T   ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+        mov %%regF,%%regC
+        xor %%regF,%%regD
+        and %%regF,%%regB
+        xor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T   ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+        mov %%regF,%%regD
+        xor %%regF,%%regC
+        xor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T   ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+        mov %%regF,%%regB
+        mov %%regT,%%regB
+        or  %%regF,%%regC
+        and %%regT,%%regC
+        and %%regF,%%regD
+        or  %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T   ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+        MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+;; input is T1
+%macro ROUND 1
+%define %%MAGIC %1
+        add e,T1
+        mov T1,a
+        rol T1,5
+        add e,T1
+        %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+        rol b,30
+        add h,e
+ROTATE_ARGS
+%endmacro
+
+%macro do_4i 1
+        vpaddd XFER, XK, X0
+        vpextrd T1, XFER, 0
+        ;ROUND %1
+        add e,T1
+        ;SCHEDULE_4
+        vpalignr XTMP0, X1, X0, 8       ; XTMP0 = W[-14]
+        mov T1,a
+        rol T1,5
+        vpxor XTMP1, X2, X0             ; XTMP1 = W[-8] ^ W[-16]
+        add e,T1
+        vpxor XTMP0, XTMP0, XTMP1       ; XTMP0 = W[-8] ^ W[-14] ^ W[-16]
+        %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+
+        ;; Finish low half
+        rol b,30
+        vpsrldq X4, X3, 4               ; X4 = W[-3] {xxBA}
+        add h,e
+ROTATE_ARGS
+        vpextrd T1, XFER, 1
+        ;ROUND %1
+        add e,T1
+        vpxor X4, X4, XTMP0
+        mov T1,a
+        rol T1,5
+        ;; rotate X4 left 1
+        vpsrld XTMP1, X4, (32-1)
+        add e,T1
+        vpslld X4, X4, 1
+        %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+        vpxor X4, X4, XTMP1             ; X4 = W[0] {xxBA}
+        rol b,30
+        add h,e
+ROTATE_ARGS
+        vpextrd T1, XFER, 2
+        ;ROUND %1
+        add e,T1
+        mov T1,a
+
+        ;; Finish high half
+        vpalignr XTMP1, X4, X3, 4       ; XTMP1 = w[-3] {DCxx}
+        rol T1,5
+        add e,T1
+        vpxor XTMP0, XTMP0, XTMP1
+        %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+        ;; rotate XTMP0 left 1
+        vpsrld XTMP1, XTMP0, (32-1)
+        rol b,30
+        add h,e
+ROTATE_ARGS
+        vpextrd T1, XFER, 3
+        ;ROUND %1
+        add e,T1
+        mov T1,a
+        vpslld XTMP0, XTMP0, 1
+        rol T1,5
+        add e,T1
+        vpxor XTMP0, XTMP0, XTMP1       ; XTMP0 = W[0] {DCxx}
+        %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+        ;; COMBINE HALVES
+        vshufps X4, X4, XTMP0, 11100100b ; X4 = X[0] {DCBA}
+        rol b,30
+        add h,e
+
+        rotate_Xs
+ROTATE_ARGS
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha1_one_block_avx(void *input_data, UINT32 digest[8]
+;; arg 1 : rcx : pointer to input data
+;; arg 2 : rdx : pointer to digest
+MKGLOBAL(sha1_one_block_avx,function,)
+align 32
+sha1_one_block_avx:
+        push rbx
+        push rsi
+        push rdi
+        push r12
+        push r13
+
+        ;; byte swap first 16 dwords
+        vmovdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK]
+        mov rax,rsp                     ; copy rsp
+        VMOVDQ X0, [INP + 0*16]
+        sub rsp,FRAMESZ
+        VMOVDQ X1, [INP + 1*16]
+        and rsp,-64                     ; align stack frame
+
+        vmovdqa [rsp + 0 * 16], xmm6
+        vmovdqa [rsp + 1 * 16], xmm7
+        vmovdqa [rsp + 2 * 16], xmm8
+
+        VMOVDQ X2, [INP + 2*16]
+        mov [_RSP],rax                  ; save copy of rsp
+        VMOVDQ X3, [INP + 3*16]
+        ;; load initial digest
+        mov a,0x67452301
+        vpshufb X0, XTMP0
+        mov b,0xefcdab89
+        vpshufb X1, XTMP0
+        mov c,0x98badcfe
+        vpshufb X2, XTMP0
+        mov d,0x10325476
+        vpshufb X3, XTMP0
+        mov e,0xc3d2e1f0
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; do rounds 00-19
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        vmovdqa XK, [rel K00_19]
+        mov RND, 3
+        ROTATE_ARGS
+        ROTATE_ARGS
+        ROTATE_ARGS
+        ROTATE_ARGS
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        jmp loop1_5
+align 16
+loop1:
+
+        do_4i MAGIC_F0
+
+loop1_5:
+        do_4i MAGIC_F0
+
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        vmovdqa X0, X2
+        vmovdqa X2, X4
+        vmovdqa X4, X1
+        vmovdqa X1, X3
+
+        sub RND, 1
+        jne loop1
+
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; end rounds 00-19
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; do rounds 20-39
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        vmovdqa XK, [rel K20_39]
+        mov RND, 3
+        ROTATE_ARGS
+        ROTATE_ARGS
+        ROTATE_ARGS
+        ROTATE_ARGS
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        jmp loop2_5
+align 16
+loop2:
+
+        do_4i MAGIC_F1
+
+loop2_5:
+        do_4i MAGIC_F1
+
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        vmovdqa X0, X2
+        vmovdqa X2, X4
+        vmovdqa X4, X1
+        vmovdqa X1, X3
+
+        sub RND, 1
+        jne loop2
+
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; end rounds 20-39
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; do rounds 40-59
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        vmovdqa XK, [rel K40_59]
+        mov RND, 3
+        ROTATE_ARGS
+        ROTATE_ARGS
+        ROTATE_ARGS
+        ROTATE_ARGS
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        jmp loop3_5
+align 16
+loop3:
+
+        do_4i MAGIC_F2
+
+loop3_5:
+        do_4i MAGIC_F2
+
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        vmovdqa X0, X2
+        vmovdqa X2, X4
+        vmovdqa X4, X1
+        vmovdqa X1, X3
+
+        sub RND, 1
+        jne loop3
+
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+        rotate_Xs
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; end rounds 40-59
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+        ;; do rounds 60-79
+        vmovdqa XK, [rel K60_79]
+
+        do_4i MAGIC_F3
+
+        vpaddd XFER, XK, X0
+        vpextrd T1, XFER, 0
+        ROUND MAGIC_F3
+        vpextrd T1, XFER, 1
+        ROUND MAGIC_F3
+        vpextrd T1, XFER, 2
+        ROUND MAGIC_F3
+        vpextrd T1, XFER, 3
+        ROUND MAGIC_F3
+
+        vpaddd XFER, XK, X1
+        vpextrd T1, XFER, 0
+        ROUND MAGIC_F3
+        vpextrd T1, XFER, 1
+        ROUND MAGIC_F3
+        vpextrd T1, XFER, 2
+        ROUND MAGIC_F3
+        vpextrd T1, XFER, 3
+        ROUND MAGIC_F3
+
+        vpaddd XFER, XK, X2
+        vpextrd T1, XFER, 0
+        ROUND MAGIC_F3
+        vpextrd T1, XFER, 1
+        ROUND MAGIC_F3
+        vpextrd T1, XFER, 2
+        ROUND MAGIC_F3
+        vpextrd T1, XFER, 3
+        ROUND MAGIC_F3
+
+        vpaddd XFER, XK, X3
+        vpextrd T1, XFER, 0
+        ROUND MAGIC_F3
+        vpextrd T1, XFER, 1
+        ROUND MAGIC_F3
+        vpextrd T1, XFER, 2
+        ROUND MAGIC_F3
+        vpextrd T1, XFER, 3
+        ROUND MAGIC_F3
+
+        add a,0x67452301
+        mov [SZ*0 + CTX], a
+        add b,0xefcdab89
+        mov [SZ*1 + CTX], b
+        add c,0x98badcfe
+        mov [SZ*2 + CTX], c
+        add d,0x10325476
+        mov [SZ*3 + CTX], d
+        add e,0xc3d2e1f0
+        mov [SZ*4 + CTX], e
+
+        vmovdqa xmm8, [rsp + 2 * 16]
+        vmovdqa xmm7, [rsp + 1 * 16]
+        vmovdqa xmm6, [rsp + 0 * 16]
+
+        mov rsp,[_RSP]
+        pop r13
+        pop r12
+        pop rdi
+        pop rsi
+        pop rbx
+
+        ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
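
For readers following the interleaved AVX code above, here is a minimal scalar C sketch of the same one-block SHA-1 compression. It is not part of intel-ipsec-mb: the name sha1_one_block_ref and the small test driver are illustrative assumptions. The four if-branches mirror the MAGIC_F0..MAGIC_F3 macros, the W[] recurrence mirrors the schedule that do_4i computes four lanes at a time (rotate-left-1 of W[-3] ^ W[-8] ^ W[-14] ^ W[-16]), the big-endian loads correspond to the PSHUFFLE_BYTE_FLIP_MASK shuffle, and the final additions match the tail of sha1_one_block_avx. Given the same pre-padded 64-byte block, the AVX routine (input pointer in rdi/rcx, digest pointer in rsi/rdx, per the %ifdef LINUX block) should produce the same five 32-bit words.

/*
 * Scalar reference sketch (not from intel-ipsec-mb) of the one-block SHA-1
 * compression implemented by sha1_one_block_avx in the diff above.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t x, unsigned n)
{
        return (x << n) | (x >> (32 - n));
}

/* One pre-padded 64-byte block in, five 32-bit digest words out. */
static void sha1_one_block_ref(const void *input_data, uint32_t digest[5])
{
        /* K00_19..K60_79 and the initial state, as in the .data section */
        static const uint32_t K[4] = {
                0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
        };
        uint32_t a = 0x67452301, b = 0xefcdab89, c = 0x98badcfe;
        uint32_t d = 0x10325476, e = 0xc3d2e1f0;
        const uint8_t *p = (const uint8_t *)input_data;
        uint32_t W[80];
        int t;

        /* Big-endian loads; the asm uses PSHUFFLE_BYTE_FLIP_MASK instead */
        for (t = 0; t < 16; t++)
                W[t] = ((uint32_t)p[4 * t] << 24) |
                       ((uint32_t)p[4 * t + 1] << 16) |
                       ((uint32_t)p[4 * t + 2] << 8) |
                       (uint32_t)p[4 * t + 3];

        /* Message schedule; do_4i produces four of these per invocation */
        for (t = 16; t < 80; t++)
                W[t] = rol32(W[t - 3] ^ W[t - 8] ^ W[t - 14] ^ W[t - 16], 1);

        for (t = 0; t < 80; t++) {
                uint32_t f, tmp;

                if (t < 20)             /* MAGIC_F0: D ^ (B & (C ^ D)) */
                        f = d ^ (b & (c ^ d));
                else if (t < 40)        /* MAGIC_F1: B ^ C ^ D */
                        f = b ^ c ^ d;
                else if (t < 60)        /* MAGIC_F2: majority of B, C, D */
                        f = (b & c) | (b & d) | (c & d);
                else                    /* MAGIC_F3 is MAGIC_F1 again */
                        f = b ^ c ^ d;

                /* What ROUND accomplishes: new a, then rotate the variables */
                tmp = rol32(a, 5) + f + e + K[t / 20] + W[t];
                e = d;
                d = c;
                c = rol32(b, 30);
                b = a;
                a = tmp;
        }

        /* Final additions, matching the tail of sha1_one_block_avx */
        digest[0] = a + 0x67452301;
        digest[1] = b + 0xefcdab89;
        digest[2] = c + 0x98badcfe;
        digest[3] = d + 0x10325476;
        digest[4] = e + 0xc3d2e1f0;
}

int main(void)
{
        /* "abc" padded into a single SHA-1 block: 0x80, zeros, 64-bit length */
        uint8_t block[64] = { 'a', 'b', 'c', 0x80 };
        uint32_t digest[5];
        int i;

        block[63] = 24;         /* message length in bits */
        sha1_one_block_ref(block, digest);
        /* Expected: a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d */
        for (i = 0; i < 5; i++)
                printf("%08x", (unsigned)digest[i]);
        printf("\n");
        return 0;
}

Like the assembly routine, this sketch starts from the fixed SHA-1 initial values and handles exactly one block; padding and multi-block processing are the caller's responsibility.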