diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm | |
parent | Initial commit. (diff) | |
download | ceph-b26c4052f3542036551aa9dec9caa4226e456195.tar.xz ceph-b26c4052f3542036551aa9dec9caa4226e456195.zip |
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm')
-rw-r--r-- | src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm | 485 |
1 files changed, 485 insertions, 0 deletions
; Extracted from the diff that adds
; src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm (new file, blob aeb00a008).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Syntax: NASM (Intel operand order), 64-bit mode, RIP-relative by default.
;; Single-lane SHA-1 block function; see the header above sha1_opt_x1 for
;; its register-based calling contract.

%include "sha1_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

[bits 64]
default rel
section .text

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifidn __OUTPUT_FORMAT__, elf64
        ; Linux
        %define arg0    rdi
        %define arg1    rsi
%else
        ; Windows (note: every non-elf64 output format takes this branch)
        %define arg0    rcx
        %define arg1    rdx
%endif

;; Stack frame (no pushes are used):
;;   [rsp + _WK_SAVE  .. +63 ]  circular buffer of pre-computed w[i]+K dwords
;;   [rsp + _GPR_SAVE .. +135]  saved general purpose registers
;; FRAMESZ plus pushes must be an odd multiple of 8 (here 136 = 17*8), so that
;; rsp is 16-byte aligned for the movdqa stores into WK() -- this assumes the
;; usual rsp % 16 == 8 at function entry.
_GPR_SAVE_SIZE  equ 8*9         ;rbx, rdx, rbp, (rdi, rsi), r12~r15
_WK_SAVE_SIZE   equ 16*4

_WK_SAVE        equ 0
_GPR_SAVE       equ _WK_SAVE + _WK_SAVE_SIZE
STACK_SPACE     equ _GPR_SAVE + _GPR_SAVE_SIZE

; Argument indices here start from 0, while mgr_flush/submit count from 1
%define MGR             arg0
%define NBLK            arg1
%define NLANX4          r10     ; consistent with caller
; rax~rdx, rsi, rdi, rbp are used for RR
%define N_MGR           r8      ; copy of MGR that survives the rounds
%define IDX             r9      ; local variable -- consistent with caller
%define K_BASE          r11
%define BUFFER_PTR      r12
%define BUFFER_END      r13
%define TMP             r14     ; local variable -- assistant to address digest

%xdefine W_TMP  xmm0
%xdefine W_TMP2 xmm9

%xdefine W0     xmm1
%xdefine W4     xmm2
%xdefine W8     xmm3
%xdefine W12    xmm4
%xdefine W16    xmm5
%xdefine W20    xmm6
%xdefine W24    xmm7
%xdefine W28    xmm8

%xdefine XMM_SHUFB_BSWAP xmm10

;; we keep a 64-byte window (16 dword slots) of pre-calculated w[i]+K values
;; in a circular buffer at the bottom of the stack frame
%xdefine WK(t) (rsp + (t & 15)*4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Constants

%xdefine K1 0x5a827999
%xdefine K2 0x6ed9eba1
%xdefine K3 0x8f1bbcdc
%xdefine K4 0xca62c1d6

%xdefine W_PRECALC_AHEAD   16   ; how many rounds ahead w[i]+K is computed
%xdefine W_NO_TAIL_PRECALC 0    ; 0: rounds 64-79 pre-compute the next block

; Rounds macros

;; Bind the symbolic SHA-1 state names and the two scratch names to registers.
%macro REGALLOC 0
        %xdefine A      ecx
        %xdefine B      esi
        %xdefine C      edi
        %xdefine D      ebp
        %xdefine E      edx

        %xdefine T1     eax
        %xdefine T2     ebx
%endmacro

;; F1 b, c, d : T1 = ((c ^ d) & b) ^ d
%macro F1 3
        mov     T1,%2
        xor     T1,%3
        and     T1,%1
        xor     T1,%3
%endmacro

;; F2 b, c, d : T1 = b ^ c ^ d
%macro F2 3
        mov     T1,%3
        xor     T1,%2
        xor     T1,%1
%endmacro

;; F3 b, c, d : T1 = ((b | c) & d) | (b & c)   (clobbers T2)
%macro F3 3
        mov     T1,%2
        mov     T2,%1
        or      T1,%1
        and     T2,%2
        and     T1,%3
        or      T1,T2
%endmacro

%define F4 F2

;; UPDATE_HASH mem, reg : reg += mem; mem = reg
%macro UPDATE_HASH 2
        add     %2, %1
        mov     %1, %2
%endmacro


;; W_PRECALC r : emit the slice of message-schedule work belonging to
;; schedule index r. Sets i and K_XMM (byte offset of the round constant
;; row inside K_XMM_AR) for the W_PRECALC_* helpers below.
%macro W_PRECALC 1
        %xdefine i (%1)

        %if (i < 20)
                %xdefine K_XMM  0
        %elif (i < 40)
                %xdefine K_XMM  16
        %elif (i < 60)
                %xdefine K_XMM  32
        %else
                %xdefine K_XMM  48
        %endif

        %if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD)))

                %if (W_NO_TAIL_PRECALC == 0)

                        %xdefine i ((%1) % 80)  ;; pre-compute for the next iteration

                        %if (i == 0)
                                W_PRECALC_RESET
                        %endif


                        W_PRECALC_00_15
                %endif

        %elif (i < 32)
                W_PRECALC_16_31
        %elif (i < 80)  ;; rounds 32-79
                W_PRECALC_32_79
        %endif
%endmacro

;; Reset the symbolic W window names to their initial xmm assignment.
%macro W_PRECALC_RESET 0
        %xdefine W              W0
        %xdefine W_minus_04     W4
        %xdefine W_minus_08     W8
        %xdefine W_minus_12     W12
        %xdefine W_minus_16     W16
        %xdefine W_minus_20     W20
        %xdefine W_minus_24     W24
        %xdefine W_minus_28     W28
        %xdefine W_minus_32     W
%endmacro

;; Rotate the symbolic W window by one vector (assembly-time renaming only,
;; no instructions are emitted).
%macro W_PRECALC_ROTATE 0
        %xdefine W_minus_32     W_minus_28
        %xdefine W_minus_28     W_minus_24
        %xdefine W_minus_24     W_minus_20
        %xdefine W_minus_20     W_minus_16
        %xdefine W_minus_16     W_minus_12
        %xdefine W_minus_12     W_minus_08
        %xdefine W_minus_08     W_minus_04
        %xdefine W_minus_04     W
        %xdefine W              W_minus_32
%endmacro

%macro W_PRECALC_00_15 0
        ;; message scheduling pre-compute for rounds 0-15:
        ;; load 16 message bytes, byte-swap each dword, add K, store to WK
        %if ((i & 3) == 0)      ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
                movdqu  W_TMP, [BUFFER_PTR + (i * 4)]
        %elif ((i & 3) == 1)
                pshufb  W_TMP, XMM_SHUFB_BSWAP
                movdqa  W, W_TMP
        %elif ((i & 3) == 2)
                paddd   W_TMP, [K_BASE]
        %elif ((i & 3) == 3)
                movdqa  [WK(i&~3)], W_TMP

                W_PRECALC_ROTATE
        %endif
%endmacro

%macro W_PRECALC_16_31 0
        ;; message scheduling pre-compute for rounds 16-31
        ;; calculating last 32 w[i] values in 8 XMM registers
        ;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction
        ;;
        ;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency
        ;;
        %if ((i & 3) == 0)      ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
                movdqa  W, W_minus_12
                palignr W, W_minus_16, 8        ;; w[i-14]
                movdqa  W_TMP, W_minus_04
                psrldq  W_TMP, 4                ;; w[i-3]
                pxor    W, W_minus_08
        %elif ((i & 3) == 1)
                pxor    W_TMP, W_minus_16
                pxor    W, W_TMP
                movdqa  W_TMP2, W
                movdqa  W_TMP, W
                pslldq  W_TMP2, 12
        %elif ((i & 3) == 2)
                psrld   W, 31
                pslld   W_TMP, 1
                por     W_TMP, W
                movdqa  W, W_TMP2
                psrld   W_TMP2, 30
                pslld   W, 2
        %elif ((i & 3) == 3)
                pxor    W_TMP, W
                pxor    W_TMP, W_TMP2
                movdqa  W, W_TMP
                paddd   W_TMP, [K_BASE + K_XMM]
                movdqa  [WK(i&~3)],W_TMP

                W_PRECALC_ROTATE
        %endif
%endmacro

%macro W_PRECALC_32_79 0
        ;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
        ;; instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
        ;; allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
        ;;
        %if ((i & 3) == 0)      ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
                movdqa  W_TMP, W_minus_04
                pxor    W, W_minus_28           ;; W is W_minus_32 before xor
                palignr W_TMP, W_minus_08, 8
        %elif ((i & 3) == 1)
                pxor    W, W_minus_16
                pxor    W, W_TMP
                movdqa  W_TMP, W
        %elif ((i & 3) == 2)
                psrld   W, 30
                pslld   W_TMP, 2
                por     W_TMP, W
        %elif ((i & 3) == 3)
                movdqa  W, W_TMP
                paddd   W_TMP, [K_BASE + K_XMM]
                movdqa  [WK(i&~3)],W_TMP

                W_PRECALC_ROTATE
        %endif
%endmacro

;; RR a, b, c, d, e, round
;; Uses the currently selected F macro; clobbers T1 and T2.
%macro RR 6     ;; RR does two rounds of SHA-1 back to back with W pre-calculation

        ;;     TEMP = A
        ;;     A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i)
        ;;     C = ROTATE_LEFT( B, 30 )
        ;;     D = C
        ;;     E = D
        ;;     B = TEMP

        W_PRECALC (%6 + W_PRECALC_AHEAD)
        F       %2, %3, %4      ;; F returns result in T1
        add     %5, [WK(%6)]
        rol     %2, 30
        mov     T2, %1
        add     %4, [WK(%6 + 1)]
        rol     T2, 5
        add     %5, T1

        W_PRECALC (%6 + W_PRECALC_AHEAD + 1)
        add     T2, %5
        mov     %5, T2
        rol     T2, 5
        add     %4, T2
        F       %1, %2, %3      ;; F returns result in T1
        add     %4, T1
        rol     %1, 30

;; write:  %1, %2
;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; void sha1_opt_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
; invisible arg 2 : IDX : hash on which lane
; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
;               (sse/avx is 4, avx2 is 8, avx512 is 16)
;
; Both invisible arguments arrive packed in r10: the low 8 bits are the lane
; index (IDX), the bits above them are NLANX4, which is used below as the
; byte stride between consecutive digest words of one lane.
;
; Reads NBLK 64-byte blocks starting at the lane's data pointer
; ([MGR + _data_ptr + IDX*8]) and folds them into the five digest words at
; [MGR + 4*IDX + k*NLANX4], k = 0..4. The data pointer stored in MGR is not
; written back by this routine.
;
; Clobbers registers: all general regs (except r15), xmm0-xmm10
;       {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack}
; MGR is restored from N_MGR before returning. rdx is restored as well
; (presumably because the asm callers keep a live value there -- verify
; against the mgr submit/flush routines).
;
mk_global sha1_opt_x1, function, internal
sha1_opt_x1:
        endbranch

        sub     rsp, STACK_SPACE
        mov     [rsp + _GPR_SAVE + 8*0], rbx
        mov     [rsp + _GPR_SAVE + 8*1], rbp
%ifidn __OUTPUT_FORMAT__, win64
        mov     [rsp + _GPR_SAVE + 8*2], rdi
        mov     [rsp + _GPR_SAVE + 8*3], rsi
        ; caller has already stored XMM6~10
%endif
        mov     [rsp + _GPR_SAVE + 8*4], r12
        mov     [rsp + _GPR_SAVE + 8*5], r13
        mov     [rsp + _GPR_SAVE + 8*6], r14
        mov     [rsp + _GPR_SAVE + 8*7], r15
        mov     [rsp + _GPR_SAVE + 8*8], rdx

        ;; Keep a copy of MGR before any path can reach .lend: the epilogue
        ;; unconditionally reloads MGR from N_MGR, so N_MGR must be valid on
        ;; the zero-block early exit as well.
        mov     N_MGR, MGR

        shl     NBLK, 6         ; transform blk amount into bytes
        jz      .lend           ; nothing to hash
        ; detach idx from nlanx4
        mov     IDX, NLANX4
        shr     NLANX4, 8
        and     IDX, 0xff

        ;; from here on this routine owns r8~r11
        ;; Load input pointers
        mov     BUFFER_PTR, [MGR + _data_ptr + IDX*8]
        ;; nblk is used to indicate data end
        add     NBLK, BUFFER_PTR
        mov     BUFFER_END, NBLK

        lea     K_BASE, [K_XMM_AR]
        movdqu  XMM_SHUFB_BSWAP, [bswap_shufb_ctl]

        REGALLOC        ; from here on MGR/NBLK registers are reused as A..E

        lea     TMP, [N_MGR + 4*IDX]
        ;; Initialize digest
        mov     A, [TMP + 0*NLANX4]
        mov     B, [TMP + 1*NLANX4]
        mov     C, [TMP + 2*NLANX4]
        lea     TMP, [TMP + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
        mov     D, [TMP + 1*NLANX4]
        mov     E, [TMP + 2*NLANX4]

        ;; Pre-compute w[0..15]+K of the first block
        %assign i 0
        %rep    W_PRECALC_AHEAD
                W_PRECALC i
                %assign i i+1
        %endrep

        %xdefine F F1

.lloop:
        cmp     BUFFER_PTR, K_BASE      ;; we use K_BASE value as a signal of a last block,
        jne     .lbegin                 ;; it is set below by: cmovae BUFFER_PTR, K_BASE
        jmp     .lend

.lbegin:
        RR A,B,C,D,E,0
        RR D,E,A,B,C,2
        RR B,C,D,E,A,4
        RR E,A,B,C,D,6
        RR C,D,E,A,B,8

        RR A,B,C,D,E,10
        RR D,E,A,B,C,12
        RR B,C,D,E,A,14
        RR E,A,B,C,D,16
        RR C,D,E,A,B,18

        %xdefine F F2

        RR A,B,C,D,E,20
        RR D,E,A,B,C,22
        RR B,C,D,E,A,24
        RR E,A,B,C,D,26
        RR C,D,E,A,B,28

        RR A,B,C,D,E,30
        RR D,E,A,B,C,32
        RR B,C,D,E,A,34
        RR E,A,B,C,D,36
        RR C,D,E,A,B,38

        %xdefine F F3

        RR A,B,C,D,E,40
        RR D,E,A,B,C,42
        RR B,C,D,E,A,44
        RR E,A,B,C,D,46
        RR C,D,E,A,B,48

        RR A,B,C,D,E,50
        RR D,E,A,B,C,52
        RR B,C,D,E,A,54
        RR E,A,B,C,D,56
        RR C,D,E,A,B,58

        %xdefine F F4

        ;; Rounds 64-79 pre-compute w[0..15]+K for the next block, so the
        ;; pointer must be advanced before them. After the last block the
        ;; pointer is redirected to K_BASE, which keeps those loads inside
        ;; the constant table and marks loop termination.
        add     BUFFER_PTR, 64          ;; move to next 64-byte block
        cmp     BUFFER_PTR, BUFFER_END  ;; check if current block is the last one
        cmovae  BUFFER_PTR, K_BASE      ;; smart way to signal the last iteration

        RR A,B,C,D,E,60
        RR D,E,A,B,C,62
        RR B,C,D,E,A,64
        RR E,A,B,C,D,66
        RR C,D,E,A,B,68

        RR A,B,C,D,E,70
        RR D,E,A,B,C,72
        RR B,C,D,E,A,74
        RR E,A,B,C,D,76
        RR C,D,E,A,B,78

        ;; Fold the working state into the stored digest and keep the sums
        ;; in A..E as the starting state of the next block
        lea     TMP, [N_MGR + 4*IDX]
        UPDATE_HASH [TMP + 0*NLANX4],A
        UPDATE_HASH [TMP + 1*NLANX4],B
        UPDATE_HASH [TMP + 2*NLANX4],C
        lea     TMP, [TMP + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
        UPDATE_HASH [TMP + 1*NLANX4],D
        UPDATE_HASH [TMP + 2*NLANX4],E

        jmp     .lloop

.lend:
        mov     MGR, N_MGR      ; MGR's register was reused as round state

        mov     rdx, [rsp + _GPR_SAVE + 8*8]
        mov     r15, [rsp + _GPR_SAVE + 8*7]
        mov     r14, [rsp + _GPR_SAVE + 8*6]
        mov     r13, [rsp + _GPR_SAVE + 8*5]
        mov     r12, [rsp + _GPR_SAVE + 8*4]
%ifidn __OUTPUT_FORMAT__, win64
        mov     rsi, [rsp + _GPR_SAVE + 8*3]
        mov     rdi, [rsp + _GPR_SAVE + 8*2]
%endif
        mov     rbp, [rsp + _GPR_SAVE + 8*1]
        mov     rbx, [rsp + _GPR_SAVE + 8*0]
        add     rsp, STACK_SPACE

        ret


;;----------------------
;; NOTE(review): both tables below are only read by the code in this file;
;; a read-only section may be more appropriate -- confirm per output format.
section .data align=64

;; Round constants, one 16-byte row (4 copies) per group of 20 rounds;
;; rows are selected with the K_XMM byte offset.
align 128
K_XMM_AR:
        DD K1, K1, K1, K1
        DD K2, K2, K2, K2
        DD K3, K3, K3, K3
        DD K4, K4, K4, K4

;; pshufb control mask that reverses the byte order inside each dword
align 16
bswap_shufb_ctl:
        DD 00010203h
        DD 04050607h
        DD 08090a0bh
        DD 0c0d0e0fh