diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm | 318 |
1 files changed, 318 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm new file mode 100644 index 000000000..86d09e303 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm @@ -0,0 +1,318 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_SHANI + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi +%else + ; Windows + %define arg0 rcx + %define arg1 rdx +%endif + +;; FRAMESZ plus pushes must be an odd multiple of 8 +%define FRAMESZ 32 ; space for ABCDE +%define RSPSAVE rax + +%define ABCD xmm0 +; two E's b/c for ping-pong +%define E0 xmm1 +%define E1 xmm2 +%define MSG0 xmm3 +%define MSG1 xmm4 +%define MSG2 xmm5 +%define MSG3 xmm6 +%define SHUF_MASK xmm7 + +; arg index is start from 0 while mgr_flush/submit is from 1 +%define MGR arg0 +%define NBLK arg1 +%define NLANX4 r10 ; consistent with caller +%define IDX r8 ; local variable -- consistent with caller +%define DPTR r11 ; local variable -- input buffer pointer +%define TMP r9 ; local variable -- assistant to address digest +;%define TMP2 r8 ; local variable -- assistant to address digest +align 32 + +; void sha1_ni_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks); +; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used) +; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 +; invisibile arg 2 : IDX : hash on which lane +; invisibile arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it) +; (sse/avx is 4, avx2 is 8, avx512 is 16) +; +; Clobbers registers: rax, r9~r11, xmm0-xmm7 +; +mk_global sha1_ni_x1, function, internal +sha1_ni_x1: + endbranch + mov RSPSAVE, rsp + sub rsp, FRAMESZ + and rsp, ~0xF ; Align 16Bytes downward + + shl NBLK, 6 ; transform blk amount into bytes + jz backto_mgr + + ; detach idx from nlanx4 + mov IDX, NLANX4 + shr NLANX4, 8 + and IDX, 0xff + + lea TMP, [MGR + 4*IDX] + ;; Initialize digest + pinsrd ABCD, [TMP + 0*NLANX4], 3 + pinsrd ABCD, [TMP + 1*NLANX4], 2 + pinsrd ABCD, [TMP + 2*NLANX4], 1 + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pinsrd ABCD, [TMP + 1*NLANX4], 0 + pinsrd E0, [TMP + 2*NLANX4], 3 + pand E0, [IDX3_WORD_MASK] + + movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK] + + ;; Load input pointers + mov DPTR, [MGR + _data_ptr + IDX*8] + ;; nblk is used to indicate data end + add NBLK, DPTR + +lloop: + ; Save hash values for addition after rounds + movdqa [rsp + 0*16], E0 + movdqa [rsp + 1*16], ABCD + + ; do rounds 0-3 + movdqu MSG0, [DPTR + 0*16] + pshufb MSG0, SHUF_MASK + paddd E0, MSG0 + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + + ; do rounds 4-7 + movdqu MSG1, [DPTR + 1*16] + pshufb MSG1, SHUF_MASK + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG0, MSG1 + + ; do rounds 8-11 + movdqu MSG2, [DPTR + 2*16] + pshufb MSG2, SHUF_MASK + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + ; do rounds 12-15 + movdqu MSG3, [DPTR + 3*16] + pshufb MSG3, SHUF_MASK + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + ; do rounds 16-19 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + ; do rounds 20-23 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + + ; do rounds 24-27 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + ; do rounds 28-31 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + ; do rounds 32-35 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + ; do rounds 36-39 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + + ; do rounds 40-43 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + ; do rounds 44-47 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + ; do rounds 48-51 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + ; do rounds 52-55 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + + ; do rounds 56-59 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + + ; do rounds 60-63 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 3 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + + ; do rounds 64-67 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 3 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + + ; do rounds 68-71 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 3 + pxor MSG3, MSG1 + + ; do rounds 72-75 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 3 + + ; do rounds 76-79 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 3 + + ; Add current hash values with previously saved + sha1nexte E0, [rsp + 0*16] + paddd ABCD, [rsp + 1*16] + + ; Increment data pointer and loop if more to process + add DPTR, 64 + cmp DPTR, NBLK + jne lloop + + ; write out digests + lea TMP, [MGR + 4*IDX] + pextrd [TMP + 0*NLANX4], ABCD, 3 + pextrd [TMP + 1*NLANX4], ABCD, 2 + pextrd [TMP + 2*NLANX4], ABCD, 1 + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pextrd [TMP + 1*NLANX4], ABCD, 0 + pextrd [TMP + 2*NLANX4], E0, 3 + + ; update input pointers + mov [MGR + _data_ptr + IDX*8], DPTR + +backto_mgr: + ;;;;;;;;;;;;;;;; + ;; Postamble + + mov rsp, RSPSAVE + + ret + + +section .data align=16 +PSHUFFLE_SHANI_MASK: dq 0x08090a0b0c0d0e0f, 0x0001020304050607 +IDX3_WORD_MASK: dq 0x0000000000000000, 0xFFFFFFFF00000000 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha1_ni_x1 +no_sha1_ni_x1: +%endif +%endif ; HAVE_AS_KNOWS_SHANI |