diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
commit | 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch) | |
tree | 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm | |
parent | Initial commit. (diff) | |
download | ceph-upstream.tar.xz ceph-upstream.zip |
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm')
-rw-r--r-- | src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm | 493 |
1 files changed, 493 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm new file mode 100644 index 000000000..c02c88eed --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm @@ -0,0 +1,493 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; Stack must be aligned to 32 bytes before call +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RDX R10 R11 +;; Windows preserves: RAX RBX RCX RBP RSI RDI R8 R9 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RDI R10 R11 +;; Linux preserves: RAX RBX RCX RDX RBP RSI R8 R9 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0 - xmm15 + +%include "include/os.asm" +;%define DO_DBGPRINT +%include "include/dbgprint.asm" + +%include "mb_mgr_datastruct.asm" + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rcx +%define arg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 rdi +%define arg4 rsi +%endif + +%define args arg1 +%define NUM_BLKS arg2 + +; reso = resdq => 16 bytes +struc frame +.ABCD_SAVE reso 1 +.E_SAVE reso 1 +.ABCD_SAVEb reso 1 +.E_SAVEb reso 1 +.align resq 1 +endstruc + +%define INP r10 +%define INPb r11 + +%define ABCD xmm0 +%define E0 xmm1 ; Need two E's b/c they ping pong +%define E1 xmm2 +%define MSG0 xmm3 +%define MSG1 xmm4 +%define MSG2 xmm5 +%define MSG3 xmm6 + +%define ABCDb xmm7 +%define E0b xmm8 ; Need two E's b/c they ping pong +%define E1b xmm9 +%define MSG0b xmm10 +%define MSG1b xmm11 +%define MSG2b xmm12 +%define MSG3b xmm13 + +%define SHUF_MASK xmm14 +%define E_MASK xmm15 + +section .data +default rel +align 64 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x000102030405060708090a0b0c0d0e0f + dq 0x08090a0b0c0d0e0f, 0x0001020304050607 +UPPER_WORD_MASK: ;ddq 0xFFFFFFFF000000000000000000000000 + dq 0x0000000000000000, 0xFFFFFFFF00000000 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha1_ni(SHA1_ARGS *args, UINT32 size_in_blocks) +;; arg1 : pointer to args +;; arg2 : size (in blocks) ;; assumed to be >= 1 + +section .text +MKGLOBAL(sha1_ni,function,internal) +align 32 +sha1_ni: + sub rsp, frame_size + + DBGPRINTL "enter sha1-ni-x2" + + shl NUM_BLKS, 6 ; convert to bytes + jz done_hash + + ;; load input pointers + mov INP, [args + _data_ptr_sha1 + 0*PTR_SZ] + DBGPRINTL64 "jobA: pointer", INP + mov INPb, [args + _data_ptr_sha1 + 1*PTR_SZ] + + add NUM_BLKS, INP ; pointer to end of data block -> loop exit condition + + ;; load initial digest + movdqu ABCD, [args + 0*SHA1NI_DIGEST_ROW_SIZE] + pxor E0, E0 + pinsrd E0, [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3 + pshufd ABCD, ABCD, 0x1B + + DBGPRINTL_XMM "jobA: digest in words[0-3]", ABCD + DBGPRINTL_XMM "jobA: digest in word 4", E0 + + movdqu ABCDb, [args + 1*SHA1NI_DIGEST_ROW_SIZE] + pxor E0b, E0b + pinsrd E0b, [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3 + pshufd ABCDb, ABCDb, 0x1B + + movdqa SHUF_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + movdqa E_MASK, [rel UPPER_WORD_MASK] + + DBGPRINTL "jobA data:" +loop0: + ;; Copy digests + movdqa [rsp + frame.ABCD_SAVE], ABCD + movdqa [rsp + frame.E_SAVE], E0 + movdqa [rsp + frame.ABCD_SAVEb], ABCDb + movdqa [rsp + frame.E_SAVEb], E0b + + ;; Only needed if not using sha1nexte for rounds 0-3 + pand E0, E_MASK + pand E0b, E_MASK + + ;; Needed if using sha1nexte for rounds 0-3 + ;; Need to rotate E right by 30 + ;movdqa E1, E0 + ;psrld E0, 30 + ;pslld E1, 2 + ;pxor E0, E1 + + ;; Rounds 0-3 + movdqu MSG0, [INP + 0*16] + pshufb MSG0, SHUF_MASK + DBGPRINT_XMM MSG0 + ;sha1nexte E0, MSG0 + paddd E0, MSG0 ; instead of sha1nexte + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + movdqu MSG0b, [INPb + 0*16] + pshufb MSG0b, SHUF_MASK + ;sha1nexte E0b, MSG0b + paddd E0b, MSG0b ; instead of sha1nexte + movdqa E1b, ABCDb + sha1rnds4 ABCDb, E0b, 0 + + ;; Rounds 4-7 + movdqu MSG1, [INP + 1*16] + pshufb MSG1, SHUF_MASK + DBGPRINT_XMM MSG1 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG0, MSG1 + movdqu MSG1b, [INPb + 1*16] + pshufb MSG1b, SHUF_MASK + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1rnds4 ABCDb, E1b, 0 + sha1msg1 MSG0b, MSG1b + + ;; Rounds 8-11 + movdqu MSG2, [INP + 2*16] + pshufb MSG2, SHUF_MASK + DBGPRINT_XMM MSG2 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + movdqu MSG2b, [INPb + 2*16] + pshufb MSG2b, SHUF_MASK + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1rnds4 ABCDb, E0b, 0 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ;; Rounds 12-15 + movdqu MSG3, [INP + 3*16] + pshufb MSG3, SHUF_MASK + DBGPRINT_XMM MSG3 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + movdqu MSG3b, [INPb + 3*16] + pshufb MSG3b, SHUF_MASK + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 0 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + + ;; Rounds 16-19 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 0 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ;; Rounds 20-23 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 1 + sha1msg1 MSG0b, MSG1b + pxor MSG3b, MSG1b + + ;; Rounds 24-27 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 1 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ;; Rounds 28-31 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 1 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ;; Rounds 32-35 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 1 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ;; Rounds 36-39 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 1 + sha1msg1 MSG0b, MSG1b + pxor MSG3b, MSG1b + + ;; Rounds 40-43 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 2 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ;; Rounds 44-47 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 2 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ;; Rounds 48-51 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 2 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ;; Rounds 52-55 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 2 + sha1msg1 MSG0b, MSG1b + pxor MSG3b, MSG1b + + ;; Rounds 56-59 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 2 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ;; Rounds 60-63 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 3 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 3 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ;; Rounds 64-67 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 3 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 3 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ;; Rounds 68-71 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 3 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 3 + pxor MSG3b, MSG1b + + ;; Rounds 72-75 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 3 + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 3 + + ;; Rounds 76-79 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 3 + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1rnds4 ABCDb, E1b, 3 + + ;; Need to rotate E left by 30 + movdqa E1, E0 + pslld E0, 30 + psrld E1, 2 + pxor E0, E1 + movdqa E1b, E0b + pslld E0b, 30 + psrld E1b, 2 + pxor E0b, E1b + + paddd ABCD, [rsp + frame.ABCD_SAVE] + paddd E0, [rsp + frame.E_SAVE] + paddd ABCDb, [rsp + frame.ABCD_SAVEb] + paddd E0b, [rsp + frame.E_SAVEb] + + add INP, 64 + add INPb, 64 + cmp INP, NUM_BLKS + jne loop0 + + ;; write out digests + pshufd ABCD, ABCD, 0x1B + movdqu [args + 0*SHA1NI_DIGEST_ROW_SIZE], ABCD + pextrd [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0, 3 + DBGPRINTL_XMM "jobA: digest out words[0-3]", ABCD + DBGPRINTL_XMM "jobA: digest out word 4", E0 + + pshufd ABCDb, ABCDb, 0x1B + movdqu [args + 1*SHA1NI_DIGEST_ROW_SIZE], ABCDb + pextrd [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0b, 3 + + ;; update input pointers + mov [args + _data_ptr_sha1 + 0*PTR_SZ], INP + mov [args + _data_ptr_sha1 + 1*PTR_SZ], INPb + +done_hash: + + ;; Clear stack frame (4*16 bytes) +%ifdef SAFE_DATA + pxor xmm0, xmm0 +%assign i 0 +%rep 4 + movdqa [rsp + i*16], xmm0 +%assign i (i+1) +%endrep +%endif + + add rsp, frame_size + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif |