summaryrefslogtreecommitdiffstats
path: root/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm
diff options
context:
space:
mode:
Diffstat (limited to 'src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm')
-rw-r--r--src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm493
1 files changed, 493 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm
new file mode 100644
index 000000000..c02c88eed
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm
@@ -0,0 +1,493 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RDX R10 R11
+;; Windows preserves: RAX RBX RCX RBP RSI RDI R8 R9 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RDI R10 R11
+;; Linux preserves: RAX RBX RCX RDX RBP RSI R8 R9 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0 - xmm15
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+%include "mb_mgr_datastruct.asm"
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi
+%define arg4 rsi
+%endif
+
+%define args arg1
+%define NUM_BLKS arg2
+
+; reso = resdq => 16 bytes
+struc frame
+.ABCD_SAVE reso 1
+.E_SAVE reso 1
+.ABCD_SAVEb reso 1
+.E_SAVEb reso 1
+.align resq 1
+endstruc
+
+%define INP r10
+%define INPb r11
+
+%define ABCD xmm0
+%define E0 xmm1 ; Need two E's b/c they ping pong
+%define E1 xmm2
+%define MSG0 xmm3
+%define MSG1 xmm4
+%define MSG2 xmm5
+%define MSG3 xmm6
+
+%define ABCDb xmm7
+%define E0b xmm8 ; Need two E's b/c they ping pong
+%define E1b xmm9
+%define MSG0b xmm10
+%define MSG1b xmm11
+%define MSG2b xmm12
+%define MSG3b xmm13
+
+%define SHUF_MASK xmm14
+%define E_MASK xmm15
+
+section .data
+default rel
+align 64
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x000102030405060708090a0b0c0d0e0f
+ dq 0x08090a0b0c0d0e0f, 0x0001020304050607
+UPPER_WORD_MASK: ;ddq 0xFFFFFFFF000000000000000000000000
+ dq 0x0000000000000000, 0xFFFFFFFF00000000
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha1_ni(SHA1_ARGS *args, UINT32 size_in_blocks)
+;; arg1 : pointer to args
+;; arg2 : size (in blocks) ;; assumed to be >= 1
+
+section .text
+MKGLOBAL(sha1_ni,function,internal)
+align 32
+sha1_ni:
+ sub rsp, frame_size
+
+ DBGPRINTL "enter sha1-ni-x2"
+
+ shl NUM_BLKS, 6 ; convert to bytes
+ jz done_hash
+
+ ;; load input pointers
+ mov INP, [args + _data_ptr_sha1 + 0*PTR_SZ]
+ DBGPRINTL64 "jobA: pointer", INP
+ mov INPb, [args + _data_ptr_sha1 + 1*PTR_SZ]
+
+ add NUM_BLKS, INP ; pointer to end of data block -> loop exit condition
+
+ ;; load initial digest
+ movdqu ABCD, [args + 0*SHA1NI_DIGEST_ROW_SIZE]
+ pxor E0, E0
+ pinsrd E0, [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3
+ pshufd ABCD, ABCD, 0x1B
+
+ DBGPRINTL_XMM "jobA: digest in words[0-3]", ABCD
+ DBGPRINTL_XMM "jobA: digest in word 4", E0
+
+ movdqu ABCDb, [args + 1*SHA1NI_DIGEST_ROW_SIZE]
+ pxor E0b, E0b
+ pinsrd E0b, [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3
+ pshufd ABCDb, ABCDb, 0x1B
+
+ movdqa SHUF_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ movdqa E_MASK, [rel UPPER_WORD_MASK]
+
+ DBGPRINTL "jobA data:"
+loop0:
+ ;; Copy digests
+ movdqa [rsp + frame.ABCD_SAVE], ABCD
+ movdqa [rsp + frame.E_SAVE], E0
+ movdqa [rsp + frame.ABCD_SAVEb], ABCDb
+ movdqa [rsp + frame.E_SAVEb], E0b
+
+ ;; Only needed if not using sha1nexte for rounds 0-3
+ pand E0, E_MASK
+ pand E0b, E_MASK
+
+ ;; Needed if using sha1nexte for rounds 0-3
+ ;; Need to rotate E right by 30
+ ;movdqa E1, E0
+ ;psrld E0, 30
+ ;pslld E1, 2
+ ;pxor E0, E1
+
+ ;; Rounds 0-3
+ movdqu MSG0, [INP + 0*16]
+ pshufb MSG0, SHUF_MASK
+ DBGPRINT_XMM MSG0
+ ;sha1nexte E0, MSG0
+ paddd E0, MSG0 ; instead of sha1nexte
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+ movdqu MSG0b, [INPb + 0*16]
+ pshufb MSG0b, SHUF_MASK
+ ;sha1nexte E0b, MSG0b
+ paddd E0b, MSG0b ; instead of sha1nexte
+ movdqa E1b, ABCDb
+ sha1rnds4 ABCDb, E0b, 0
+
+ ;; Rounds 4-7
+ movdqu MSG1, [INP + 1*16]
+ pshufb MSG1, SHUF_MASK
+ DBGPRINT_XMM MSG1
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG0, MSG1
+ movdqu MSG1b, [INPb + 1*16]
+ pshufb MSG1b, SHUF_MASK
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1rnds4 ABCDb, E1b, 0
+ sha1msg1 MSG0b, MSG1b
+
+ ;; Rounds 8-11
+ movdqu MSG2, [INP + 2*16]
+ pshufb MSG2, SHUF_MASK
+ DBGPRINT_XMM MSG2
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+ movdqu MSG2b, [INPb + 2*16]
+ pshufb MSG2b, SHUF_MASK
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1rnds4 ABCDb, E0b, 0
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ;; Rounds 12-15
+ movdqu MSG3, [INP + 3*16]
+ pshufb MSG3, SHUF_MASK
+ DBGPRINT_XMM MSG3
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+ movdqu MSG3b, [INPb + 3*16]
+ pshufb MSG3b, SHUF_MASK
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 0
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+
+ ;; Rounds 16-19
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 0
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ;; Rounds 20-23
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ;; Rounds 24-27
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 1
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ;; Rounds 28-31
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ;; Rounds 32-35
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 1
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ;; Rounds 36-39
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ;; Rounds 40-43
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ;; Rounds 44-47
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 2
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ;; Rounds 48-51
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ;; Rounds 52-55
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 2
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ;; Rounds 56-59
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ;; Rounds 60-63
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 3
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 3
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ;; Rounds 64-67
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 3
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 3
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ;; Rounds 68-71
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 3
+ pxor MSG3, MSG1
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 3
+ pxor MSG3b, MSG1b
+
+ ;; Rounds 72-75
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 3
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 3
+
+ ;; Rounds 76-79
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 3
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1rnds4 ABCDb, E1b, 3
+
+ ;; Need to rotate E left by 30
+ movdqa E1, E0
+ pslld E0, 30
+ psrld E1, 2
+ pxor E0, E1
+ movdqa E1b, E0b
+ pslld E0b, 30
+ psrld E1b, 2
+ pxor E0b, E1b
+
+ paddd ABCD, [rsp + frame.ABCD_SAVE]
+ paddd E0, [rsp + frame.E_SAVE]
+ paddd ABCDb, [rsp + frame.ABCD_SAVEb]
+ paddd E0b, [rsp + frame.E_SAVEb]
+
+ add INP, 64
+ add INPb, 64
+ cmp INP, NUM_BLKS
+ jne loop0
+
+ ;; write out digests
+ pshufd ABCD, ABCD, 0x1B
+ movdqu [args + 0*SHA1NI_DIGEST_ROW_SIZE], ABCD
+ pextrd [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0, 3
+ DBGPRINTL_XMM "jobA: digest out words[0-3]", ABCD
+ DBGPRINTL_XMM "jobA: digest out word 4", E0
+
+ pshufd ABCDb, ABCDb, 0x1B
+ movdqu [args + 1*SHA1NI_DIGEST_ROW_SIZE], ABCDb
+ pextrd [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0b, 3
+
+ ;; update input pointers
+ mov [args + _data_ptr_sha1 + 0*PTR_SZ], INP
+ mov [args + _data_ptr_sha1 + 1*PTR_SZ], INPb
+
+done_hash:
+
+ ;; Clear stack frame (4*16 bytes)
+%ifdef SAFE_DATA
+ pxor xmm0, xmm0
+%assign i 0
+%rep 4
+ movdqa [rsp + i*16], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+ add rsp, frame_size
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif