Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm')
 -rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm | 484
 1 file changed, 484 insertions(+), 0 deletions(-)
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm
new file mode 100644
index 000000000..7b0ddb74e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm
@@ -0,0 +1,484 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define FRAMESZ	64	; stack space to save ABCD/E of both lanes (4 x 16 bytes)
+%define RSPSAVE rax
+
+%define ABCD xmm0
+; two E registers because the rounds ping-pong between them
+%define E0 xmm1
+%define E1 xmm2
+%define MSG0 xmm3
+%define MSG1 xmm4
+%define MSG2 xmm5
+%define MSG3 xmm6
+
+%define ABCDb xmm7
+%define E0b xmm8 ; Need two E's b/c they ping pong
+%define E1b xmm9
+%define MSG0b xmm10
+%define MSG1b xmm11
+%define MSG2b xmm12
+%define MSG3b xmm13
+
+%define SHUF_MASK xmm14
+
+; arg indexing starts from 0 here, while mgr_flush/submit counts from 1
+%define MGR arg0
+
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR	r11	; local variable -- input buffer pointer for lane 0
+%define DPTRb	r12	; local variable -- input buffer pointer for lane 1
+%define TMP r9 ; local variable -- assistant to address digest
+%define TMPb r13 ; local variable -- assistant to address digest
+align 32
+
+; void sha1_ni_x2(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 2 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : which lane to hash
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (the digest is strided by it)
+;                  (max lanes: sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r8-r13, xmm0-xmm14
+;
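+; A hypothetical caller sketch, assuming the elf64 calling convention and
+; an NLANX4 of 16 (both purely illustrative; the names and values below
+; are not part of this file):
+;     mov     rdi, mgr_args           ; arg 0: SHA1_MB_ARGS_Xn *
+;     mov     rsi, 4                  ; arg 1: 4 blocks (256 bytes) per lane
+;     mov     r10, (16 << 8) | 0      ; packed invisible args: NLANX4=16, IDX=0
+;     call    sha1_ni_x2
+;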
+mk_global sha1_ni_x2, function, internal
+sha1_ni_x2:
+ endbranch
+ mov RSPSAVE, rsp
+ sub rsp, FRAMESZ
+	and	rsp, ~0xF	; align rsp down to a 16-byte boundary
+
+	shl	NBLK, 6		; convert the block count into a byte count
+ jz backto_mgr
+
+	; unpack IDX (low byte) and NLANX4 (upper bits) from the packed register
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
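+	;; e.g. with the illustrative r10 = (16 << 8) | 0 from the sketch
+	;; above, this leaves IDX = 0 and NLANX4 = 16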
+
+	lea	TMP,  [MGR + _args_digest]		; lane 0 digest base
+	lea	TMPb, [MGR + _args_digest + 4*1]	; lane 1 digest base
+
+ ;; Initialize digest
+ pinsrd ABCD, [TMP + 0*NLANX4], 3
+ pinsrd ABCD, [TMP + 1*NLANX4], 2
+ pinsrd ABCD, [TMP + 2*NLANX4], 1
+	lea	TMP, [TMP + 2*NLANX4]	; TMP = MGR + _args_digest + 2*NLANX4
+ pinsrd ABCD, [TMP + 1*NLANX4], 0
+ pinsrd E0, [TMP + 2*NLANX4], 3
+ pand E0, [IDX3_WORD_MASK]
+
+ pinsrd ABCDb, [TMPb + 0*NLANX4], 3
+ pinsrd ABCDb, [TMPb + 1*NLANX4], 2
+ pinsrd ABCDb, [TMPb + 2*NLANX4], 1
+	lea	TMPb, [TMPb + 2*NLANX4]	; TMPb = MGR + _args_digest + 4 + 2*NLANX4
+ pinsrd ABCDb, [TMPb + 1*NLANX4], 0
+ pinsrd E0b, [TMPb + 2*NLANX4], 3
+ pand E0b, [IDX3_WORD_MASK]
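+	;; The digest array is transposed: word j of lane i lives at
+	;; _args_digest + j*NLANX4 + 4*i, so each pinsrd above gathers one
+	;; word. ABCD/ABCDb end up with A in the top dword, E0/E0b carry E
+	;; in dword 3 only, and IDX3_WORD_MASK clears the other three dwords.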
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr ]
+ mov DPTRb,[MGR + _data_ptr + 8*1]
+	;; NBLK now holds the end-of-data address for lane 0
+ add NBLK, DPTR
+
+lloop:
+ movdqa [rsp + 0*16], E0
+ movdqa [rsp + 1*16], ABCD
+
+ movdqa [rsp + 2*16], E0b
+ movdqa [rsp + 3*16], ABCDb
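+	;; Save the incoming state of both lanes; after the 80 rounds it is
+	;; added back in (the Davies-Meyer-style feed-forward) to produce
+	;; the next chaining value.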
+
+ ; do rounds 0-3
+ movdqu MSG0, [DPTR + 0*16]
+ pshufb MSG0, SHUF_MASK
+ paddd E0, MSG0
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+
+ movdqu MSG0b, [DPTRb + 0*16]
+ pshufb MSG0b, SHUF_MASK
+ paddd E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1rnds4 ABCDb, E0b, 0
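+	;; A sketch of the SHA-NI round-group semantics (see the Intel SDM
+	;; for the authoritative definitions):
+	;;   sha1rnds4 state, e_wk, imm - performs four SHA-1 rounds on the
+	;;     A,B,C,D state; imm selects the round function and constant
+	;;     (0: Ch/K1, 1: Parity/K2, 2: Maj/K3, 3: Parity/K4).
+	;;   sha1nexte e, msg           - derives the next E term as
+	;;     rol(a, 30) from the saved pre-group state and adds it into
+	;;     the next group's message words.
+	;; Only this first group adds E with a plain paddd; every later
+	;; group obtains its E term via sha1nexte.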
+
+ ; do rounds 4-7
+ movdqu MSG1, [DPTR + 1*16]
+ pshufb MSG1, SHUF_MASK
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG0, MSG1
+
+ movdqu MSG1b, [DPTRb + 1*16]
+ pshufb MSG1b, SHUF_MASK
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1rnds4 ABCDb, E1b, 0
+ sha1msg1 MSG0b, MSG1b
+
+ ; do rounds 8-11
+ movdqu MSG2, [DPTR + 2*16]
+ pshufb MSG2, SHUF_MASK
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ movdqu MSG2b, [DPTRb + 2*16]
+ pshufb MSG2b, SHUF_MASK
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1rnds4 ABCDb, E0b, 0
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 12-15
+ movdqu MSG3, [DPTR + 3*16]
+ pshufb MSG3, SHUF_MASK
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ movdqu MSG3b, [DPTRb + 3*16]
+ pshufb MSG3b, SHUF_MASK
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 0
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
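+	;; Message-schedule pattern: SHA-1 expands
+	;;   W[i] = rol(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
+	;; four words at a time. sha1msg1 xors the W[i-16]/W[i-14] terms,
+	;; the pxor folds in W[i-8], and sha1msg2 applies the W[i-3] term
+	;; and the rotate. From here the same stanza repeats through round
+	;; 79, with the sha1rnds4 immediate stepping 0,1,2,3 every 20 rounds.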
+
+ ; do rounds 16-19
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 0
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 20-23
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ; do rounds 24-27
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 1
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 28-31
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 32-35
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 1
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 36-39
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ; do rounds 40-43
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 44-47
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 2
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 48-51
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 52-55
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 2
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ; do rounds 56-59
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 60-63
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 3
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 3
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 64-67
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 3
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 3
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 68-71
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 3
+ pxor MSG3, MSG1
+
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 3
+ pxor MSG3b, MSG1b
+
+ ; do rounds 72-75
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 3
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 3
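+	;; The schedule winds down: the sha1msg2 above produced W[76..79],
+	;; so the remaining rounds need no more expansion work.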
+
+ ; do rounds 76-79
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1rnds4 ABCDb, E1b, 3
+
+	; Add the working state back into the previously saved hash values
+ sha1nexte E0, [rsp + 0*16]
+ paddd ABCD, [rsp + 1*16]
+
+ sha1nexte E0b, [rsp + 2*16]
+ paddd ABCDb, [rsp + 3*16]
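+	;; sha1nexte doubles as the feed-forward for E: it computes the
+	;; final working E (rol of the top dword of E0/E0b, 30) and adds it
+	;; to the E saved at the top of the loop, while paddd folds the
+	;; saved A,B,C,D back into the new state.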
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ add DPTRb, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + _args_digest]
+ pextrd [TMP + 0*NLANX4], ABCD, 3
+ pextrd [TMP + 1*NLANX4], ABCD, 2
+ pextrd [TMP + 2*NLANX4], ABCD, 1
+	lea	TMP, [TMP + 2*NLANX4]	; TMP = MGR + _args_digest + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], ABCD, 0
+ pextrd [TMP + 2*NLANX4], E0, 3
+
+	lea	TMPb, [MGR + _args_digest + 4*1]
+ pextrd [TMPb + 0*NLANX4], ABCDb, 3
+ pextrd [TMPb + 1*NLANX4], ABCDb, 2
+ pextrd [TMPb + 2*NLANX4], ABCDb, 1
+	lea	TMPb, [TMPb + 2*NLANX4]	; TMPb = MGR + _args_digest + 4 + 2*NLANX4
+ pextrd [TMPb + 1*NLANX4], ABCDb, 0
+ pextrd [TMPb + 2*NLANX4], E0b, 3
+
+ ; update input pointers
+ mov [MGR + _data_ptr], DPTR
+ mov [MGR + _data_ptr + 8*1], DPTRb
+
+backto_mgr:
+;;;;;;;;;;;;;;;;
+;; Postamble
+
+ mov rsp, RSPSAVE
+
+ ret
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x08090a0b0c0d0e0f, 0x0001020304050607
+IDX3_WORD_MASK: dq 0x0000000000000000, 0xFFFFFFFF00000000
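+; PSHUFFLE_SHANI_MASK reverses all 16 bytes of an xmm register: this both
+; byte-swaps each big-endian message word and places word 0 in the top
+; dword, the layout the SHA-NI instructions expect.
+; IDX3_WORD_MASK keeps only dword 3 (bits 127:96), isolating the E word.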
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_ni_x2
+no_sha1_ni_x2:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI