summaryrefslogtreecommitdiffstats
path: root/src/crypto/isa-l/isa-l_crypto/sha1_mb
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/crypto/isa-l/isa-l_crypto/sha1_mb
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/sha1_mb')
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am130
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S294
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S269
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c93
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S192
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c217
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c208
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S194
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S253
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c265
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c264
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c271
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c281
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c325
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c251
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c259
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm67
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c146
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm74
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm247
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm273
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm271
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm278
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm256
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm246
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm248
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm246
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm290
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c159
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c202
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c297
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c233
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c128
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c132
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm563
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm416
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm413
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm518
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c112
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm131
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm318
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm484
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm485
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c220
53 files changed, 12431 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am
new file mode 100644
index 000000000..3f3c589ad
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am
@@ -0,0 +1,130 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += sha1_mb/sha1_ctx_sse.c \
+ sha1_mb/sha1_ctx_avx.c \
+ sha1_mb/sha1_ctx_avx2.c \
+ sha1_mb/sha1_ctx_base.c
+
+lsrc_x86_64 += sha1_mb/sha1_mb_mgr_init_sse.c \
+ sha1_mb/sha1_mb_mgr_init_avx2.c
+
+lsrc_x86_64 += sha1_mb/sha1_mb_mgr_submit_sse.asm \
+ sha1_mb/sha1_mb_mgr_submit_avx.asm \
+ sha1_mb/sha1_mb_mgr_submit_avx2.asm \
+ sha1_mb/sha1_mb_mgr_flush_sse.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx2.asm \
+ sha1_mb/sha1_mb_x4_sse.asm \
+ sha1_mb/sha1_mb_x4_avx.asm \
+ sha1_mb/sha1_mb_x8_avx2.asm \
+ sha1_mb/sha1_multibinary.asm
+
+lsrc_x86_64 += sha1_mb/sha1_ctx_avx512.c \
+ sha1_mb/sha1_mb_mgr_init_avx512.c \
+ sha1_mb/sha1_mb_mgr_submit_avx512.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx512.asm \
+ sha1_mb/sha1_mb_x16_avx512.asm
+
+lsrc_x86_64 += sha1_mb/sha1_opt_x1.asm
+
+lsrc_x86_64 += sha1_mb/sha1_ni_x1.asm \
+ sha1_mb/sha1_ni_x2.asm \
+ sha1_mb/sha1_ctx_sse_ni.c \
+ sha1_mb/sha1_ctx_avx512_ni.c \
+ sha1_mb/sha1_mb_mgr_submit_sse_ni.asm \
+ sha1_mb/sha1_mb_mgr_flush_sse_ni.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += sha1_mb/sha1_ctx_base.c \
+ sha1_mb/sha1_ref.c \
+ sha1_mb/aarch64/sha1_mb_multibinary.S \
+ sha1_mb/aarch64/sha1_ctx_ce.c \
+ sha1_mb/aarch64/sha1_mb_x1_ce.S \
+ sha1_mb/aarch64/sha1_mb_x2_ce.S \
+ sha1_mb/aarch64/sha1_mb_mgr_ce.c \
+ sha1_mb/aarch64/sha1_ctx_asimd.c \
+ sha1_mb/aarch64/sha1_aarch64_x1.S \
+ sha1_mb/aarch64/sha1_mb_asimd_x4.S \
+ sha1_mb/aarch64/sha1_mb_mgr_asimd.c \
+ sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c
+
+
+
+lsrc_base_aliases += sha1_mb/sha1_ctx_base_aliases.c \
+ sha1_mb/sha1_ctx_base.c \
+ sha1_mb/sha1_ref.c
+
+src_include += -I $(srcdir)/sha1_mb
+
+extern_hdrs += include/sha1_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ include/multibinary.asm \
+ sha1_mb/sha1_job.asm \
+ sha1_mb/sha1_mb_mgr_datastruct.asm \
+ include/reg_sizes.asm \
+ sha1_mb/sha1_ref.c \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h
+
+check_tests += sha1_mb/sha1_mb_test \
+ sha1_mb/sha1_mb_rand_test \
+ sha1_mb/sha1_mb_rand_update_test \
+ sha1_mb/sha1_mb_flush_test
+
+unit_tests += sha1_mb/sha1_mb_rand_ssl_test
+
+perf_tests += sha1_mb/sha1_mb_vs_ossl_perf \
+ sha1_mb/sha1_mb_vs_ossl_shortage_perf
+
+examples += sha1_mb/sha1_multi_buffer_example
+
+
+sha1_mb_rand_test: sha1_ref.o
+sha1_mb_sha1_mb_rand_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la
+
+sha1_mb_rand_update_test: sha1_ref.o
+sha1_mb_sha1_mb_rand_update_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la
+
+sha1_mb_flush_test: sha1_ref.o
+sha1_mb_sha1_mb_flush_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la
+
+sha1_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha1_mb_sha1_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha1_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha1_mb_sha1_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
+sha1_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto
+sha1_mb_sha1_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S
new file mode 100644
index 000000000..55d6f932f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S
@@ -0,0 +1,294 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+ input_data .req x0
+ num_blocks .req w1
+ digest .req x2
+
+ // x2 is reused intentionally between digest/tmp
+ // due to running out of registers
+ TMP .req x2
+ TMPW .req w2
+ sha1key_adr .req x3
+ WK .req w3
+ WF .req w4
+ WA .req w5
+ WB .req w6
+ WC .req w7
+ WD .req w8
+ WE .req w9
+ WORD0 .req w10
+ WORD1 .req w11
+ WORD2 .req w12
+ WORD3 .req w13
+ WORD4 .req w14
+ WORD5 .req w15
+ WORD6 .req w16
+ WORD7 .req w17
+ WORD8 .req w18
+ WORD9 .req w19
+ WORD10 .req w20
+ WORD11 .req w21
+ WORD12 .req w22
+ WORD13 .req w23
+ WORD14 .req w24
+ WORD15 .req w25
+ AA .req w26
+ BB .req w27
+ CC .req w28
+ DD .req w29
+ EE .req w30
+
+ TT .req w0
+
+.macro save_stack
+ stp x16,x17,[sp, -128]!
+ stp x18,x19,[sp, 16]
+ stp x20,x21,[sp, 32]
+ stp x22,x23,[sp, 48]
+ stp x24,x25,[sp, 64]
+ stp x26,x27,[sp, 80]
+ stp x28,x29,[sp, 96]
+ str x30,[sp, 112]
+ // have to reuse x2, which is digest address
+ str x2,[sp, 120]
+.endm
+
+.macro restore_stack
+ ldp x18,x19,[sp, 16]
+ ldp x20,x21,[sp, 32]
+ ldp x22,x23,[sp, 48]
+ ldp x24,x25,[sp, 64]
+ ldp x26,x27,[sp, 80]
+ ldp x28,x29,[sp, 96]
+ ldr x30,[sp, 112]
+ ldr x2,[sp, 120]
+ ldp x16,x17,[sp],128
+.endm
+// macro F = (D ^ (B & (C ^ D)))
+.macro FUNC_F0
+ eor WF, WC, WD
+ and WF, WB, WF
+ eor WF, WD, WF
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F1
+ eor WF, WB, WC
+ eor WF, WF, WD
+.endm
+
+// F = ((B & C) | (B & D) | (C & D))
+.macro FUNC_F2
+ and TMPW, WB, WC
+ and WF, WB, WD
+ orr WF, WF, TMPW
+ and TMPW, WC, WD
+ orr WF, WF, TMPW
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F3
+ FUNC_F1
+.endm
+
+.altmacro
+.macro load_next_word windex
+ .if \windex < 16
+ load_word_at \windex
+ .endif
+.endm
+
+.macro SHA1_STEP_00_15 windex:req
+ rev WORD\windex\(),WORD\windex\()
+ next_word=\windex+1
+ load_next_word %next_word
+
+ ror TMPW,WA,#32-5
+ add WE,WE,TMPW
+ add WE,WE,WK
+ FUNC_F0
+ ror WB,WB,#32-30
+ add WE,WE,WORD\windex\()
+ add WE,WE,WF
+.endm
+
+.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req
+ eor TMPW,\reg_14,\reg_8
+ eor \reg_16,\reg_16,\reg_3
+ eor \reg_16,\reg_16,TMPW
+
+ ror TMPW,WA,#32-5
+ ror \reg_16,\reg_16, #32 - 1
+
+ add WE,WE,TMPW
+ add WE,WE,WK
+ \func_f
+ ror WB,WB,#32-30
+ add WE,WE,\reg_16
+ add WE,WE,WF
+.endm
+
+.macro SWAP_STATES
+ .unreq TT
+ TT .req WE
+ .unreq WE
+ WE .req WD
+ .unreq WD
+ WD .req WC
+ .unreq WC
+ WC .req WB
+ .unreq WB
+ WB .req WA
+ .unreq WA
+ WA .req TT
+.endm
+
+.altmacro
+.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req
+ SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\()
+.endm
+
+.macro exec_step windex:req
+ .if \windex <= 15
+ SHA1_STEP_00_15 windex
+ .else
+ idx14=((\windex - 14) & 15)
+ idx8=((\windex - 8) & 15)
+ idx3=((\windex - 3) & 15)
+ idx16=(\windex & 15)
+ .if \windex <= 19
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 20 && \windex <= 39
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 40 && \windex <= 59
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 60 && \windex <= 79
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .endif
+
+ SWAP_STATES
+.endm
+
+.macro exec_steps idx:req,more:vararg
+ exec_step \idx
+ .ifnb \more
+ exec_steps \more
+ .endif
+.endm
+
+.altmacro
+
+.macro load_two_words_at idx0:req,idx1:req
+ ldp WORD\idx0\(),WORD\idx1\(),[input_data],8
+.endm
+
+.macro load_word_at idx:req
+ .if \idx % 2 == 0
+ idx1=\idx+1
+ load_two_words_at \idx,%idx1
+ .endif
+.endm
+
+/*
+ * void sha1_aarch64_x1(uint32_t *input_data, int num_blocks, uint32_t digest[5])
+ */
+ .global sha1_aarch64_x1
+ .type sha1_aarch64_x1, %function
+sha1_aarch64_x1:
+ cmp num_blocks, #0
+ beq .return
+
+ ldp WA,WB,[digest]
+ ldp WC,WD,[digest,8]
+ ldr WE,[digest,16]
+ save_stack
+
+.block_loop:
+ mov AA, WA
+ mov BB, WB
+ mov CC, WC
+ mov DD, WD
+ mov EE, WE
+
+ load_word_at 0
+
+ adr sha1key_adr, KEY_0
+ ldr WK, [sha1key_adr]
+ exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+
+ // 20 ~ 39
+ adr sha1key_adr, KEY_1
+ ldr WK, [sha1key_adr]
+ exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
+
+ // 40 ~ 59
+ adr sha1key_adr, KEY_2
+ ldr WK, [sha1key_adr]
+ exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
+
+ // 60 ~ 79
+ adr sha1key_adr, KEY_3
+ ldr WK, [sha1key_adr]
+ exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79
+
+ add WA, AA, WA
+ add WB, BB, WB
+ add WC, CC, WC
+ add WD, DD, WD
+ add WE, EE, WE
+
+ subs num_blocks, num_blocks, 1
+ bne .block_loop
+
+ restore_stack
+ stp WA,WB,[digest]
+ stp WC,WD,[digest,8]
+ str WE,[digest,16]
+
+.return:
+ ret
+
+ .size sha1_aarch64_x1, .-sha1_aarch64_x1
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+KEY_0:
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S
new file mode 100644
index 000000000..c8b8dd982
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S
@@ -0,0 +1,269 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+// macro F = (D ^ (B & (C ^ D)))
+.macro FUNC_F0
+ eor VF.16b, VC.16b, VD.16b
+ and VF.16b, VB.16b, VF.16b
+ eor VF.16b, VD.16b, VF.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F1
+ eor VF.16b, VB.16b, VC.16b
+ eor VF.16b, VF.16b, VD.16b
+.endm
+
+// F = ((B & C) | (B & D) | (C & D))
+.macro FUNC_F2
+ and vT0.16b, VB.16b, VC.16b
+ and vT1.16b, VB.16b, VD.16b
+ and vT2.16b, VC.16b, VD.16b
+ orr VF.16b, vT0.16b, vT1.16b
+ orr VF.16b, VF.16b, vT2.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F3
+ FUNC_F1
+.endm
+
+.altmacro
+.macro load_next_word windex
+ .if \windex < 16
+ load_x4_word \windex
+ .endif
+.endm
+
+// FUNC_F0 is merged into STEP_00_15 for efficiency
+.macro SHA1_STEP_00_15_F0 windex:req
+ rev32 WORD\windex\().16b,WORD\windex\().16b
+ next_word=\windex+1
+ load_next_word %next_word
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, VA.4s, 32 - 5
+ add VE.4s, VE.4s, VK.4s
+ sli VT.4s, VA.4s, 5
+ eor VF.16b, VC.16b, VD.16b
+ add VE.4s, VE.4s, WORD\windex\().4s
+ and VF.16b, VB.16b, VF.16b
+ add VE.4s, VE.4s, VT.4s
+ eor VF.16b, VD.16b, VF.16b
+ ushr VT.4s, VB.4s, 32 - 30
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req
+ eor vT0.16b,\reg_3\().16b,\reg_8\().16b
+ eor VT.16b,\reg_14\().16b,\reg_16\().16b
+ eor vT0.16b,vT0.16b,VT.16b
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, vT0.4s, 32 - 1
+ add VE.4s, VE.4s, VK.4s
+ ushr vT1.4s, VA.4s, 32 - 5
+ sli VT.4s, vT0.4s, 1
+ add VE.4s, VE.4s, VT.4s
+ sli vT1.4s, VA.4s, 5
+ mov \reg_16\().16b,VT.16b
+ add VE.4s, VE.4s, vT1.4s
+ ushr VT.4s, VB.4s, 32 - 30
+ \func_f
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+ VA .req v0
+ VB .req v1
+ VC .req v2
+ VD .req v3
+ VE .req v4
+ VT .req v5
+ VF .req v6
+ VK .req v7
+ WORD0 .req v8
+ WORD1 .req v9
+ WORD2 .req v10
+ WORD3 .req v11
+ WORD4 .req v12
+ WORD5 .req v13
+ WORD6 .req v14
+ WORD7 .req v15
+ WORD8 .req v16
+ WORD9 .req v17
+ WORD10 .req v18
+ WORD11 .req v19
+ WORD12 .req v20
+ WORD13 .req v21
+ WORD14 .req v22
+ WORD15 .req v23
+ vT0 .req v24
+ vT1 .req v25
+ vT2 .req v26
+ vAA .req v27
+ vBB .req v28
+ vCC .req v29
+ vDD .req v30
+ vEE .req v31
+ TT .req v0
+ sha1key_adr .req x15
+
+.macro SWAP_STATES
+ // shifted VB is held in VT after each step
+ .unreq TT
+ TT .req VE
+ .unreq VE
+ VE .req VD
+ .unreq VD
+ VD .req VC
+ .unreq VC
+ VC .req VT
+ .unreq VT
+ VT .req VB
+ .unreq VB
+ VB .req VA
+ .unreq VA
+ VA .req TT
+.endm
+
+.altmacro
+.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req
+ SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\()
+.endm
+
+.macro exec_step windex:req
+ .if \windex <= 15
+ SHA1_STEP_00_15_F0 windex
+ .else
+ idx14=((\windex - 14) & 15)
+ idx8=((\windex - 8) & 15)
+ idx3=((\windex - 3) & 15)
+ idx16=(\windex & 15)
+ .if \windex <= 19
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 20 && \windex <= 39
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 40 && \windex <= 59
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 60 && \windex <= 79
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .endif
+
+ SWAP_STATES
+
+ .if \windex == 79
+ // after 80 steps, the registers ABCDET has shifted from
+ // its orignal order of 012345 to 341520
+ // have to swap back for both compile- and run-time correctness
+ mov v0.16b,v3.16b
+ .unreq VA
+ VA .req v0
+
+ mov vT0.16b,v2.16b
+ mov v2.16b,v1.16b
+ mov v1.16b,v4.16b
+ .unreq VB
+ VB .req v1
+ .unreq VC
+ VC .req v2
+
+ mov v3.16b,v5.16b
+ .unreq VD
+ VD .req v3
+
+ mov v4.16b,vT0.16b
+ .unreq VE
+ VE .req v4
+
+ .unreq VT
+ VT .req v5
+ .endif
+.endm
+
+.macro exec_steps idx:req,more:vararg
+ exec_step \idx
+ .ifnb \more
+ exec_steps \more
+ .endif
+.endm
+
+.macro sha1_single
+ load_x4_word 0
+
+ mov vAA.16B, VA.16B
+ mov vBB.16B, VB.16B
+ mov vCC.16B, VC.16B
+ mov vDD.16B, VD.16B
+ mov vEE.16B, VE.16B
+
+ adr sha1key_adr, KEY_0
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+
+ // 20 ~ 39
+ adr sha1key_adr, KEY_1
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
+
+ // 40 ~ 59
+ adr sha1key_adr, KEY_2
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
+
+ // 60 ~ 79
+ adr sha1key_adr, KEY_3
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79
+
+ add VA.4s, vAA.4s, VA.4s
+ add VB.4s, vBB.4s, VB.4s
+ add VC.4s, vCC.4s, VC.4s
+ add VD.4s, vDD.4s, VD.4s
+ add VE.4s, vEE.4s, VE.4s
+.endm
+
+.macro sha1_asimd_save_stack
+ stp d8,d9,[sp, -64]!
+ stp d10,d11,[sp, 16]
+ stp d12,d13,[sp, 32]
+ stp d14,d15,[sp, 48]
+.endm
+
+.macro sha1_asimd_restore_stack
+ ldp d10,d11,[sp, 16]
+ ldp d12,d13,[sp, 32]
+ ldp d14,d15,[sp, 48]
+ ldp d8,d9,[sp],64
+.endm
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c
new file mode 100644
index 000000000..9a9952ff6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+void sha1_mb_mgr_init_asimd(SHA1_MB_JOB_MGR * state);
+SHA1_JOB *sha1_mb_mgr_submit_asimd(SHA1_MB_JOB_MGR * state, SHA1_JOB * job);
+SHA1_JOB *sha1_mb_mgr_flush_asimd(SHA1_MB_JOB_MGR * state);
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_asimd(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_asimd(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_asimd(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_asimd(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_asimd(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_asimd_slver_02020142;
+struct slver sha1_ctx_mgr_init_asimd_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_submit_asimd_slver_02020143;
+struct slver sha1_ctx_mgr_submit_asimd_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_flush_asimd_slver_02020144;
+struct slver sha1_ctx_mgr_flush_asimd_slver = { 0x0144, 0x02, 0x02 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c
new file mode 100644
index 000000000..e40a344ff
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+void sha1_mb_mgr_init_ce(SHA1_MB_JOB_MGR * state);
+SHA1_JOB *sha1_mb_mgr_submit_ce(SHA1_MB_JOB_MGR * state, SHA1_JOB * job);
+SHA1_JOB *sha1_mb_mgr_flush_ce(SHA1_MB_JOB_MGR * state);
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_ce(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_ce(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_ce(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_ce(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_ce(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_ce_slver_02020142;
+struct slver sha1_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_submit_ce_slver_02020143;
+struct slver sha1_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_flush_ce_slver_02020144;
+struct slver sha1_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..0942c1a95
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c
@@ -0,0 +1,93 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_submit)
+{
+
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(sha1_ctx_mgr_submit_ce);
+
+ if (auxval & HWCAP_ASIMD) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through
+ return PROVIDER_INFO(sha1_ctx_mgr_submit_asimd);
+ default:
+ break;
+ }
+ }
+
+ return PROVIDER_BASIC(sha1_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(sha1_ctx_mgr_init_ce);
+
+ if (auxval & HWCAP_ASIMD) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through
+ return PROVIDER_INFO(sha1_ctx_mgr_init_asimd);
+ default:
+ break;
+ }
+ }
+
+ return PROVIDER_BASIC(sha1_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(sha1_ctx_mgr_flush_ce);
+
+ if (auxval & HWCAP_ASIMD) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through
+ return PROVIDER_INFO(sha1_ctx_mgr_flush_asimd);
+ default:
+ break;
+ }
+ }
+
+ return PROVIDER_BASIC(sha1_ctx_mgr_flush);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S
new file mode 100644
index 000000000..012b15c14
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S
@@ -0,0 +1,192 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+#include "sha1_asimd_common.S"
+
+.macro internal_load windex
+ // load 64-bytes from each address to maximize usage of cache line
+ .if \windex == 0
+ mov tmp,dataptr
+ ld1 {WORD0.4s},[data0],16
+ ld1 {WORD4.4s},[data0],16
+ ld1 {WORD8.4s},[data0],16
+ ld1 {WORD12.4s},[data0],16
+
+ ld1 {WORD1.4s},[data1],16
+ ld1 {WORD5.4s},[data1],16
+ ld1 {WORD9.4s},[data1],16
+ ld1 {WORD13.4s},[data1],16
+
+ ld1 {WORD2.4s},[data2],16
+ ld1 {WORD6.4s},[data2],16
+ ld1 {WORD10.4s},[data2],16
+ ld1 {WORD14.4s},[data2],16
+
+ ld1 {WORD3.4s},[data3],16
+ ld1 {WORD7.4s},[data3],16
+ ld1 {WORD11.4s},[data3],16
+ ld1 {WORD15.4s},[data3],16
+
+ st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[0],[tmp],16
+ st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[1],[tmp],16
+ st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[2],[tmp],16
+ st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[3],[tmp],16
+ .endif
+
+ .if \windex == 4
+ mov tmp,dataptr
+ st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[0],[tmp],16
+ st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[1],[tmp],16
+ st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[2],[tmp],16
+ st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[3],[tmp],16
+ .endif
+
+ .if \windex == 8
+ mov tmp,dataptr
+ st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[0],[tmp],16
+ st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[1],[tmp],16
+ st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[2],[tmp],16
+ st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[3],[tmp],16
+ .endif
+
+ .if \windex == 12
+ mov tmp,dataptr
+ st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[0],[tmp],16
+ st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[1],[tmp],16
+ st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[2],[tmp],16
+ st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[3],[tmp],16
+ .endif
+.endm
+
+.macro load_x4_word idx:req
+ internal_load \idx
+ ld1 {WORD\idx\().16b},[dataptr],16
+.endm
+
+/*
+ * void sha1_mb_asimd_x4(SHA1_JOB *j0, SHA1_JOB*j1, SHA1_JOB*j2, SHA1_JOB *j3, int blocks)
+ */
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ num_blocks .req w4
+ tmp .req x5
+ data0 .req x6
+ data1 .req x7
+ data2 .req x8
+ data3 .req x9
+ databuf .req x10
+ dataptr .req x11
+ savedsp .req x12
+
+ .global sha1_mb_asimd_x4
+ .type sha1_mb_asimd_x4, %function
+sha1_mb_asimd_x4:
+ cmp num_blocks, #0
+ beq .return
+ sha1_asimd_save_stack
+ mov savedsp,sp
+ sub databuf,sp,256
+ mov tmp,63
+ bic databuf,databuf,tmp
+ mov sp,databuf
+
+ add tmp,job0,64
+ ld4 {VA.s,VB.s,VC.s,VD.s}[0],[tmp],#16
+ ld1 {VE.s}[0],[tmp]
+ ldr data0,[job0]
+
+ add tmp,job1,64
+ ld4 {VA.s,VB.s,VC.s,VD.s}[1],[tmp],#16
+ ld1 {VE.s}[1],[tmp]
+ ldr data1,[job1]
+
+ add tmp,job2,64
+ ld4 {VA.s,VB.s,VC.s,VD.s}[2],[tmp],#16
+ ld1 {VE.s}[2],[tmp]
+ ldr data2,[job2]
+
+ add tmp,job3,64
+ ld4 {VA.s,VB.s,VC.s,VD.s}[3],[tmp],#16
+ ld1 {VE.s}[3],[tmp]
+ ldr data3,[job3]
+
+.block_loop:
+ mov dataptr,databuf
+ sha1_single
+ subs num_blocks, num_blocks, 1
+ bne .block_loop
+
+ add tmp,job0,64
+ st4 {VA.s,VB.s,VC.s,VD.s}[0],[tmp],#16
+ st1 {VE.s}[0],[tmp]
+
+ add tmp,job1,64
+ st4 {VA.s,VB.s,VC.s,VD.s}[1],[tmp],#16
+ st1 {VE.s}[1],[tmp]
+
+ add tmp,job2,64
+ st4 {VA.s,VB.s,VC.s,VD.s}[2],[tmp],#16
+ st1 {VE.s}[2],[tmp]
+
+ add tmp,job3,64
+ st4 {VA.s,VB.s,VC.s,VD.s}[3],[tmp],#16
+ st1 {VE.s}[3],[tmp]
+
+ mov sp,savedsp
+ sha1_asimd_restore_stack
+.return:
+ ret
+
+ .size sha1_mb_asimd_x4, .-sha1_mb_asimd_x4
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+KEY_0:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c
new file mode 100644
index 000000000..4b34e7b53
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c
@@ -0,0 +1,217 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sha1_mb.h>
+#include <assert.h>
+#include "endian_helper.h"
+
+extern void sha1_aarch64_x1(const uint8_t * data, int num_blocks, uint32_t digest[]);
+static inline void sha1_job_x1(SHA1_JOB * job, int blocks)
+{
+ sha1_aarch64_x1(job->buffer, blocks, job->result_digest);
+}
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SHA1_MB_ASIMD_MAX_LANES 4
+void sha1_mb_asimd_x4(SHA1_JOB *, SHA1_JOB *, SHA1_JOB *, SHA1_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+
+void sha1_mb_mgr_init_asimd(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < SHA1_MB_ASIMD_MAX_LANES; i++) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= SHA1_MB_ASIMD_MAX_LANES - 1 - i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+ // lanes > SHA1_MB_ASIMD_MAX_LANES is invalid lane
+ for (; i < SHA1_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sha1_mb_mgr_do_jobs(SHA1_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes, blocks;
+ int lane_idx_array[SHA1_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+ lanes = 0, len = 0;
+ for (i = 0; i < SHA1_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+ blocks = len >> 4;
+
+ /* for less-than-3-lane job, ASIMD really does not have much advantage
+ * compared to scalar due to wasted >= 50% capacity
+ * therefore we only run ASIMD for 3/4 lanes of data
+ */
+ if (lanes == SHA1_MB_ASIMD_MAX_LANES) {
+ sha1_mb_asimd_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane,
+ state->ldata[lane_idx_array[3]].job_in_lane, blocks);
+ } else if (lanes == 3) {
+ /* in case of 3 lanes, apparently ASIMD will still operate as if
+ * there were four lanes of data in processing (waste 25% capacity)
+ * theoretically we can let ASIMD implementation know the number of lanes
+ * so that it could "at least" save some memory loading time
+ * but in practice, we can just pass lane 0 as dummy for similar
+ * cache performance
+ */
+ SHA1_JOB dummy;
+ dummy.buffer = state->ldata[lane_idx_array[0]].job_in_lane->buffer;
+ dummy.len = state->ldata[lane_idx_array[0]].job_in_lane->len;
+ sha1_mb_asimd_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+ &dummy,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane, blocks);
+ } else {
+ sha1_job_x1(state->ldata[lane_idx_array[0]].job_in_lane, blocks);
+ if (lanes >= 2) {
+ sha1_job_x1(state->ldata[lane_idx_array[1]].job_in_lane, blocks);
+ }
+ }
+
+ // only return the min length job
+ for (i = 0; i < SHA1_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+ return lane_idx;
+
+}
+
+static SHA1_JOB *sha1_mb_mgr_free_lane(SHA1_MB_JOB_MGR * state)
+{
+ int i;
+ SHA1_JOB *ret = NULL;
+
+ for (i = 0; i < SHA1_MB_ASIMD_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sha1_mb_mgr_insert_job(SHA1_MB_JOB_MGR * state, SHA1_JOB * job)
+{
+ int lane_idx;
+ // add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+ // fatal error
+ assert(lane_idx < SHA1_MB_ASIMD_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SHA1_JOB *sha1_mb_mgr_submit_asimd(SHA1_MB_JOB_MGR * state, SHA1_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SHA1_JOB *ret;
+
+ // add job into lanes
+ sha1_mb_mgr_insert_job(state, job);
+
+ ret = sha1_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+ // submit will wait all lane has data
+ if (state->num_lanes_inuse < SHA1_MB_ASIMD_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sha1_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sha1_mb_mgr_do_jobs(state);
+#endif
+
+ // ~ i = lane_idx;
+ ret = sha1_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SHA1_JOB *sha1_mb_mgr_flush_asimd(SHA1_MB_JOB_MGR * state)
+{
+ SHA1_JOB *ret;
+ ret = sha1_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sha1_mb_mgr_do_jobs(state);
+ return sha1_mb_mgr_free_lane(state);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c
new file mode 100644
index 000000000..1dfd67d0c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c
@@ -0,0 +1,208 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sha1_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SHA1_MB_CE_MAX_LANES 2
+#if SHA1_MB_CE_MAX_LANES >=2
+void sha1_mb_ce_x2(SHA1_JOB *, SHA1_JOB *, int);
+#endif
+void sha1_mb_ce_x1(SHA1_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+void sha1_mb_mgr_init_ce(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < SHA1_MB_CE_MAX_LANES; i++) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+ //lanes > SHA1_MB_CE_MAX_LANES is invalid lane
+ for (; i < SHA1_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sha1_mb_mgr_do_jobs(SHA1_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes;
+
+ int lane_idx_array[SHA1_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+#if SHA1_MB_CE_MAX_LANES == 2
+ if (state->num_lanes_inuse == 2) {
+ len = min(state->lens[0], state->lens[1]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha1_mb_ce_x2(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane, len >> 4);
+
+ } else
+#endif
+ {
+ lanes = 0, len = 0;
+ for (i = 0; i < SHA1_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+
+#if SHA1_MB_CE_MAX_LANES >=2
+ if (lanes == 2) {
+ sha1_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+ } else
+#endif
+ {
+ sha1_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+ }
+ }
+ //only return the min length job
+ for (i = 0; i < SHA1_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+
+}
+
+static SHA1_JOB *sha1_mb_mgr_free_lane(SHA1_MB_JOB_MGR * state)
+{
+ int i;
+ SHA1_JOB *ret = NULL;
+
+ for (i = 0; i < SHA1_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sha1_mb_mgr_insert_job(SHA1_MB_JOB_MGR * state, SHA1_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+ //fatal error
+ assert(lane_idx < SHA1_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SHA1_JOB *sha1_mb_mgr_submit_ce(SHA1_MB_JOB_MGR * state, SHA1_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SHA1_JOB *ret;
+
+ //add job into lanes
+ sha1_mb_mgr_insert_job(state, job);
+
+ ret = sha1_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+ //submit will wait all lane has data
+ if (state->num_lanes_inuse < SHA1_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sha1_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sha1_mb_mgr_do_jobs(state);
+#endif
+
+ //~ i = lane_idx;
+ ret = sha1_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SHA1_JOB *sha1_mb_mgr_flush_ce(SHA1_MB_JOB_MGR * state)
+{
+ SHA1_JOB *ret;
+ ret = sha1_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sha1_mb_mgr_do_jobs(state);
+ return sha1_mb_mgr_free_lane(state);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S
new file mode 100644
index 000000000..bb1929d76
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface sha1_ctx_mgr_submit
+mbin_interface sha1_ctx_mgr_init
+mbin_interface sha1_ctx_mgr_flush
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S
new file mode 100644
index 000000000..22f736793
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S
@@ -0,0 +1,194 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+
+/**
+maros for round 4-67
+*/
+.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req,k:req
+ sha1h \e0\()_s, \abcd\()_s
+ \inst \abcd\()_q,\e1\()_s,\tmp1\()_v.4s
+ add \tmp1\()_v.4s,\msg3\()_v.4s,\k\()_v.4s
+ sha1su1 \msg0\()_v.4s,\msg3\()_v.4s
+ sha1su0 \msg1\()_v.4s,\msg2\()_v.4s,\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key_0,28
+ declare_var_vector_reg key_1,29
+ declare_var_vector_reg key_2,30
+ declare_var_vector_reg key_3,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg abcd,0
+ declare_var_vector_reg e0,1
+ declare_var_vector_reg e1,2
+ declare_var_vector_reg abcd_saved,3
+ declare_var_vector_reg e0_saved,4
+/*
+Message variables
+*/
+ declare_var_vector_reg msg_0,16
+ declare_var_vector_reg msg_1,17
+ declare_var_vector_reg msg_2,18
+ declare_var_vector_reg msg_3,19
+/*
+Temporay variables
+*/
+ declare_var_vector_reg tmp_0,5
+ declare_var_vector_reg tmp_1,6
+
+/*
+ void sha1_mb_ce_x1(SHA1_JOB * job, int len);
+*/
+/*
+Arguements list
+*/
+ job .req x0
+ len .req w1
+ data .req x2
+ tmp .req x3
+ .global sha1_mb_ce_x1
+ .type sha1_mb_ce_x1, %function
+sha1_mb_ce_x1:
+ ldr data, [job]
+ ldr abcd_q, [job, 64]
+ ldr e0_s, [job, 80]
+ adr tmp, KEY
+ ld1 {key_0_v.4s-key_3_v.4s},[tmp]
+
+start_loop:
+
+ //load msgs
+ ld1 {msg_0_v.4s-msg_3_v.4s},[data]
+
+ //adjust loop parameter
+ add data,data,64
+ sub len, len, #1
+ cmp len, 0
+ //backup digest
+ mov abcd_saved_v.16b,abcd_v.16b
+ mov e0_saved_v.16b,e0_v.16b
+
+ rev32 msg_0_v.16b,msg_0_v.16b
+ rev32 msg_1_v.16b,msg_1_v.16b
+ add tmp_0_v.4s,msg_0_v.4s,key_0_v.4s
+ rev32 msg_2_v.16b,msg_2_v.16b
+ add tmp_1_v.4s,msg_1_v.4s,key_0_v.4s
+ rev32 msg_3_v.16b,msg_3_v.16b
+
+ /* rounds 0-3 */
+ sha1h e1_s,abcd_s
+ sha1c abcd_q,e0_s,tmp_0_v.4s
+ add tmp_0_v.4s,msg_2_v.4s,key_0_v.4s
+ sha1su0 msg_0_v.4s,msg_1_v.4s,msg_2_v.4s
+
+ sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_0 /* rounds 4-7 */
+ sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_0
+ sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 12-15 */
+ sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_1
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 20-23 */
+ sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_1
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_2 /* rounds 36-39 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_2
+ sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_3 /* rounds 52-55 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_3
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_3
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_3
+
+ /* rounds 68-71 */
+ sha1h e0_s,abcd_s
+ sha1p abcd_q,e1_s,tmp_1_v.4s
+ add tmp_1_v.4s,msg_3_v.4s,key_3_v.4s
+ sha1su1 msg_0_v.4s,msg_3_v.4s
+
+ /* rounds 72-75 */
+ sha1h e1_s,abcd_s
+ sha1p abcd_q,e0_s,tmp_0_v.4s
+
+ /* rounds 76-79 */
+ sha1h e0_s,abcd_s
+ sha1p abcd_q,e1_s,tmp_1_v.4s
+
+
+
+ add abcd_v.4s,abcd_v.4s,abcd_saved_v.4s
+ add e0_v.2s,e0_v.2s,e0_saved_v.2s
+
+
+ bgt start_loop
+ str abcd_q, [job, 64]
+ str e0_s, [job, 80]
+
+ ret
+
+ .size sha1_mb_ce_x1, .-sha1_mb_ce_x1
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S
new file mode 100644
index 000000000..93f653ad2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S
@@ -0,0 +1,253 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+
+/**
+maros for round 4-67
+*/
+.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req,k:req
+ sha1h l0_\e0\()_s, l0_\abcd\()_s
+ sha1h l1_\e0\()_s, l1_\abcd\()_s
+
+ \inst l0_\abcd\()_q,l0_\e1\()_s,l0_\tmp1\()_v.4s
+ \inst l1_\abcd\()_q,l1_\e1\()_s,l1_\tmp1\()_v.4s
+
+ add l0_\tmp1\()_v.4s,l0_\msg3\()_v.4s,\k\()_v.4s
+ add l1_\tmp1\()_v.4s,l1_\msg3\()_v.4s,\k\()_v.4s
+
+ sha1su1 l0_\msg0\()_v.4s,l0_\msg3\()_v.4s
+ sha1su1 l1_\msg0\()_v.4s,l1_\msg3\()_v.4s
+
+ sha1su0 l0_\msg1\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+ sha1su0 l1_\msg1\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key_0,28
+ declare_var_vector_reg key_1,29
+ declare_var_vector_reg key_2,30
+ declare_var_vector_reg key_3,31
+
+
+/*
+lane variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_e0,1
+ declare_var_vector_reg l0_e1,2
+ declare_var_vector_reg l0_abcd_saved,3
+ declare_var_vector_reg l0_e0_saved,4
+ declare_var_vector_reg l0_tmp_0,5
+ declare_var_vector_reg l0_tmp_1,6
+ declare_var_vector_reg l0_msg_0,16
+ declare_var_vector_reg l0_msg_1,17
+ declare_var_vector_reg l0_msg_2,18
+ declare_var_vector_reg l0_msg_3,19
+
+ declare_var_vector_reg l1_abcd,7
+ declare_var_vector_reg l1_e0,8
+ declare_var_vector_reg l1_e1,9
+ declare_var_vector_reg l1_abcd_saved,24
+ declare_var_vector_reg l1_e0_saved,25
+ declare_var_vector_reg l1_tmp_0,26
+ declare_var_vector_reg l1_tmp_1,27
+ declare_var_vector_reg l1_msg_0,20
+ declare_var_vector_reg l1_msg_1,21
+ declare_var_vector_reg l1_msg_2,22
+ declare_var_vector_reg l1_msg_3,23
+
+/*
+ void sha1_mb_ce_x2(SHA1_JOB * job_0, SHA1_JOB * job_1,int len);
+*/
+ l0_job .req x0
+ l1_job .req x1
+ len .req w2
+
+ l0_data .req x3
+ l1_data .req x4
+ tmp .req x5
+ .global sha1_mb_ce_x2
+ .type sha1_mb_ce_x2, %function
+sha1_mb_ce_x2:
+ //push d8,d9 to stack
+ stp d8, d9, [sp, -256]!
+
+ adr tmp, KEY
+ ld1 {key_0_v.4s-key_3_v.4s},[tmp]
+ ldr l0_data, [l0_job]
+ ldr l1_data, [l1_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_e0_s, [l0_job, 80]
+ ldr l1_abcd_q, [l1_job, 64]
+ ldr l1_e0_s, [l1_job, 80]
+
+start_loop:
+
+ //load msgs
+ ld1 {l0_msg_0_v.4s-l0_msg_3_v.4s},[l0_data]
+ ld1 {l1_msg_0_v.4s-l1_msg_3_v.4s},[l1_data]
+
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ add l1_data,l1_data,64
+ sub len, len, #1
+ cmp len, 0
+ //backup digest
+ mov l0_abcd_saved_v.16b, l0_abcd_v.16b
+ mov l0_e0_saved_v.16b, l0_e0_v.16b
+ mov l1_abcd_saved_v.16b, l1_abcd_v.16b
+ mov l1_e0_saved_v.16b, l1_e0_v.16b
+
+ rev32 l0_msg_0_v.16b, l0_msg_0_v.16b
+ rev32 l0_msg_1_v.16b, l0_msg_1_v.16b
+ add l0_tmp_0_v.4s, l0_msg_0_v.4s, key_0_v.4s
+ rev32 l0_msg_2_v.16b, l0_msg_2_v.16b
+ add l0_tmp_1_v.4s, l0_msg_1_v.4s, key_0_v.4s
+ rev32 l0_msg_3_v.16b, l0_msg_3_v.16b
+
+ rev32 l1_msg_0_v.16b, l1_msg_0_v.16b
+ rev32 l1_msg_1_v.16b, l1_msg_1_v.16b
+ add l1_tmp_0_v.4s, l1_msg_0_v.4s, key_0_v.4s
+ rev32 l1_msg_2_v.16b, l1_msg_2_v.16b
+ add l1_tmp_1_v.4s, l1_msg_1_v.4s, key_0_v.4s
+ rev32 l1_msg_3_v.16b, l1_msg_3_v.16b
+
+ /* rounds 0-3 */
+ sha1h l0_e1_s, l0_abcd_s
+ sha1c l0_abcd_q, l0_e0_s, l0_tmp_0_v.4s
+ add l0_tmp_0_v.4s, l0_msg_2_v.4s, key_0_v.4s
+ sha1su0 l0_msg_0_v.4s, l0_msg_1_v.4s, l0_msg_2_v.4s
+
+ sha1h l1_e1_s, l1_abcd_s
+ sha1c l1_abcd_q, l1_e0_s, l1_tmp_0_v.4s
+ add l1_tmp_0_v.4s, l1_msg_2_v.4s, key_0_v.4s
+ sha1su0 l1_msg_0_v.4s, l1_msg_1_v.4s, l1_msg_2_v.4s
+
+ sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_0 /* rounds 4-7 */
+ sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_0
+ sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 12-15 */
+ sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_1
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 20-23 */
+ sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_1
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_2 /* rounds 36-39 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_2
+ sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_3 /* rounds 52-55 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_3
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_3
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_3
+
+ /* rounds 68-71 */
+ sha1h l0_e0_s, l0_abcd_s
+ sha1p l0_abcd_q, l0_e1_s, l0_tmp_1_v.4s
+ add l0_tmp_1_v.4s, l0_msg_3_v.4s, key_3_v.4s
+ sha1su1 l0_msg_0_v.4s, l0_msg_3_v.4s
+
+ sha1h l1_e0_s, l1_abcd_s
+ sha1p l1_abcd_q, l1_e1_s, l1_tmp_1_v.4s
+ add l1_tmp_1_v.4s, l1_msg_3_v.4s, key_3_v.4s
+ sha1su1 l1_msg_0_v.4s, l1_msg_3_v.4s
+
+ /* rounds 72-75 */
+ sha1h l0_e1_s, l0_abcd_s
+ sha1p l0_abcd_q, l0_e0_s, l0_tmp_0_v.4s
+
+ sha1h l1_e1_s, l1_abcd_s
+ sha1p l1_abcd_q, l1_e0_s, l1_tmp_0_v.4s
+
+ /* rounds 76-79 */
+ sha1h l0_e0_s, l0_abcd_s
+ sha1p l0_abcd_q, l0_e1_s, l0_tmp_1_v.4s
+
+ sha1h l1_e0_s, l1_abcd_s
+ sha1p l1_abcd_q, l1_e1_s, l1_tmp_1_v.4s
+
+
+
+ add l0_abcd_v.4s, l0_abcd_v.4s, l0_abcd_saved_v.4s
+ add l0_e0_v.2s, l0_e0_v.2s, l0_e0_saved_v.2s
+ add l1_abcd_v.4s, l1_abcd_v.4s, l1_abcd_saved_v.4s
+ add l1_e0_v.2s, l1_e0_v.2s, l1_e0_saved_v.2s
+
+
+
+
+ bgt start_loop
+
+ str l0_abcd_q, [l0_job, 64]
+ str l0_e0_s, [l0_job, 80]
+
+
+ str l1_abcd_q, [l1_job, 64]
+ str l1_e0_s, [l1_job, 80]
+
+ //pop d8,d9 from stack
+ ldp d8, d9, [sp], 256
+ ret
+
+ .size sha1_mb_ce_x2, .-sha1_mb_ce_x2
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c
new file mode 100644
index 000000000..ad91d64ac
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c
@@ -0,0 +1,265 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx_slver_02020142;
+struct slver sha1_ctx_mgr_init_avx_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_submit_avx_slver_02020143;
+struct slver sha1_ctx_mgr_submit_avx_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_flush_avx_slver_02020144;
+struct slver sha1_ctx_mgr_flush_avx_slver = { 0x0144, 0x02, 0x02 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c
new file mode 100644
index 000000000..85977d4c2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c
@@ -0,0 +1,264 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx2(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx2(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx2(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx2_slver_04020145;
+struct slver sha1_ctx_mgr_init_avx2_slver = { 0x0145, 0x02, 0x04 };
+
+struct slver sha1_ctx_mgr_submit_avx2_slver_04020146;
+struct slver sha1_ctx_mgr_submit_avx2_slver = { 0x0146, 0x02, 0x04 };
+
+struct slver sha1_ctx_mgr_flush_avx2_slver_04020147;
+struct slver sha1_ctx_mgr_flush_avx2_slver = { 0x0147, 0x02, 0x04 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c
new file mode 100644
index 000000000..90e087163
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c
@@ -0,0 +1,271 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx512(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx512(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx512(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx512_slver_0600014a;
+struct slver sha1_ctx_mgr_init_avx512_slver = { 0x014a, 0x00, 0x06 };
+
+struct slver sha1_ctx_mgr_submit_avx512_slver_0600014b;
+struct slver sha1_ctx_mgr_submit_avx512_slver = { 0x014b, 0x00, 0x06 };
+
+struct slver sha1_ctx_mgr_flush_avx512_slver_0600014c;
+struct slver sha1_ctx_mgr_flush_avx512_slver = { 0x014c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c
new file mode 100644
index 000000000..2013f829a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c
@@ -0,0 +1,281 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+/**
+ * sha1_ctx_avx512_ni related functions are aiming to utilize Canon Lake.
+ * Since SHANI is still slower than multibuffer for full lanes,
+ * sha1_ctx_mgr_init_avx512_ni and sha1_ctx_mgr_submit_avx512_ni are
+ * similar with their avx512 versions.
+ * sha1_ctx_mgr_flush_avx512_ni is different. It will call
+ * sha1_mb_mgr_flush_avx512_ni which would use shani when lanes are less
+ * than a threshold.
+ *
+ */
+#if defined(HAVE_AS_KNOWS_AVX512) && defined(HAVE_AS_KNOWS_SHANI)
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx512_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx512_ni(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx512_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx512_ni(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx512_ni_slver_080002c4;
+struct slver sha1_ctx_mgr_init_avx512_ni_slver = { 0x02c4, 0x00, 0x08 };
+
+struct slver sha1_ctx_mgr_submit_avx512_ni_slver_080002c5;
+struct slver sha1_ctx_mgr_submit_avx512_ni_slver = { 0x02c5, 0x00, 0x08 };
+
+struct slver sha1_ctx_mgr_flush_avx512_ni_slver_080002c6;
+struct slver sha1_ctx_mgr_flush_avx512_ni_slver = { 0x02c6, 0x00, 0x08 };
+
+#endif // HAVE_AS_KNOWS_AVX512 and HAVE_AS_KNOWS_SHANI
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c
new file mode 100644
index 000000000..90481efd0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c
@@ -0,0 +1,325 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (b ^ c ^ d)
+#define F3(b,c,d) ((b & c) | (d & (b | c)))
+#define F4(b,c,d) (b ^ c ^ d)
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+#define W(x) w[(x) & 15]
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = to_be32(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
+
+static void sha1_init(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t sha1_update(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void sha1_final(SHA1_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX sha1_single(const void *data, uint32_t digest[]);
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+
+void sha1_ctx_mgr_init_base(SHA1_HASH_CTX_MGR * mgr)
+{
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_base(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+
+ sha1_init(ctx, buffer, len);
+ sha1_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ sha1_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = sha1_update(ctx, buffer, len);
+ sha1_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ sha1_init(ctx, buffer, len);
+ remain_len = sha1_update(ctx, buffer, len);
+ sha1_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_base(SHA1_HASH_CTX_MGR * mgr)
+{
+ return NULL;
+}
+
+static void sha1_init(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t sha1_update(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint32_t *digest = ctx->job.result_digest;
+
+ while (remain_len >= SHA1_BLOCK_SIZE) {
+ sha1_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + SHA1_BLOCK_SIZE);
+ remain_len -= SHA1_BLOCK_SIZE;
+ ctx->total_length += SHA1_BLOCK_SIZE;
+ }
+
+ ctx->status = HASH_CTX_STS_IDLE;
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void sha1_final(SHA1_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+ uint32_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy(buf, buffer, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - SHA1_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - SHA1_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8);
+
+ sha1_single(buf, digest);
+ if (i == 2 * SHA1_BLOCK_SIZE) {
+ sha1_single(buf + SHA1_BLOCK_SIZE, digest);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+}
+
+void sha1_single(const void *data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+struct slver sha1_ctx_mgr_init_base_slver_00000192;
+struct slver sha1_ctx_mgr_init_base_slver = { 0x0192, 0x00, 0x00 };
+
+struct slver sha1_ctx_mgr_submit_base_slver_00000193;
+struct slver sha1_ctx_mgr_submit_base_slver = { 0x0193, 0x00, 0x00 };
+
+struct slver sha1_ctx_mgr_flush_base_slver_00000194;
+struct slver sha1_ctx_mgr_flush_base_slver = { 0x0194, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c
new file mode 100644
index 000000000..32eb07f6e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+
+extern void sha1_ctx_mgr_init_base(SHA1_HASH_CTX_MGR * mgr);
+extern SHA1_HASH_CTX *sha1_ctx_mgr_submit_base(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags);
+extern SHA1_HASH_CTX *sha1_ctx_mgr_flush_base(SHA1_HASH_CTX_MGR * mgr);
+
+void sha1_ctx_mgr_init(SHA1_HASH_CTX_MGR * mgr)
+{
+ return sha1_ctx_mgr_init_base(mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return sha1_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush(SHA1_HASH_CTX_MGR * mgr)
+{
+ return sha1_ctx_mgr_flush_base(mgr);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c
new file mode 100644
index 000000000..db70ee015
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c
@@ -0,0 +1,251 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_sse(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_sse(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_sse(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_sse_slver_00020139;
+struct slver sha1_ctx_mgr_init_sse_slver = { 0x0139, 0x02, 0x00 };
+
+struct slver sha1_ctx_mgr_submit_sse_slver_00020140;
+struct slver sha1_ctx_mgr_submit_sse_slver = { 0x0140, 0x02, 0x00 };
+
+struct slver sha1_ctx_mgr_flush_sse_slver_00020141;
+struct slver sha1_ctx_mgr_flush_sse_slver = { 0x0141, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c
new file mode 100644
index 000000000..d3c7687d2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c
@@ -0,0 +1,259 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_SHANI
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_sse_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+ // Same with sse
+ sha1_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_sse_ni(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_sse_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_sse_ni(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_sse_ni_slver_070002c1;
+struct slver sha1_ctx_mgr_init_sse_ni_slver = { 0x02c1, 0x00, 0x07 };
+
+struct slver sha1_ctx_mgr_submit_sse_ni_slver_070002c2;
+struct slver sha1_ctx_mgr_submit_sse_ni_slver = { 0x02c2, 0x00, 0x07 };
+
+struct slver sha1_ctx_mgr_flush_sse_ni_slver_070002c3;
+struct slver sha1_ctx_mgr_flush_sse_ni_slver = { 0x02c3, 0x00, 0x07 };
+
+#endif // HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm
new file mode 100644
index 000000000..1c9a66fd4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm
@@ -0,0 +1,67 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Threshold constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; if number of lanes in use <= threshold, using sb func
+%define SHA1_SB_THRESHOLD_SSE 1
+%define SHA1_SB_THRESHOLD_AVX 1
+%define SHA1_SB_THRESHOLD_AVX2 1
+%define SHA1_SB_THRESHOLD_AVX512 1
+%define SHA1_NI_SB_THRESHOLD_SSE 4 ; shani is faster than sse sha1_mb
+%define SHA1_NI_SB_THRESHOLD_AVX512 6
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA1_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA1_JOB
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 4, 4 ; length in bytes
+FIELD _result_digest, 5*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+%assign _SHA1_JOB_size _FIELD_OFFSET
+%assign _SHA1_JOB_align _STRUCT_ALIGN
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c
new file mode 100644
index 000000000..4bf2e09b5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c
@@ -0,0 +1,146 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS (SHA1_MAX_LANES - 1)
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+uint8_t lens_print_and_check(SHA1_HASH_CTX_MGR * mgr)
+{
+ static int32_t last_lens[SHA1_MAX_LANES] = { 0 };
+ int32_t len;
+ uint8_t num_unchanged = 0;
+ int i;
+ for (i = 0; i < SHA1_MAX_LANES; i++) {
+ len = (int32_t) mgr->mgr.lens[i];
+ // len[i] in mgr consists of byte_length<<4 | lane_index
+ len = (len >= 16) ? (len >> 4 << 6) : 0;
+ printf("\t%d", len);
+ if (last_lens[i] > 0 && last_lens[i] == len)
+ num_unchanged += 1;
+ last_lens[i] = len;
+ }
+ printf("\n");
+ return num_unchanged;
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ uint8_t num_ret, num_unchanged = 0;
+ int ret;
+
+ printf("sha1_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ lens[i] = TEST_LEN / SHA1_MAX_LANES * (i + 1);
+ bufs[i] = (unsigned char *)malloc(lens[i]);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], lens[i]);
+ }
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Init ctx contexts
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ printf("Changes of lens inside mgr:\n");
+ lens_print_and_check(mgr);
+ while (sha1_ctx_mgr_flush(mgr)) {
+ num_ret = lens_print_and_check(mgr);
+ num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret;
+ }
+ printf("Info of sha1_mb lens prints over\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else if (num_unchanged)
+ printf("SHA-NI is used when %d or %d jobs are uncompleted\n",
+ num_unchanged, num_unchanged + 1);
+ else
+ printf("SHA-NI is not used, or used for last job\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..21c81403b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm
@@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA1 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA1_ARGS_X16
+;;; name size align
+FIELD _digest, 4*5*16, 16 ; transposed digest
+FIELD _data_ptr, 8*16, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SHA1_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA1_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA1_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA1_ARGS_X8_align _STRUCT_ALIGN
+%assign _SHA1_ARGS_X16_size _FIELD_OFFSET
+%assign _SHA1_ARGS_X16_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align
+FIELD _lens, 4*16, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..c5fd71300
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm
@@ -0,0 +1,247 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_avx
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be in a register not clobberred by sha1_mult
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be in a register not clobberred by sha1_mult
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_avx, function
+sha1_mb_mgr_flush_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; use num_lanes_inuse to judge all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+ cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; avx has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..a47ae2838
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm
@@ -0,0 +1,273 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x8_avx2
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobberred by sha1_mb_x8_avx2 and sha1_opt_x1
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx2(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_avx2, function
+sha1_mb_mgr_flush_avx2:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; use num_lanes_inuse to judge all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+ cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX2
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x2000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*32]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..5e3db5b9b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm
@@ -0,0 +1,271 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha1_mb_x16_avx512
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions and latter-state(unused,covered,unchanged)
+%define state arg1 ; unchanged
+%define job arg2 ; unused
+%define len2 arg2 ; unused
+
+; idx must be a register not clobberred by sha1_mb_x16_avx512
+%define idx rbp ; unchanged
+
+%define unused_lanes rbx ; covered
+%define lane_data rbx ; covered
+%define tmp2 rbx ; covered
+
+%define num_lanes_inuse r9 ; covered
+
+%define job_rax rax ; covered
+%define tmp rax ; unused
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx512(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_avx512, function
+sha1_mb_mgr_flush_avx512:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+ cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX512
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x4000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*64]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_mgr_flush_avx512
+no_sha1_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm
new file mode 100644
index 000000000..4170b6c73
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm
@@ -0,0 +1,278 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+
+extern sha1_mb_x16_avx512
+extern sha1_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions and latter-state(unused,covered,unchanged)
+%define state arg1 ; unchanged
+%define job arg2 ; unused
+%define len2 arg2 ; unused
+
+; idx must be a register not clobberred by sha1_mb_x16_avx512
+%define idx rbp ; unchanged
+
+%define unused_lanes rbx ; covered
+%define lane_data rbx ; covered
+%define tmp2 rbx ; covered
+
+%define num_lanes_inuse r9 ; covered
+
+%define job_rax rax ; covered
+%define tmp rax ; unused
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx512(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_avx512_ni, function
+sha1_mb_mgr_flush_avx512_ni:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ ; compare with shani-sb threshold, if num_lanes_inuse <= threshold, using shani func
+ cmp dword [state + _num_lanes_inuse], SHA1_NI_SB_THRESHOLD_AVX512
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x4000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_ni_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*64]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+ %else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_flush_avx512_ni
+ no_sha1_mb_mgr_flush_avx512_ni:
+ %endif
+ %endif ; HAVE_AS_KNOWS_SHANI
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_flush_avx512_ni
+ no_sha1_mb_mgr_flush_avx512_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..2a4c4b50a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm
@@ -0,0 +1,249 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_sse
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_sse(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_sse, function
+sha1_mb_mgr_flush_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; use num_lanes_inuse to judge all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+ cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_SSE
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm
new file mode 100644
index 000000000..ea3cffd33
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm
@@ -0,0 +1,256 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha1_mb_x4_sse
+extern sha1_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_sse_ni(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_sse_ni, function
+sha1_mb_mgr_flush_sse_ni:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; use num_lanes_inuse to judge all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+ cmp dword [state + _num_lanes_inuse], SHA1_NI_SB_THRESHOLD_SSE
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_ni_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_flush_sse_ni
+ no_sha1_mb_mgr_flush_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..b6124486a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+
+void sha1_mb_mgr_init_avx2(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA1_X8_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..033fb3c9f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+
+void sha1_mb_mgr_init_avx512(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA1_MAX_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c
new file mode 100644
index 000000000..811c4a9dd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+
+void sha1_mb_mgr_init_sse(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF3210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA1_MIN_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..49c018138
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm
@@ -0,0 +1,246 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_avx
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be in a register not clobberred by sha1_mult
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%else
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+; idx needs to be in a register not clobberred by sha1_mult
+%define last_len rdx
+%define idx rdx
+
+%define size_offset rcx
+%define tmp2 rcx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*4 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_avx(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_avx, function
+sha1_mb_mgr_submit_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*4 + 16*0], xmm6
+ vmovdqa [rsp + 8*4 + 16*1], xmm7
+ vmovdqa [rsp + 8*4 + 16*2], xmm8
+ vmovdqa [rsp + 8*4 + 16*3], xmm9
+ vmovdqa [rsp + 8*4 + 16*4], xmm10
+ vmovdqa [rsp + 8*4 + 16*5], xmm11
+ vmovdqa [rsp + 8*4 + 16*6], xmm12
+ vmovdqa [rsp + 8*4 + 16*7], xmm13
+ vmovdqa [rsp + 8*4 + 16*8], xmm14
+ vmovdqa [rsp + 8*4 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*4 + 16*0]
+ vmovdqa xmm7, [rsp + 8*4 + 16*1]
+ vmovdqa xmm8, [rsp + 8*4 + 16*2]
+ vmovdqa xmm9, [rsp + 8*4 + 16*3]
+ vmovdqa xmm10, [rsp + 8*4 + 16*4]
+ vmovdqa xmm11, [rsp + 8*4 + 16*5]
+ vmovdqa xmm12, [rsp + 8*4 + 16*6]
+ vmovdqa xmm13, [rsp + 8*4 + 16*7]
+ vmovdqa xmm14, [rsp + 8*4 + 16*8]
+ vmovdqa xmm15, [rsp + 8*4 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x67452301
+H1: dd 0xefcdab89
+H2: dd 0x98badcfe
+H3: dd 0x10325476
+H4: dd 0xc3d2e1f0
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..95b4f1715
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm
@@ -0,0 +1,250 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "memcpy.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%define extra_blocks rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%define extra_blocks rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+; idx must be a register not clobberred by sha1_x8_avx2
+%define idx r8
+%define last_len r8
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* sha1_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA1 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_avx2, function
+sha1_mb_mgr_submit_avx2:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+
+ shl len,4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+
+ vmovd [state + _args_digest + 4*lane + 0*32], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*32], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xf
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x8_avx2
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*32]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..a4f9389a1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm
@@ -0,0 +1,248 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "memcpy.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha1_mb_x16_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions and latter-state(unused,covered,unchanged)
+%define state arg1 ; unchanged, mb_x16's input1
+%define job arg2 ; arg2 unused
+%define len2 arg2 ; arg2 unused, mb_x16's input2
+
+; idx must be a register not clobberred by sha1_x16_avx512
+%define idx r8 ; unchanged
+
+%define p r11 ; unused
+
+%define unused_lanes rbx ; covered
+
+%define job_rax rax ; covered
+%define len rax ; unused
+
+%define lane rbp ; unused
+
+%define tmp r9 ; covered
+%define num_lanes_inuse r9 ; covered
+
+%define lane_data r10 ; covered
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* sha1_mb_mgr_submit_avx512(MB_MGR *state, JOB_SHA1 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_avx512, function
+sha1_mb_mgr_submit_avx512:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+
+ shl len,4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+
+ vmovd [state + _args_digest + 4*lane + 0*64], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*64], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*64], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*64], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*64], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+ ; Find min length, ymm0 holds ahead 8, ymm1 holds rear 8
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF ; idx represent min length index
+ shr len2, 4 ; size in blocks
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x16_avx512
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*64]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_mgr_submit_avx512
+no_sha1_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..9989a9a1d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm
@@ -0,0 +1,246 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_sse
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%else
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rdx
+%define idx rdx
+
+%define size_offset rcx
+%define tmp2 rcx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*4 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_sse(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_sse, function
+sha1_mb_mgr_submit_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ movdqa [rsp + 8*4 + 16*0], xmm6
+ movdqa [rsp + 8*4 + 16*1], xmm7
+ movdqa [rsp + 8*4 + 16*2], xmm8
+ movdqa [rsp + 8*4 + 16*3], xmm9
+ movdqa [rsp + 8*4 + 16*4], xmm10
+ movdqa [rsp + 8*4 + 16*5], xmm11
+ movdqa [rsp + 8*4 + 16*6], xmm12
+ movdqa [rsp + 8*4 + 16*7], xmm13
+ movdqa [rsp + 8*4 + 16*8], xmm14
+ movdqa [rsp + 8*4 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 8*4 + 16*0]
+ movdqa xmm7, [rsp + 8*4 + 16*1]
+ movdqa xmm8, [rsp + 8*4 + 16*2]
+ movdqa xmm9, [rsp + 8*4 + 16*3]
+ movdqa xmm10, [rsp + 8*4 + 16*4]
+ movdqa xmm11, [rsp + 8*4 + 16*5]
+ movdqa xmm12, [rsp + 8*4 + 16*6]
+ movdqa xmm13, [rsp + 8*4 + 16*7]
+ movdqa xmm14, [rsp + 8*4 + 16*8]
+ movdqa xmm15, [rsp + 8*4 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x67452301
+H1: dd 0xefcdab89
+H2: dd 0x98badcfe
+H3: dd 0x10325476
+H4: dd 0xc3d2e1f0
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm
new file mode 100644
index 000000000..979324de4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm
@@ -0,0 +1,290 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha1_mb_x4_sse
+extern sha1_ni_x2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%else
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rdx
+%define idx rdx
+
+%define size_offset rcx
+%define tmp2 rcx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*6 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_sse_ni(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_sse_ni, function
+sha1_mb_mgr_submit_sse_ni:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ movdqa [rsp + 8*4 + 16*0], xmm6
+ movdqa [rsp + 8*4 + 16*1], xmm7
+ movdqa [rsp + 8*4 + 16*2], xmm8
+ movdqa [rsp + 8*4 + 16*3], xmm9
+ movdqa [rsp + 8*4 + 16*4], xmm10
+ movdqa [rsp + 8*4 + 16*5], xmm11
+ movdqa [rsp + 8*4 + 16*6], xmm12
+ movdqa [rsp + 8*4 + 16*7], xmm13
+ movdqa [rsp + 8*4 + 16*8], xmm14
+ movdqa [rsp + 8*4 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+
+ cmp unused_lanes, 0xF32 ; we will process two jobs at the same time
+ jne return_null ; wait for another sha_ni job
+
+ ; compare with shani-sb threshold, if num_lanes_sse <= threshold, using shani func
+ %if SHA1_NI_SB_THRESHOLD_SSE >= 4 ; there are 4 lanes in sse mb
+ ; shani glue code
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+ ; lensN-len2=idx
+ sub lens0, len2
+ sub lens1, len2
+
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_ni_x2
+ ; state and idx are intact
+
+ %else
+ ; original mb code
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+ %endif
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 8*4 + 16*0]
+ movdqa xmm7, [rsp + 8*4 + 16*1]
+ movdqa xmm8, [rsp + 8*4 + 16*2]
+ movdqa xmm9, [rsp + 8*4 + 16*3]
+ movdqa xmm10, [rsp + 8*4 + 16*4]
+ movdqa xmm11, [rsp + 8*4 + 16*5]
+ movdqa xmm12, [rsp + 8*4 + 16*6]
+ movdqa xmm13, [rsp + 8*4 + 16*7]
+ movdqa xmm14, [rsp + 8*4 + 16*8]
+ movdqa xmm15, [rsp + 8*4 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x67452301
+H1: dd 0xefcdab89
+H2: dd 0x98badcfe
+H3: dd 0x10325476
+H4: dd 0xc3d2e1f0
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_submit_sse_ni
+ no_sha1_mb_mgr_submit_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c
new file mode 100644
index 000000000..3925a6f4b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c
@@ -0,0 +1,159 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha1_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_sha1 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ SHA1(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ SHA1(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_ssl rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c
new file mode 100644
index 000000000..4eeeaba0a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c
@@ -0,0 +1,202 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_sha1 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contexts
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sha1_mb test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sha1_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c
new file mode 100644
index 000000000..aaa52a0ff
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c
@@ -0,0 +1,297 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*SHA1_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA1_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS];
+
+extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_sha1_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocte and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_sha1 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ // Add jobs while available or finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha1_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha1_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha1_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d fail %8X <=> %8X",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ // Run sha1_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SHA1_BLOCK_SIZE +
+ SHA1_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SHA1_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem, HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha1_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha1_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SHA1_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha1_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c
new file mode 100644
index 000000000..6261bbf44
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c
@@ -0,0 +1,233 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "endian_helper.h"
+
+typedef uint32_t DigestSHA1[SHA1_DIGEST_NWORDS];
+
+#define MSGS 7
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
+static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+static DigestSHA1 expResultDigest1 =
+ { 0x84983E44, 0x1C3BD26E, 0xBAAE4AA1, 0xF95129E5, 0xE54670F1 };
+
+static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+static DigestSHA1 expResultDigest2 =
+ { 0xB7C66452, 0x0FD122B3, 0x55D539F2, 0xA35E6FAA, 0xC2A5A11D };
+
+static uint8_t msg3[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<";
+static DigestSHA1 expResultDigest3 =
+ { 0x127729B6, 0xA8B2F8A0, 0xA4DDC819, 0x08E1D8B3, 0x67CEEA55 };
+
+static uint8_t msg4[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+static DigestSHA1 expResultDigest4 =
+ { 0xFDDE2D00, 0xABD5B7A3, 0x699DE6F2, 0x3FF1D1AC, 0x3B872AC2 };
+
+static uint8_t msg5[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?";
+static DigestSHA1 expResultDigest5 =
+ { 0xE7FCA85C, 0xA4AB3740, 0x6A180B32, 0x0B8D362C, 0x622A96E6 };
+
+static uint8_t msg6[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+static DigestSHA1 expResultDigest6 =
+ { 0x505B0686, 0xE1ACDF42, 0xB3588B5A, 0xB043D52C, 0x6D8C7444 };
+
+static uint8_t msg7[] = "";
+static DigestSHA1 expResultDigest7 =
+ { 0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709 };
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+
+static uint32_t *expResultDigest[MSGS] = {
+ expResultDigest1, expResultDigest2, expResultDigest3,
+ expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7
+};
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i], msgs[i],
+ strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = sha1_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sha1_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sha1 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..bd8e5e527
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c
@@ -0,0 +1,128 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha1_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha1_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA1(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha1_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha1" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sha1 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c
new file mode 100644
index 000000000..0b4438d53
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c
@@ -0,0 +1,132 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha1_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS SHA1_MAX_LANES
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ uint32_t nlanes;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha1_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA1(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha1_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb shortage tests
+ for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) {
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < nlanes; i++)
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN,
+ HASH_ENTIRE);
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha1" TEST_TYPE_STR " with %d lanes: ", nlanes);
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < nlanes; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ }
+
+ printf("Multi-buffer sha1 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm
new file mode 100644
index 000000000..d64ffe2bd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm
@@ -0,0 +1,563 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute oct SHA1 using AVX-512
+;; outer calling routine takes care of save and restore of XMM registers
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers zmm0-31
+
+%define APPEND(a,b) a %+ b
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+
+%define IDX var1
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define KT zmm5
+%define AA zmm6
+%define BB zmm7
+%define CC zmm8
+%define DD zmm9
+%define EE zmm10
+%define TMP0 zmm11
+%define TMP1 zmm12
+%define TMP2 zmm13
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
+
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+ vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 113 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+ vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 114 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+ vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 115 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+ vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 112 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+ ; use r10 in place of t0
+ vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 a1 m0}
+ vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 a3 m2}
+ vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 013 012 p9 p8 09 08 p5 p4 05 04 p1 p0 01 00}
+ vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 015 014 p11 p10 011 010 p7 p6 07 06 p3 p2 03 02}
+
+ vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 013 n13 m13 p9 09 n9 m9 p5 05 n5 m5 p1 01 n1 m1}
+ vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 014 n14 m14 p10 010 n10 m10 p6 06 n6 m6 p2 02 n2 m2}
+ vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 015 n15 m15 p11 011 n11 m11 p7 07 n7 m7 p3 03 n3 m3}
+ vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 012 n12 m12 p8 08 n8 m8 p4 04 n4 m4 p0 00 n0 m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
+
+ vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
+
+ vmovdqa32 TMP1, B ; Copy B
+ vpaddd E, E, %%WT ; E = E + Wt
+ vpternlogd TMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ vprold TMP0, A, 5 ; TMP0 = ROTL_5(A)
+ vpaddd E, E, TMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ vprold B, B, 30 ; B = ROTL_30(B)
+ vpaddd E, E, TMP0 ; E = T
+
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp2 %2
+%define %%WTp8 %3
+%define %%WTp13 %4
+ ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
+ ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
+ vpternlogd %%WT, %%WTp2, %%WTp8, 0x96
+ vpxord %%WT, %%WT, %%WTp13
+ vprold %%WT, %%WT, 1
+%endmacro
+
+; Note this is reading in a block of data for one lane
+; When all 16 are read, the data must be transposed to build msg schedule
+%macro MSG_SCHED_ROUND_00_15 2
+%define %%WT %1
+%define %%OFFSET %2
+ mov inp0, [IN + (%%OFFSET*8)]
+ vmovups %%WT, [inp0+IDX]
+%endmacro
+
+align 64
+
+; void sha1_mb_x16_avx512(SHA1_MB_ARGS_X16, uint32_t size)
+; arg 1 : pointer to input data
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sha1_mb_x16_avx512)
+sha1_mb_x16_avx512:
+ endbranch
+
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*64]
+ vmovups B, [DIGEST + 1*64]
+ vmovups C, [DIGEST + 2*64]
+ vmovups D, [DIGEST + 3*64]
+ vmovups E, [DIGEST + 4*64]
+
+ xor IDX, IDX
+
+ ;; transpose input onto stack
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+ vmovups W0,[inp0+IDX]
+ vmovups W1,[inp1+IDX]
+ vmovups W2,[inp2+IDX]
+ vmovups W3,[inp3+IDX]
+ vmovups W4,[inp4+IDX]
+ vmovups W5,[inp5+IDX]
+ vmovups W6,[inp6+IDX]
+ vmovups W7,[inp7+IDX]
+
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+
+ vmovups W8, [inp0+IDX]
+ vmovups W9, [inp1+IDX]
+ vmovups W10,[inp2+IDX]
+ vmovups W11,[inp3+IDX]
+ vmovups W12,[inp4+IDX]
+ vmovups W13,[inp5+IDX]
+ vmovups W14,[inp6+IDX]
+ vmovups W15,[inp7+IDX]
+
+lloop:
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ add IDX, 64
+
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+
+%assign I 0
+%rep 16
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; Save digests for later addition
+ vmovdqa32 AA, A
+ vmovdqa32 BB, B
+ vmovdqa32 CC, C
+ vmovdqa32 DD, D
+ vmovdqa32 EE, E
+
+ vmovdqa32 KT, [K00_19]
+%assign I 0xCA
+%assign J 0
+%assign K 2
+%assign L 8
+%assign M 13
+%assign N 0
+%rep 64
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %if N = 19
+ vmovdqa32 KT, [K20_39]
+ %assign I 0x96
+ %elif N = 39
+ vmovdqa32 KT, [K40_59]
+ %assign I 0xE8
+ %elif N = 59
+ vmovdqa32 KT, [K60_79]
+ %assign I 0x96
+ %endif
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%assign N (N+1)
+%endrep
+
+ ; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+%assign I 0x96
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_00_15 APPEND(W,J), J
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ jmp lloop
+
+lastLoop:
+; Need to reset argument rotation values to Round 64 values
+%xdefine TMP_ A
+%xdefine A B
+%xdefine B C
+%xdefine C D
+%xdefine D E
+%xdefine E TMP_
+
+ ; Process last 16 rounds
+%assign I 0x96
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ ;; update into data pointers
+%assign I 0
+%rep 8
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ ; Write out digest
+ ; Do we need to untranspose digests???
+ vmovups [DIGEST + 0*64], A
+ vmovups [DIGEST + 1*64], B
+ vmovups [DIGEST + 2*64], C
+ vmovups [DIGEST + 3*64], D
+ vmovups [DIGEST + 4*64], E
+
+ ret
+
+section .data
+align 64
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_x16_avx512
+no_sha1_mb_x16_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm
new file mode 100644
index 000000000..eb67309da
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm
@@ -0,0 +1,416 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA1 using AVX
+;; derived from ...\sha1_multiple\sha1_quad4.asm
+;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = ((B & C) | ((~ B) & D) )
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpand %%regF, %%regB,%%regC
+ vpandn %%regT, %%regB,%%regD
+ vpor %%regF, %%regT,%%regF
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-(%%imm))
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-(%%imm))
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[rsp + (%%memW * 16)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define XMM_SAVE ((15-15)*16 + 1*8)
+%define FRAMESZ 16*16 + XMM_SAVE
+%define _XMM FRAMESZ - XMM_SAVE
+
+%define VMOVPS vmovups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define IDX rax
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+%define DIGEST_SIZE (4*5*4)
+
+;%ifdef LINUX
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define ARG1 rdi
+ %define ARG2 rsi
+%else
+ ; Windows
+ %define ARG1 rcx
+ %define ARG2 rdx
+%endif
+
+align 32
+
+; void sha1_mb_x4_avx(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks);
+; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used)
+; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1
+;
+; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15
+;
+mk_global sha1_mb_x4_avx, function, internal
+sha1_mb_x4_avx:
+ endbranch
+
+ sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8
+
+ ;; Initialize digests
+ vmovdqa A, [ARG1 + 0*16]
+ vmovdqa B, [ARG1 + 1*16]
+ vmovdqa C, [ARG1 + 2*16]
+ vmovdqa D, [ARG1 + 3*16]
+ vmovdqa E, [ARG1 + 4*16]
+
+ ;; load input pointers
+ mov inp0,[ARG1 + _data_ptr + 0*8]
+ mov inp1,[ARG1 + _data_ptr + 1*8]
+ mov inp2,[ARG1 + _data_ptr + 2*8]
+ mov inp3,[ARG1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 4
+ VMOVPS T2,[inp0+IDX]
+ VMOVPS T1,[inp1+IDX]
+ VMOVPS T4,[inp2+IDX]
+ VMOVPS T3,[inp3+IDX]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vpshufb T0, T0, F
+ vmovdqa [rsp+(I*4+0)*16],T0
+ vpshufb T1, T1, F
+ vmovdqa [rsp+(I*4+1)*16],T1
+ vpshufb T2, T2, F
+ vmovdqa [rsp+(I*4+2)*16],T2
+ vpshufb T3, T3, F
+ vmovdqa [rsp+(I*4+3)*16],T3
+ add IDX, 4*4
+%assign I (I+1)
+%endrep
+
+ ; save old digests
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [rsp + ((16 - 16) & 15) * 16]
+ vmovdqa W15, [rsp + ((16 - 15) & 15) * 16]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ sub ARG2, 1
+ jne lloop
+
+ ; write out digests
+ vmovdqa [ARG1 + 0*16], A
+ vmovdqa [ARG1 + 1*16], B
+ vmovdqa [ARG1 + 2*16], C
+ vmovdqa [ARG1 + 3*16], D
+ vmovdqa [ARG1 + 4*16], E
+
+ ; update input pointers
+ add inp0, IDX
+ mov [ARG1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [ARG1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [ARG1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [ARG1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+
+ ret
+
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm
new file mode 100644
index 000000000..5677dce73
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm
@@ -0,0 +1,413 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA1 using SSE
+;; derived from ...\sha1_multiple\sha1_quad4.asm
+;; variation of sha1_mult2.asm
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0}
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0}
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0}
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2}
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regC
+ pxor %%regF,%%regD
+ pand %%regF,%%regB
+ pxor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regD
+ pxor %%regF,%%regC
+ pxor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regB
+ movdqa %%regT,%%regB
+ por %%regF,%%regC
+ pand %%regT,%%regC
+ pand %%regF,%%regD
+ por %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ pslld %%reg, %%imm
+ psrld %%tmp, (32-%%imm)
+ por %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ paddd %%regE,%%immCNT
+ paddd %%regE,[rsp + (%%memW * 16)]
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ paddd %%regE,%%immCNT
+ movdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [rsp + ((%%memW - 8) & 15) * 16]
+ pxor W16, [rsp + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+ ROTATE_W
+
+ movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define XMM_SAVE ((15-15)*16 + 1*8)
+%define FRAMESZ 16*16 + XMM_SAVE
+%define _XMM FRAMESZ - XMM_SAVE
+
+%define MOVPS movups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define IDX rax
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+%define DIGEST_SIZE (4*5*4)
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define ARG1 rdi
+ %define ARG2 rsi
+%else
+ ; Windows
+ %define ARG1 rcx
+ %define ARG2 rdx
+%endif
+
+align 32
+
+; void sha1_mb_x4_sse(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks);
+; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used)
+; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1
+;
+; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15
+;
+mk_global sha1_mb_x4_sse, function, internal
+sha1_mb_x4_sse:
+ endbranch
+
+ sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8
+
+ ;; Initialize digests
+ movdqa A, [ARG1 + 0*16]
+ movdqa B, [ARG1 + 1*16]
+ movdqa C, [ARG1 + 2*16]
+ movdqa D, [ARG1 + 3*16]
+ movdqa E, [ARG1 + 4*16]
+
+ ;; load input pointers
+ mov inp0,[ARG1 + _data_ptr + 0*8]
+ mov inp1,[ARG1 + _data_ptr + 1*8]
+ mov inp2,[ARG1 + _data_ptr + 2*8]
+ mov inp3,[ARG1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 4
+ MOVPS T2,[inp0+IDX]
+ MOVPS T1,[inp1+IDX]
+ MOVPS T4,[inp2+IDX]
+ MOVPS T3,[inp3+IDX]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ pshufb T0, F
+ movdqa [rsp+(I*4+0)*16],T0
+ pshufb T1, F
+ movdqa [rsp+(I*4+1)*16],T1
+ pshufb T2, F
+ movdqa [rsp+(I*4+2)*16],T2
+ pshufb T3, F
+ movdqa [rsp+(I*4+3)*16],T3
+ add IDX, 4*4
+%assign I (I+1)
+%endrep
+
+ ; save old digests
+ movdqa AA, A
+ movdqa BB, B
+ movdqa CC, C
+ movdqa DD, D
+ movdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ movdqa K, [K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ movdqa W16, [rsp + ((16 - 16) & 15) * 16]
+ movdqa W15, [rsp + ((16 - 15) & 15) * 16]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ movdqa K, [K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ movdqa K, [K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ movdqa K, [K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ paddd A,AA
+ paddd B,BB
+ paddd C,CC
+ paddd D,DD
+ paddd E,EE
+
+ sub ARG2, 1
+ jne lloop
+
+ ; write out digests
+ movdqa [ARG1 + 0*16], A
+ movdqa [ARG1 + 1*16], B
+ movdqa [ARG1 + 2*16], C
+ movdqa [ARG1 + 3*16], D
+ movdqa [ARG1 + 4*16], E
+
+ ; update input pointers
+ add inp0, IDX
+ mov [ARG1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [ARG1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [ARG1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [ARG1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+
+ ret
+
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm
new file mode 100644
index 000000000..edcba6d3f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm
@@ -0,0 +1,518 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute oct SHA1 using SSE-256
+;; outer calling routine takes care of save and restore of XMM registers
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
+
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; F0 = ((B & C) | ((~B) & D))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpand %%regF, %%regB,%%regC
+ vpandn %%regT, %%regB,%%regD
+ vpor %%regF, %%regT,%%regF
+%endmacro
+
+;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+
+
+;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-%%imm)
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[rsp + (%%memW * 32)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqu W14, [rsp + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 32]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqu [rsp + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define YMM_SAVE (15-15)*32
+%define FRAMESZ 32*16 + 0*8 + YMM_SAVE
+%define _YMM FRAMESZ - YMM_SAVE
+
+%define VMOVPS vmovups
+
+%define IDX rax
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define inp7 rcx
+ %define arg1 rdi
+ %define arg2 rsi
+ %define RSP_SAVE rdx
+%else
+ %define inp7 rdi
+ %define arg1 rcx
+ %define arg2 rdx
+ %define RSP_SAVE rsi
+%endif
+
+
+; ymm0 A
+; ymm1 B
+; ymm2 C
+; ymm3 D
+; ymm4 E
+; ymm5 F AA
+; ymm6 T0 BB
+; ymm7 T1 CC
+; ymm8 T2 DD
+; ymm9 T3 EE
+; ymm10 T4 TMP
+; ymm11 T5 FUN
+; ymm12 T6 K
+; ymm13 T7 W14
+; ymm14 T8 W15
+; ymm15 T9 W16
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+
+%define F ymm5
+%define T0 ymm6
+%define T1 ymm7
+%define T2 ymm8
+%define T3 ymm9
+%define T4 ymm10
+%define T5 ymm11
+%define T6 ymm12
+%define T7 ymm13
+%define T8 ymm14
+%define T9 ymm15
+
+%define AA ymm5
+%define BB ymm6
+%define CC ymm7
+%define DD ymm8
+%define EE ymm9
+%define TMP ymm10
+%define FUN ymm11
+%define K ymm12
+%define W14 ymm13
+%define W15 ymm14
+%define W16 ymm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+%define DIGEST_SIZE (8*5*4) ; 8 streams x 5 32bit words per digest x 4 bytes per word
+
+align 32
+
+; void sha1_x8_avx2(SHA1_MB_ARGS_X8, uint32_t size)
+; arg 1 : pointer to input data
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+mk_global sha1_mb_x8_avx2, function, internal
+sha1_mb_x8_avx2:
+ endbranch
+
+ push RSP_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+ sub rsp, FRAMESZ ;; FRAMESZ + pushes must be even multiple of 8
+
+ ; align rsp to 32 Bytes
+ and rsp, ~0x1F
+
+ ;; Initialize digests
+ vmovdqu A, [arg1 + 0*32]
+ vmovdqu B, [arg1 + 1*32]
+ vmovdqu C, [arg1 + 2*32]
+ vmovdqu D, [arg1 + 3*32]
+ vmovdqu E, [arg1 + 4*32]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1+_data_ptr+0*8]
+ mov inp1,[arg1+_data_ptr+1*8]
+ mov inp2,[arg1+_data_ptr+2*8]
+ mov inp3,[arg1+_data_ptr+3*8]
+ mov inp4,[arg1+_data_ptr+4*8]
+ mov inp5,[arg1+_data_ptr+5*8]
+ mov inp6,[arg1+_data_ptr+6*8]
+ mov inp7,[arg1+_data_ptr+7*8]
+
+ xor IDX, IDX
+lloop:
+ vmovdqu F, [PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 2
+ VMOVPS T0,[inp0+IDX]
+ VMOVPS T1,[inp1+IDX]
+ VMOVPS T2,[inp2+IDX]
+ VMOVPS T3,[inp3+IDX]
+ VMOVPS T4,[inp4+IDX]
+ VMOVPS T5,[inp5+IDX]
+ VMOVPS T6,[inp6+IDX]
+ VMOVPS T7,[inp7+IDX]
+ TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ vpshufb T0, T0, F
+ vmovdqu [rsp+(I*8+0)*32],T0
+ vpshufb T1, T1, F
+ vmovdqu [rsp+(I*8+1)*32],T1
+ vpshufb T2, T2, F
+ vmovdqu [rsp+(I*8+2)*32],T2
+ vpshufb T3, T3, F
+ vmovdqu [rsp+(I*8+3)*32],T3
+ vpshufb T4, T4, F
+ vmovdqu [rsp+(I*8+4)*32],T4
+ vpshufb T5, T5, F
+ vmovdqu [rsp+(I*8+5)*32],T5
+ vpshufb T6, T6, F
+ vmovdqu [rsp+(I*8+6)*32],T6
+ vpshufb T7, T7, F
+ vmovdqu [rsp+(I*8+7)*32],T7
+ add IDX, 32
+%assign I (I+1)
+%endrep
+
+
+ ; save old digests
+ vmovdqu AA, A
+ vmovdqu BB, B
+ vmovdqu CC, C
+ vmovdqu DD, D
+ vmovdqu EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ vmovdqu K, [K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqu W16, [rsp + ((16 - 16) & 15) * 32]
+ vmovdqu W15, [rsp + ((16 - 15) & 15) * 32]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ vmovdqu K, [K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ vmovdqu K, [K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ vmovdqu K, [K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ sub arg2, 1
+ jne lloop
+
+ ; write out digests
+ vmovdqu [arg1 + 0*32], A
+ vmovdqu [arg1 + 1*32], B
+ vmovdqu [arg1 + 2*32], C
+ vmovdqu [arg1 + 3*32], D
+ vmovdqu [arg1 + 4*32], E
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1+_data_ptr+0*8], inp0
+ mov [arg1+_data_ptr+1*8], inp1
+ mov [arg1+_data_ptr+2*8], inp2
+ mov [arg1+_data_ptr+3*8], inp3
+ mov [arg1+_data_ptr+4*8], inp4
+ mov [arg1+_data_ptr+5*8], inp5
+ mov [arg1+_data_ptr+6*8], inp6
+ mov [arg1+_data_ptr+7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ mov rsp, RSP_SAVE
+
+ pop RSP_SAVE
+ ret
+
+
+
+section .data align=32
+
+align 32
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c
new file mode 100644
index 000000000..e778c5d98
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c
@@ -0,0 +1,112 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "test.h"
+
+// Test messages
+#define TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+uint8_t msg3[] = TST_STR TST_STR "0123456789:;<";
+uint8_t msg4[] = TST_STR TST_STR TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+uint8_t msg5[] = TST_STR TST_STR TST_STR TST_STR TST_STR "0123456789:;<=>?";
+uint8_t msg6[] =
+ TST_STR TST_STR TST_STR TST_STR TST_STR TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+uint8_t msg7[] = "";
+
+// Expected digests
+uint32_t dgst1[] = { 0x84983E44, 0x1C3BD26E, 0xBAAE4AA1, 0xF95129E5, 0xE54670F1 };
+uint32_t dgst2[] = { 0xB7C66452, 0x0FD122B3, 0x55D539F2, 0xA35E6FAA, 0xC2A5A11D };
+uint32_t dgst3[] = { 0x127729B6, 0xA8B2F8A0, 0xA4DDC819, 0x08E1D8B3, 0x67CEEA55 };
+uint32_t dgst4[] = { 0xFDDE2D00, 0xABD5B7A3, 0x699DE6F2, 0x3FF1D1AC, 0x3B872AC2 };
+uint32_t dgst5[] = { 0xE7FCA85C, 0xA4AB3740, 0x6A180B32, 0x0B8D362C, 0x622A96E6 };
+uint32_t dgst6[] = { 0x505B0686, 0xE1ACDF42, 0xB3588B5A, 0xB043D52C, 0x6D8C7444 };
+uint32_t dgst7[] = { 0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709 };
+
+uint8_t *msgs[] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+uint32_t *expected_digest[] = { dgst1, dgst2, dgst3, dgst4, dgst5, dgst6, dgst7 };
+
+int check_job(uint32_t * ref, uint32_t * good, int words)
+{
+ int i;
+ for (i = 0; i < words; i++)
+ if (good[i] != ref[i])
+ return 1;
+
+ return 0;
+}
+
+#define MAX_MSGS 7
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[MAX_MSGS];
+ SHA1_HASH_CTX *p_job;
+ int i, checked = 0, failed = 0;
+ int n = sizeof(msgs) / sizeof(msgs[0]);
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+ // Initialize multi-buffer manager
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < n; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)expected_digest[i];
+
+ p_job = sha1_ctx_mgr_submit(mgr, &ctxpool[i], msgs[i],
+ strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (p_job) { // If we have finished a job, process it
+ checked++;
+ failed +=
+ check_job(p_job->job.result_digest, p_job->user_data,
+ SHA1_DIGEST_NWORDS);
+ }
+ }
+
+ // Finish remaining jobs
+ while (NULL != (p_job = sha1_ctx_mgr_flush(mgr))) {
+ checked++;
+ failed +=
+ check_job(p_job->job.result_digest, p_job->user_data, SHA1_DIGEST_NWORDS);
+ }
+
+ printf("Example multi-buffer sha1 completed=%d, failed=%d\n", checked, failed);
+ return failed;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm
new file mode 100644
index 000000000..c205f2389
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm
@@ -0,0 +1,131 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern sha1_ctx_mgr_init_sse
+extern sha1_ctx_mgr_submit_sse
+extern sha1_ctx_mgr_flush_sse
+
+extern sha1_ctx_mgr_init_avx
+extern sha1_ctx_mgr_submit_avx
+extern sha1_ctx_mgr_flush_avx
+
+extern sha1_ctx_mgr_init_avx2
+extern sha1_ctx_mgr_submit_avx2
+extern sha1_ctx_mgr_flush_avx2
+
+extern sha1_ctx_mgr_init_base
+extern sha1_ctx_mgr_submit_base
+extern sha1_ctx_mgr_flush_base
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sha1_ctx_mgr_init_avx512
+ extern sha1_ctx_mgr_submit_avx512
+ extern sha1_ctx_mgr_flush_avx512
+%endif
+
+%ifdef HAVE_AS_KNOWS_SHANI
+ extern sha1_ctx_mgr_init_sse_ni
+ extern sha1_ctx_mgr_submit_sse_ni
+ extern sha1_ctx_mgr_flush_sse_ni
+%endif
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+ extern sha1_ctx_mgr_init_avx512_ni
+ extern sha1_ctx_mgr_submit_avx512_ni
+ extern sha1_ctx_mgr_flush_avx512_ni
+ %endif
+%endif
+
+;;; *_mbinit are initial values for *_dispatched; is updated on first call.
+;;; Therefore, *_dispatch_init is only executed on first call.
+
+; Initialise symbols
+mbin_interface sha1_ctx_mgr_init
+mbin_interface sha1_ctx_mgr_submit
+mbin_interface sha1_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ ; Reuse mbin_dispatch_init6's extension through replacing base by sse version
+ %ifdef HAVE_AS_KNOWS_SHANI
+ mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_init, sha1_ctx_mgr_init_base, \
+ sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, \
+ sha1_ctx_mgr_init_avx512, sha1_ctx_mgr_init_sse_ni, sha1_ctx_mgr_init_avx512_ni
+ mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_base, \
+ sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, \
+ sha1_ctx_mgr_submit_avx512, sha1_ctx_mgr_submit_sse_ni, sha1_ctx_mgr_submit_avx512_ni
+ mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_base, \
+ sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, \
+ sha1_ctx_mgr_flush_avx512, sha1_ctx_mgr_flush_sse_ni, sha1_ctx_mgr_flush_avx512_ni
+ %else
+ mbin_dispatch_init6 sha1_ctx_mgr_init, sha1_ctx_mgr_init_base, \
+ sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, \
+ sha1_ctx_mgr_init_avx512
+ mbin_dispatch_init6 sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_base, \
+ sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, \
+ sha1_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_base, \
+ sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, \
+ sha1_ctx_mgr_flush_avx512
+ %endif
+%else
+ %ifdef HAVE_AS_KNOWS_SHANI
+ mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_init, sha1_ctx_mgr_init_sse, \
+ sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, sha1_ctx_mgr_init_sse_ni
+ mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_sse, \
+ sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, sha1_ctx_mgr_submit_sse_ni
+ mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_sse, \
+ sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, sha1_ctx_mgr_flush_sse_ni
+ %else
+ mbin_dispatch_init sha1_ctx_mgr_init, sha1_ctx_mgr_init_sse, \
+ sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2
+ mbin_dispatch_init sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_sse, \
+ sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2
+ mbin_dispatch_init sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_sse, \
+ sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2
+ %endif
+%endif
+
+;;; func core, ver, snum
+slversion sha1_ctx_mgr_init, 00, 04, 0148
+slversion sha1_ctx_mgr_submit, 00, 04, 0149
+slversion sha1_ctx_mgr_flush, 00, 04, 0150
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm
new file mode 100644
index 000000000..86d09e303
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm
@@ -0,0 +1,318 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define FRAMESZ 32 ; space for ABCDE
+%define RSPSAVE rax
+
+%define ABCD xmm0
+; two E's b/c for ping-pong
+%define E0 xmm1
+%define E1 xmm2
+%define MSG0 xmm3
+%define MSG1 xmm4
+%define MSG2 xmm5
+%define MSG3 xmm6
+%define SHUF_MASK xmm7
+
+; arg index is start from 0 while mgr_flush/submit is from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR r11 ; local variable -- input buffer pointer
+%define TMP r9 ; local variable -- assistant to address digest
+;%define TMP2 r8 ; local variable -- assistant to address digest
+align 32
+
+; void sha1_ni_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisibile arg 2 : IDX : hash on which lane
+; invisibile arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r11, xmm0-xmm7
+;
+mk_global sha1_ni_x1, function, internal
+sha1_ni_x1:
+ endbranch
+ mov RSPSAVE, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~0xF ; Align 16Bytes downward
+
+ shl NBLK, 6 ; transform blk amount into bytes
+ jz backto_mgr
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ lea TMP, [MGR + 4*IDX]
+ ;; Initialize digest
+ pinsrd ABCD, [TMP + 0*NLANX4], 3
+ pinsrd ABCD, [TMP + 1*NLANX4], 2
+ pinsrd ABCD, [TMP + 2*NLANX4], 1
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd ABCD, [TMP + 1*NLANX4], 0
+ pinsrd E0, [TMP + 2*NLANX4], 3
+ pand E0, [IDX3_WORD_MASK]
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr + IDX*8]
+ ;; nblk is used to indicate data end
+ add NBLK, DPTR
+
+lloop:
+ ; Save hash values for addition after rounds
+ movdqa [rsp + 0*16], E0
+ movdqa [rsp + 1*16], ABCD
+
+ ; do rounds 0-3
+ movdqu MSG0, [DPTR + 0*16]
+ pshufb MSG0, SHUF_MASK
+ paddd E0, MSG0
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+
+ ; do rounds 4-7
+ movdqu MSG1, [DPTR + 1*16]
+ pshufb MSG1, SHUF_MASK
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG0, MSG1
+
+ ; do rounds 8-11
+ movdqu MSG2, [DPTR + 2*16]
+ pshufb MSG2, SHUF_MASK
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ ; do rounds 12-15
+ movdqu MSG3, [DPTR + 3*16]
+ pshufb MSG3, SHUF_MASK
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ ; do rounds 16-19
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ ; do rounds 20-23
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ ; do rounds 24-27
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ ; do rounds 28-31
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ ; do rounds 32-35
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ ; do rounds 36-39
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ ; do rounds 40-43
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ ; do rounds 44-47
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ ; do rounds 48-51
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ ; do rounds 52-55
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ ; do rounds 56-59
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ ; do rounds 60-63
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 3
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ ; do rounds 64-67
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 3
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ ; do rounds 68-71
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 3
+ pxor MSG3, MSG1
+
+ ; do rounds 72-75
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 3
+
+ ; do rounds 76-79
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 3
+
+ ; Add current hash values with previously saved
+ sha1nexte E0, [rsp + 0*16]
+ paddd ABCD, [rsp + 1*16]
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + 4*IDX]
+ pextrd [TMP + 0*NLANX4], ABCD, 3
+ pextrd [TMP + 1*NLANX4], ABCD, 2
+ pextrd [TMP + 2*NLANX4], ABCD, 1
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], ABCD, 0
+ pextrd [TMP + 2*NLANX4], E0, 3
+
+ ; update input pointers
+ mov [MGR + _data_ptr + IDX*8], DPTR
+
+backto_mgr:
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ mov rsp, RSPSAVE
+
+ ret
+
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x08090a0b0c0d0e0f, 0x0001020304050607
+IDX3_WORD_MASK: dq 0x0000000000000000, 0xFFFFFFFF00000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_ni_x1
+no_sha1_ni_x1:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm
new file mode 100644
index 000000000..7b0ddb74e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm
@@ -0,0 +1,484 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define FRAMESZ 64 ; space for ABCDE
+%define RSPSAVE rax
+
+%define ABCD xmm0
+; two E's b/c for ping-pong
+%define E0 xmm1
+%define E1 xmm2
+%define MSG0 xmm3
+%define MSG1 xmm4
+%define MSG2 xmm5
+%define MSG3 xmm6
+
+%define ABCDb xmm7
+%define E0b xmm8 ; Need two E's b/c they ping pong
+%define E1b xmm9
+%define MSG0b xmm10
+%define MSG1b xmm11
+%define MSG2b xmm12
+%define MSG3b xmm13
+
+%define SHUF_MASK xmm14
+
+; arg index is start from 0 while mgr_flush/submit is from 1
+%define MGR arg0
+
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR r11 ; local variable -- input buffer pointer
+%define DPTRb r12 ;
+%define TMP r9 ; local variable -- assistant to address digest
+%define TMPb r13 ; local variable -- assistant to address digest
+align 32
+
+; void sha1_ni_x2(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisibile arg 2 : IDX : hash on which lane
+; invisibile arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r13, xmm0-xmm14
+;
+mk_global sha1_ni_x2, function, internal
+sha1_ni_x2:
+ endbranch
+ mov RSPSAVE, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~0xF ; Align 16Bytes downward
+
+ shl NBLK, 6 ; transform blk amount into bytes
+ jz backto_mgr
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ lea TMP, [MGR + _args_digest ];
+ lea TMPb,[MGR + _args_digest + 4*1];
+
+ ;; Initialize digest
+ pinsrd ABCD, [TMP + 0*NLANX4], 3
+ pinsrd ABCD, [TMP + 1*NLANX4], 2
+ pinsrd ABCD, [TMP + 2*NLANX4], 1
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd ABCD, [TMP + 1*NLANX4], 0
+ pinsrd E0, [TMP + 2*NLANX4], 3
+ pand E0, [IDX3_WORD_MASK]
+
+ pinsrd ABCDb, [TMPb + 0*NLANX4], 3
+ pinsrd ABCDb, [TMPb + 1*NLANX4], 2
+ pinsrd ABCDb, [TMPb + 2*NLANX4], 1
+ lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd ABCDb, [TMPb + 1*NLANX4], 0
+ pinsrd E0b, [TMPb + 2*NLANX4], 3
+ pand E0b, [IDX3_WORD_MASK]
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr ]
+ mov DPTRb,[MGR + _data_ptr + 8*1]
+ ;; nblk is used to indicate data end
+ add NBLK, DPTR
+
+lloop:
+ movdqa [rsp + 0*16], E0
+ movdqa [rsp + 1*16], ABCD
+
+ movdqa [rsp + 2*16], E0b
+ movdqa [rsp + 3*16], ABCDb
+
+ ; do rounds 0-3
+ movdqu MSG0, [DPTR + 0*16]
+ pshufb MSG0, SHUF_MASK
+ paddd E0, MSG0
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+
+ movdqu MSG0b, [DPTRb + 0*16]
+ pshufb MSG0b, SHUF_MASK
+ paddd E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1rnds4 ABCDb, E0b, 0
+
+ ; do rounds 4-7
+ movdqu MSG1, [DPTR + 1*16]
+ pshufb MSG1, SHUF_MASK
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG0, MSG1
+
+ movdqu MSG1b, [DPTRb + 1*16]
+ pshufb MSG1b, SHUF_MASK
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1rnds4 ABCDb, E1b, 0
+ sha1msg1 MSG0b, MSG1b
+
+ ; do rounds 8-11
+ movdqu MSG2, [DPTR + 2*16]
+ pshufb MSG2, SHUF_MASK
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ movdqu MSG2b, [DPTRb + 2*16]
+ pshufb MSG2b, SHUF_MASK
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1rnds4 ABCDb, E0b, 0
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 12-15
+ movdqu MSG3, [DPTR + 3*16]
+ pshufb MSG3, SHUF_MASK
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ movdqu MSG3b, [DPTRb + 3*16]
+ pshufb MSG3b, SHUF_MASK
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 0
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 16-19
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 0
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 20-23
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ; do rounds 24-27
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 1
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 28-31
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 32-35
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 1
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 36-39
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ; do rounds 40-43
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 44-47
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 2
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 48-51
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 52-55
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 2
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ; do rounds 56-59
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 60-63
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 3
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 3
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 64-67
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 3
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 3
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 68-71
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 3
+ pxor MSG3, MSG1
+
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 3
+ pxor MSG3b, MSG1b
+
+ ; do rounds 72-75
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 3
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 3
+
+ ; do rounds 76-79
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1rnds4 ABCDb, E1b, 3
+
+ ; Add current hash values with previously saved
+ sha1nexte E0, [rsp + 0*16]
+ paddd ABCD, [rsp + 1*16]
+
+ sha1nexte E0b, [rsp + 2*16]
+ paddd ABCDb, [rsp + 3*16]
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ add DPTRb, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + _args_digest]
+ pextrd [TMP + 0*NLANX4], ABCD, 3
+ pextrd [TMP + 1*NLANX4], ABCD, 2
+ pextrd [TMP + 2*NLANX4], ABCD, 1
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], ABCD, 0
+ pextrd [TMP + 2*NLANX4], E0, 3
+
+ lea TMPb, [MGR +_args_digest + 4*1]
+ pextrd [TMPb + 0*NLANX4], ABCDb, 3
+ pextrd [TMPb + 1*NLANX4], ABCDb, 2
+ pextrd [TMPb + 2*NLANX4], ABCDb, 1
+ lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMPb + 1*NLANX4], ABCDb, 0
+ pextrd [TMPb + 2*NLANX4], E0b, 3
+
+ ; update input pointers
+ mov [MGR + _data_ptr], DPTR
+ mov [MGR + _data_ptr + 8*1], DPTRb
+
+backto_mgr:
+;;;;;;;;;;;;;;;;
+;; Postamble
+
+ mov rsp, RSPSAVE
+
+ ret
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x08090a0b0c0d0e0f, 0x0001020304050607
+IDX3_WORD_MASK: dq 0x0000000000000000, 0xFFFFFFFF00000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_ni_x2
+no_sha1_ni_x2:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm
new file mode 100644
index 000000000..aeb00a008
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm
@@ -0,0 +1,485 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+_GPR_SAVE_SIZE equ 8*9 ;rbx, rdx, rbp, (rdi, rsi), r12~r15
+_WK_SAVE_SIZE equ 16*4
+
+_WK_SAVE equ 0
+_GPR_SAVE equ _WK_SAVE + _WK_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE
+
+; arg index is start from 0 while mgr_flush/submit is from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+; rax~rdx, rsi, rdi, rbp are used for RR
+%define N_MGR r8
+%define IDX r9 ; local variable -- consistent with caller
+%define K_BASE r11
+%define BUFFER_PTR r12
+%define BUFFER_END r13
+%define TMP r14 ; local variable -- assistant to address digest
+
+%xdefine W_TMP xmm0
+%xdefine W_TMP2 xmm9
+
+%xdefine W0 xmm1
+%xdefine W4 xmm2
+%xdefine W8 xmm3
+%xdefine W12 xmm4
+%xdefine W16 xmm5
+%xdefine W20 xmm6
+%xdefine W24 xmm7
+%xdefine W28 xmm8
+
+%xdefine XMM_SHUFB_BSWAP xmm10
+
+;; we keep window of 64 w[i]+K pre-calculated values in a circular buffer
+%xdefine WK(t) (rsp + (t & 15)*4)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Constants
+
+%xdefine K1 0x5a827999
+%xdefine K2 0x6ed9eba1
+%xdefine K3 0x8f1bbcdc
+%xdefine K4 0xca62c1d6
+
+%xdefine W_PRECALC_AHEAD 16
+%xdefine W_NO_TAIL_PRECALC 0
+
+; Rounds macros
+
+%macro REGALLOC 0
+ %xdefine A ecx
+ %xdefine B esi
+ %xdefine C edi
+ %xdefine D ebp
+ %xdefine E edx
+
+ %xdefine T1 eax
+ %xdefine T2 ebx
+%endmacro
+
+%macro F1 3
+ mov T1,%2
+ xor T1,%3
+ and T1,%1
+ xor T1,%3
+%endmacro
+
+%macro F2 3
+ mov T1,%3
+ xor T1,%2
+ xor T1,%1
+%endmacro
+
+%macro F3 3
+ mov T1,%2
+ mov T2,%1
+ or T1,%1
+ and T2,%2
+ and T1,%3
+ or T1,T2
+%endmacro
+
+%define F4 F2
+
+%macro UPDATE_HASH 2
+ add %2, %1
+ mov %1, %2
+%endmacro
+
+
+%macro W_PRECALC 1
+ %xdefine i (%1)
+
+ %if (i < 20)
+ %xdefine K_XMM 0
+ %elif (i < 40)
+ %xdefine K_XMM 16
+ %elif (i < 60)
+ %xdefine K_XMM 32
+ %else
+ %xdefine K_XMM 48
+ %endif
+
+ %if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD)))
+
+ %if (W_NO_TAIL_PRECALC == 0)
+
+ %xdefine i ((%1) % 80) ;; pre-compute for the next iteration
+
+ %if (i == 0)
+ W_PRECALC_RESET
+ %endif
+
+
+ W_PRECALC_00_15
+ %endif
+
+ %elif (i < 32)
+ W_PRECALC_16_31
+ %elif (i < 80) ;; rounds 32-79
+ W_PRECALC_32_79
+ %endif
+%endmacro
+
+%macro W_PRECALC_RESET 0
+ %xdefine W W0
+ %xdefine W_minus_04 W4
+ %xdefine W_minus_08 W8
+ %xdefine W_minus_12 W12
+ %xdefine W_minus_16 W16
+ %xdefine W_minus_20 W20
+ %xdefine W_minus_24 W24
+ %xdefine W_minus_28 W28
+ %xdefine W_minus_32 W
+%endmacro
+
+%macro W_PRECALC_ROTATE 0
+ %xdefine W_minus_32 W_minus_28
+ %xdefine W_minus_28 W_minus_24
+ %xdefine W_minus_24 W_minus_20
+ %xdefine W_minus_20 W_minus_16
+ %xdefine W_minus_16 W_minus_12
+ %xdefine W_minus_12 W_minus_08
+ %xdefine W_minus_08 W_minus_04
+ %xdefine W_minus_04 W
+ %xdefine W W_minus_32
+%endmacro
+
+%macro W_PRECALC_00_15 0
+ ;; message scheduling pre-compute for rounds 0-15
+ %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqu W_TMP, [BUFFER_PTR + (i * 4)]
+ %elif ((i & 3) == 1)
+ pshufb W_TMP, XMM_SHUFB_BSWAP
+ movdqa W, W_TMP
+ %elif ((i & 3) == 2)
+ paddd W_TMP, [K_BASE]
+ %elif ((i & 3) == 3)
+ movdqa [WK(i&~3)], W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+%macro W_PRECALC_16_31 0
+ ;; message scheduling pre-compute for rounds 16-31
+ ;; calculating last 32 w[i] values in 8 XMM registers
+ ;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction
+ ;;
+ ;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency
+ ;;
+ %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqa W, W_minus_12
+ palignr W, W_minus_16, 8 ;; w[i-14]
+ movdqa W_TMP, W_minus_04
+ psrldq W_TMP, 4 ;; w[i-3]
+ pxor W, W_minus_08
+ %elif ((i & 3) == 1)
+ pxor W_TMP, W_minus_16
+ pxor W, W_TMP
+ movdqa W_TMP2, W
+ movdqa W_TMP, W
+ pslldq W_TMP2, 12
+ %elif ((i & 3) == 2)
+ psrld W, 31
+ pslld W_TMP, 1
+ por W_TMP, W
+ movdqa W, W_TMP2
+ psrld W_TMP2, 30
+ pslld W, 2
+ %elif ((i & 3) == 3)
+ pxor W_TMP, W
+ pxor W_TMP, W_TMP2
+ movdqa W, W_TMP
+ paddd W_TMP, [K_BASE + K_XMM]
+ movdqa [WK(i&~3)],W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+%macro W_PRECALC_32_79 0
+ ;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+ ;; instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ ;; allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
+ ;;
+ %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqa W_TMP, W_minus_04
+ pxor W, W_minus_28 ;; W is W_minus_32 before xor
+ palignr W_TMP, W_minus_08, 8
+ %elif ((i & 3) == 1)
+ pxor W, W_minus_16
+ pxor W, W_TMP
+ movdqa W_TMP, W
+ %elif ((i & 3) == 2)
+ psrld W, 30
+ pslld W_TMP, 2
+ por W_TMP, W
+ %elif ((i & 3) == 3)
+ movdqa W, W_TMP
+ paddd W_TMP, [K_BASE + K_XMM]
+ movdqa [WK(i&~3)],W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+%macro RR 6 ;; RR does two rounds of SHA-1 back to back with W pre-calculation
+
+ ;; TEMP = A
+ ;; A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i)
+ ;; C = ROTATE_LEFT( B, 30 )
+ ;; D = C
+ ;; E = D
+ ;; B = TEMP
+
+ W_PRECALC (%6 + W_PRECALC_AHEAD)
+ F %2, %3, %4 ;; F returns result in T1
+ add %5, [WK(%6)]
+ rol %2, 30
+ mov T2, %1
+ add %4, [WK(%6 + 1)]
+ rol T2, 5
+ add %5, T1
+
+ W_PRECALC (%6 + W_PRECALC_AHEAD + 1)
+ add T2, %5
+ mov %5, T2
+ rol T2, 5
+ add %4, T2
+ F %1, %2, %3 ;; F returns result in T1
+ add %4, T1
+ rol %1, 30
+
+;; write: %1, %2
+;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; void sha1_opt_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisibile arg 2 : IDX : hash on which lane
+; invisibile arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: all general regs (except r15), xmm0-xmm10
+; {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack}
+;
+mk_global sha1_opt_x1, function, internal
+sha1_opt_x1:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ mov [rsp + _GPR_SAVE + 8*3], rsi
+ ; caller has already stored XMM6~10
+%endif
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+ mov [rsp + _GPR_SAVE + 8*8], rdx
+
+
+ shl NBLK, 6 ; transform blk amount into bytes
+ jz .lend
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ ;; let sha1_opt sb takes over r8~r11
+ ;; Load input pointers
+ mov N_MGR, MGR
+ mov BUFFER_PTR, [MGR + _data_ptr + IDX*8]
+ ;; nblk is used to indicate data end
+ add NBLK, BUFFER_PTR
+ mov BUFFER_END, NBLK
+
+ lea K_BASE, [K_XMM_AR]
+ movdqu XMM_SHUFB_BSWAP, [bswap_shufb_ctl]
+
+ REGALLOC
+
+ lea TMP, [N_MGR + 4*IDX]
+ ;; Initialize digest
+ mov A, [TMP + 0*NLANX4]
+ mov B, [TMP + 1*NLANX4]
+ mov C, [TMP + 2*NLANX4]
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ mov D, [TMP + 1*NLANX4]
+ mov E, [TMP + 2*NLANX4]
+
+ %assign i 0
+ %rep W_PRECALC_AHEAD
+ W_PRECALC i
+ %assign i i+1
+ %endrep
+
+ %xdefine F F1
+
+.lloop:
+ cmp BUFFER_PTR, K_BASE ;; we use K_BASE value as a signal of a last block,
+ jne .lbegin ;; it is set below by: cmovae BUFFER_PTR, K_BASE
+ jmp .lend
+
+.lbegin:
+ RR A,B,C,D,E,0
+ RR D,E,A,B,C,2
+ RR B,C,D,E,A,4
+ RR E,A,B,C,D,6
+ RR C,D,E,A,B,8
+
+ RR A,B,C,D,E,10
+ RR D,E,A,B,C,12
+ RR B,C,D,E,A,14
+ RR E,A,B,C,D,16
+ RR C,D,E,A,B,18
+
+ %xdefine F F2
+
+ RR A,B,C,D,E,20
+ RR D,E,A,B,C,22
+ RR B,C,D,E,A,24
+ RR E,A,B,C,D,26
+ RR C,D,E,A,B,28
+
+ RR A,B,C,D,E,30
+ RR D,E,A,B,C,32
+ RR B,C,D,E,A,34
+ RR E,A,B,C,D,36
+ RR C,D,E,A,B,38
+
+ %xdefine F F3
+
+ RR A,B,C,D,E,40
+ RR D,E,A,B,C,42
+ RR B,C,D,E,A,44
+ RR E,A,B,C,D,46
+ RR C,D,E,A,B,48
+
+ RR A,B,C,D,E,50
+ RR D,E,A,B,C,52
+ RR B,C,D,E,A,54
+ RR E,A,B,C,D,56
+ RR C,D,E,A,B,58
+
+ %xdefine F F4
+
+ add BUFFER_PTR, 64 ;; move to next 64-byte block
+ cmp BUFFER_PTR, BUFFER_END ;; check if current block is the last one
+ cmovae BUFFER_PTR, K_BASE ;; smart way to signal the last iteration
+
+ RR A,B,C,D,E,60
+ RR D,E,A,B,C,62
+ RR B,C,D,E,A,64
+ RR E,A,B,C,D,66
+ RR C,D,E,A,B,68
+
+ RR A,B,C,D,E,70
+ RR D,E,A,B,C,72
+ RR B,C,D,E,A,74
+ RR E,A,B,C,D,76
+ RR C,D,E,A,B,78
+
+ lea TMP, [N_MGR + 4*IDX]
+ UPDATE_HASH [TMP + 0*NLANX4],A
+ UPDATE_HASH [TMP + 1*NLANX4],B
+ UPDATE_HASH [TMP + 2*NLANX4],C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ UPDATE_HASH [TMP + 1*NLANX4],D
+ UPDATE_HASH [TMP + 2*NLANX4],E
+
+ jmp .lloop
+
+ .lend:
+ mov MGR, N_MGR
+
+ mov rdx, [rsp + _GPR_SAVE + 8*8]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rsi, [rsp + _GPR_SAVE + 8*3]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbp, [rsp + _GPR_SAVE + 8*1]
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+
+;;----------------------
+section .data align=64
+
+align 128
+K_XMM_AR:
+ DD K1, K1, K1, K1
+ DD K2, K2, K2, K2
+ DD K3, K3, K3, K3
+ DD K4, K4, K4, K4
+
+align 16
+bswap_shufb_ctl:
+ DD 00010203h
+ DD 04050607h
+ DD 08090a0bh
+ DD 0c0d0e0fh
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c
new file mode 100644
index 000000000..e82fb30fe
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c
@@ -0,0 +1,220 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha1_mb.h"
+#include "endian_helper.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA1 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define H0 0x67452301
+#define H1 0xefcdab89
+#define H2 0x98badcfe
+#define H3 0x10325476
+#define H4 0xc3d2e1f0
+
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (b ^ c ^ d)
+#define F3(b,c,d) ((b & c) | (d & (b | c)))
+#define F4(b,c,d) (b ^ c ^ d)
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+#define W(x) w[(x) & 15]
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = to_be32(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
+
+static void OPT_FIX sha1_single(const uint8_t * data, uint32_t digest[]);
+
+void sha1_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+ digest[4] = H4;
+
+ i = len;
+ while (i >= SHA1_BLOCK_SIZE) {
+ sha1_single(input_data, digest);
+ input_data += SHA1_BLOCK_SIZE;
+ i -= SHA1_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - SHA1_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - SHA1_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha1_single(buf, digest);
+ if (i == (2 * SHA1_BLOCK_SIZE))
+ sha1_single(buf + SHA1_BLOCK_SIZE, digest);
+}
+
+void sha1_single(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}