path: root/src/crypto/isa-l/isa-l_crypto/mh_sha256
Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/mh_sha256')
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am                            |  88
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c |  49
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S           | 731
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c                 |  53
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S        |  35
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c                            | 143
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c                     |  70
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c               |  40
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm                | 557
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm               | 616
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm             | 682
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c                 | 188
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm                | 557
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c              | 121
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h                   | 318
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm              |  77
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c                       | 180
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c                        | 410
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c                       | 217
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c                | 110
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c                | 240
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c                 | 176
22 files changed, 5658 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am
new file mode 100644
index 000000000..d6e8b61ab
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am
@@ -0,0 +1,88 @@
+########################################################################
+# Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_sha256 = mh_sha256/sha256_for_mh_sha256.c
+
+lsrc_mh_sha256 = mh_sha256/mh_sha256.c \
+ mh_sha256/mh_sha256_block_sse.asm \
+ mh_sha256/mh_sha256_block_avx.asm \
+ mh_sha256/mh_sha256_block_avx2.asm \
+ mh_sha256/mh_sha256_multibinary.asm \
+ mh_sha256/mh_sha256_finalize_base.c \
+ mh_sha256/mh_sha256_update_base.c \
+ mh_sha256/mh_sha256_block_base.c
+
+lsrc_mh_sha256 += mh_sha256/mh_sha256_block_avx512.asm \
+ mh_sha256/mh_sha256_avx512.c
+
+lsrc_x86_64 += $(lsrc_sha256) \
+ $(lsrc_mh_sha256)
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+other_src += mh_sha256/mh_sha256_ref.c \
+ include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/test.h \
+ mh_sha256/mh_sha256_internal.h
+
+lsrc_aarch64 += $(lsrc_sha256) \
+ mh_sha256/aarch64/mh_sha256_multibinary.S \
+ mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c \
+ mh_sha256/aarch64/mh_sha256_block_ce.S \
+ mh_sha256/aarch64/mh_sha256_ce.c \
+ mh_sha256/mh_sha256.c \
+ mh_sha256/mh_sha256_finalize_base.c \
+ mh_sha256/mh_sha256_update_base.c \
+ mh_sha256/mh_sha256_block_base.c
+
+lsrc_base_aliases += $(lsrc_sha256) \
+ mh_sha256/mh_sha256_base_aliases.c \
+ mh_sha256/mh_sha256.c \
+ mh_sha256/mh_sha256_finalize_base.c \
+ mh_sha256/mh_sha256_update_base.c \
+ mh_sha256/mh_sha256_block_base.c
+
+src_include += -I $(srcdir)/mh_sha256
+
+extern_hdrs += include/mh_sha256.h
+
+check_tests += mh_sha256/mh_sha256_test
+unit_tests += mh_sha256/mh_sha256_update_test
+
+perf_tests += mh_sha256/mh_sha256_perf
+
+
+mh_sha256_test: mh_sha256_ref.o
+mh_sha256_mh_sha256_test_LDADD = mh_sha256/mh_sha256_ref.lo libisal_crypto.la
+
+mh_sha256_update_test: mh_sha256_ref.o
+mh_sha256_mh_sha256_update_test_LDADD = mh_sha256/mh_sha256_ref.lo libisal_crypto.la
+
+mh_sha256_mh_sha256_perf_LDADD = libisal_crypto.la
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c
new file mode 100644
index 000000000..155790fc1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c
@@ -0,0 +1,49 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha256_update)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(mh_sha256_update_ce);
+
+ return PROVIDER_BASIC(mh_sha256_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha256_finalize)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(mh_sha256_finalize_ce);
+
+ return PROVIDER_BASIC(mh_sha256_finalize);
+
+}
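
[annotation] The dispatcher above selects an implementation at run time from the AArch64
hardware-capability bits that the kernel exposes through getauxval(3). For readers outside
the isa-l macro framework, the following is a minimal standalone C sketch of the same probe;
the kernel names update_ce/update_base are hypothetical stand-ins, not the library's symbols:

    #include <sys/auxv.h>   /* getauxval, AT_HWCAP */
    #include <asm/hwcap.h>  /* HWCAP_SHA2 (AArch64 Linux) */

    typedef int (*update_fn)(void *ctx, const void *buf, unsigned int len);

    extern int update_ce(void *ctx, const void *buf, unsigned int len);   /* crypto-ext path (assumed) */
    extern int update_base(void *ctx, const void *buf, unsigned int len); /* portable path (assumed) */

    static update_fn resolve_update(void)
    {
        /* Use the SHA2 crypto-extension kernel only when the kernel
         * reports the feature; otherwise fall back to the base code. */
        return (getauxval(AT_HWCAP) & HWCAP_SHA2) ? update_ce : update_base;
    }
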
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S
new file mode 100644
index 000000000..53a78ea7d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S
@@ -0,0 +1,731 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 6
+
+ .global mh_sha256_block_ce
+ .type mh_sha256_block_ce, %function
+
+/*
+Macros
+*/
+
+.macro declare_vector_reg name:req,reg:req,default:req
+ \name .req \default\reg
+ q_\name .req q\reg
+ v_\name .req v\reg
+ s_\name .req s\reg
+.endm
+
+declare_vector_reg lane0_msg0, 0,v
+declare_vector_reg lane1_msg0, 1,v
+declare_vector_reg lane2_msg0, 2,v
+declare_vector_reg lane3_msg0, 3,v
+
+declare_vector_reg lane0_msg1, 4,v
+declare_vector_reg lane1_msg1, 5,v
+declare_vector_reg lane2_msg1, 6,v
+declare_vector_reg lane3_msg1, 7,v
+
+declare_vector_reg lane0_msg2, 8,v
+declare_vector_reg lane1_msg2, 9,v
+declare_vector_reg lane2_msg2, 10,v
+declare_vector_reg lane3_msg2, 11,v
+
+declare_vector_reg lane0_msg3, 12,v
+declare_vector_reg lane1_msg3, 13,v
+declare_vector_reg lane2_msg3, 14,v
+declare_vector_reg lane3_msg3, 15,v
+
+declare_vector_reg lane0_state0, 16,v
+declare_vector_reg lane1_state0, 17,v
+declare_vector_reg lane2_state0, 18,v
+declare_vector_reg lane3_state0, 19,v
+
+declare_vector_reg lane0_state1, 20,v
+declare_vector_reg lane1_state1, 21,v
+declare_vector_reg lane2_state1, 22,v
+declare_vector_reg lane3_state1, 23,v
+
+declare_vector_reg lane0_tmp0, 24,v
+declare_vector_reg lane1_tmp0, 25,v
+declare_vector_reg lane2_tmp0, 26,v
+declare_vector_reg lane3_tmp0, 27,v
+
+declare_vector_reg lane0_tmp2, 28,v
+declare_vector_reg lane1_tmp2, 29,v
+declare_vector_reg lane2_tmp2, 30,v
+declare_vector_reg lane3_tmp2, 31,v
+
+declare_vector_reg key, 27,v
+declare_vector_reg tmp, 29,v
+
+/*
+void mh_sha256_block_ce(const uint8_t * input_data,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE],
+ uint32_t num_blocks);
+*/
+ x_input_data .req x0
+ x_digests .req x1
+ x_frame_buffer .req x2
+ w_num_blocks .req w3
+
+ x_digest_addr .req x4
+ x_key_addr .req x5
+ x_msg_addr .req x6
+ x_lane_offs .req x7
+ x_offs .req x9
+ w_input_data_end .req w10
+ x_input_data_end .req x10
+ x_tmp .req x11
+mh_sha256_block_ce:
+ cbz w_num_blocks, .exit
+ mov w_input_data_end, w_num_blocks
+
+ ubfiz x_input_data_end, x_input_data_end, 10, 32
+ add x_input_data_end, x_input_data, x_input_data_end
+
+ adrp x_key_addr, .key_addr
+ add x_key_addr, x_key_addr, :lo12:.key_addr
+
+ stp d8, d9, [sp, -192]!
+
+ stp d10, d11, [sp, 16]
+ stp d12, d13, [sp, 32]
+ stp d14, d15, [sp, 48]
+
+ .p2align 3,,7
+.start_loop:
+ mov x_lane_offs, 0
+ mov x_digest_addr, x_digests
+
+.lane_loop:
+ add x_msg_addr, x_input_data, x_lane_offs, lsl 2
+
+ .p2align 3,,7
+ mov x_offs, 64
+ mov x_tmp, x_digest_addr
+ ld4 {v_lane0_state0.S-v_lane3_state0.S}[0], [x_tmp], x_offs
+ ld4 {v_lane0_state0.S-v_lane3_state0.S}[1], [x_tmp], x_offs
+ ld4 {v_lane0_state0.S-v_lane3_state0.S}[2], [x_tmp], x_offs
+ ld4 {v_lane0_state0.S-v_lane3_state0.S}[3], [x_tmp], x_offs
+
+ add x_tmp, x_digest_addr, 256
+ ld4 {v_lane0_state1.S-v_lane3_state1.S}[0], [x_tmp], x_offs
+ ld4 {v_lane0_state1.S-v_lane3_state1.S}[1], [x_tmp], x_offs
+ ld4 {v_lane0_state1.S-v_lane3_state1.S}[2], [x_tmp], x_offs
+ ld4 {v_lane0_state1.S-v_lane3_state1.S}[3], [x_tmp], x_offs
+
+ ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[0], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[1], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[2], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[3], [x_msg_addr], x_offs
+
+ ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[0], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[1], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[2], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[3], [x_msg_addr], x_offs
+
+ ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[0], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[1], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[2], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[3], [x_msg_addr], x_offs
+
+ ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[0], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[1], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[2], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[3], [x_msg_addr], x_offs
+
+ // reverse for little endian
+ rev32 v_lane0_msg0.16b, v_lane0_msg0.16b
+ rev32 v_lane1_msg0.16b, v_lane1_msg0.16b
+ rev32 v_lane2_msg0.16b, v_lane2_msg0.16b
+ rev32 v_lane3_msg0.16b, v_lane3_msg0.16b
+
+ rev32 v_lane0_msg1.16b, v_lane0_msg1.16b
+ rev32 v_lane1_msg1.16b, v_lane1_msg1.16b
+ rev32 v_lane2_msg1.16b, v_lane2_msg1.16b
+ rev32 v_lane3_msg1.16b, v_lane3_msg1.16b
+
+ rev32 v_lane0_msg2.16b, v_lane0_msg2.16b
+ rev32 v_lane1_msg2.16b, v_lane1_msg2.16b
+ rev32 v_lane2_msg2.16b, v_lane2_msg2.16b
+ rev32 v_lane3_msg2.16b, v_lane3_msg2.16b
+
+ rev32 v_lane0_msg3.16b, v_lane0_msg3.16b
+ rev32 v_lane1_msg3.16b, v_lane1_msg3.16b
+ rev32 v_lane2_msg3.16b, v_lane2_msg3.16b
+ rev32 v_lane3_msg3.16b, v_lane3_msg3.16b
+
+ // rounds 0-3
+ ldr q_key, [x_key_addr]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ str q_lane0_state1, [sp, 64]
+ str q_lane1_state1, [sp, 80]
+ str q_lane2_state1, [sp, 96]
+ str q_lane3_state1, [sp, 112]
+
+ mov x_offs, 64
+ mov x_tmp, x_digest_addr
+ ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[0], [x_tmp], x_offs
+ ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[1], [x_tmp], x_offs
+ ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[2], [x_tmp], x_offs
+ ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[3], [x_tmp], x_offs
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ // rounds 4-7
+ ldr q_key, [x_key_addr, 16]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ // rounds 8-11
+ ldr q_key, [x_key_addr, 32]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ // rounds 12-15
+ ldr q_key, [x_key_addr, 48]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ // rounds 16-19
+ ldr q_key, [x_key_addr, 64]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ // rounds 20-23
+ ldr q_key, [x_key_addr, 80]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ // rounds 24-27
+ ldr q_key, [x_key_addr, 96]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ // rounds 28-31
+ ldr q_key, [x_key_addr, 112]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ // rounds 32-35
+ ldr q_key, [x_key_addr, 128]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ // rounds 36-39
+ ldr q_key, [x_key_addr, 144]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ // rounds 40-43
+ ldr q_key, [x_key_addr, 160]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ // rounds 44-47
+ ldr q_key, [x_key_addr, 176]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ // rounds 48-51
+ ldr q_key, [x_key_addr, 192]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ // rounds 52-55
+ ldr q_key, [x_key_addr, 208]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ // rounds 56-59
+ ldr q_key, [x_key_addr, 224]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ // rounds 60-63
+ ldr q_key, [x_key_addr, 240]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ mov x_offs, 64
+ mov x_tmp, x_digest_addr
+ ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[0], [x_tmp], x_offs
+ ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[1], [x_tmp], x_offs
+ ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[2], [x_tmp], x_offs
+ ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[3], [x_tmp], x_offs
+
+ add v_lane0_state0.4s, v_lane0_tmp0.4s, v_lane0_state0.4s
+ add v_lane1_state0.4s, v_lane1_tmp0.4s, v_lane1_state0.4s
+ add v_lane2_state0.4s, v_lane2_tmp0.4s, v_lane2_state0.4s
+ add v_lane3_state0.4s, v_lane3_tmp0.4s, v_lane3_state0.4s
+
+ mov x_offs, 64
+ mov x_tmp, x_digest_addr
+ st4 {v_lane0_state0.S-v_lane3_state0.S}[0], [x_tmp], x_offs
+ st4 {v_lane0_state0.S-v_lane3_state0.S}[1], [x_tmp], x_offs
+ st4 {v_lane0_state0.S-v_lane3_state0.S}[2], [x_tmp], x_offs
+ st4 {v_lane0_state0.S-v_lane3_state0.S}[3], [x_tmp], x_offs
+
+ ldp q_lane0_tmp2, q_lane1_tmp2, [sp, 64]
+ ldp q_lane2_tmp2, q_lane3_tmp2, [sp, 96]
+
+ add v_lane0_state1.4s, v_lane0_tmp2.4s, v_lane0_state1.4s
+ add v_lane1_state1.4s, v_lane1_tmp2.4s, v_lane1_state1.4s
+ add v_lane2_state1.4s, v_lane2_tmp2.4s, v_lane2_state1.4s
+ add v_lane3_state1.4s, v_lane3_tmp2.4s, v_lane3_state1.4s
+
+ mov x_offs, 64
+ add x_tmp, x_digest_addr, 256
+ st4 {v_lane0_state1.S-v_lane3_state1.S}[0], [x_tmp], x_offs
+ st4 {v_lane0_state1.S-v_lane3_state1.S}[1], [x_tmp], x_offs
+ st4 {v_lane0_state1.S-v_lane3_state1.S}[2], [x_tmp], x_offs
+ st4 {v_lane0_state1.S-v_lane3_state1.S}[3], [x_tmp], x_offs
+
+ add x_digest_addr, x_digest_addr, 16
+ add x_lane_offs, x_lane_offs, 4
+ cmp x_lane_offs, 16
+ bne .lane_loop
+
+ add x_input_data, x_input_data, 1024
+ cmp x_input_data, x_input_data_end
+ bne .start_loop
+
+ ldp d10, d11, [sp, 16]
+ ldp d12, d13, [sp, 32]
+ ldp d14, d15, [sp, 48]
+ ldp d8, d9, [sp], 192
+.exit:
+ ret
+ .size mh_sha256_block_ce, .-mh_sha256_block_ce
+
+ .section .rodata
+ .align 4
+ .set .key_addr,. + 0
+ .type K, %object
+ .size K, 256
+K:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
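
[annotation] The CE kernel above never states its memory layout in prose, but it can be read
off the strides: digest rows are fetched with a 64-byte post-increment, the e..h half of the
state sits 256 bytes in, the lane loop covers 16 segments four at a time, and the input
pointer advances 1024 bytes per block (num_blocks << 10). A hedged C sketch of that layout,
with constant values inferred rather than quoted from mh_sha256_internal.h:

    #include <stdint.h>

    #define HASH_SEGS            16                /* interleaved SHA-256 streams      */
    #define SHA256_DIGEST_WORDS  8                 /* words per SHA-256 digest         */
    #define MH_SHA256_BLOCK_SIZE (HASH_SEGS * 64)  /* 1024 B, i.e. num_blocks << 10    */

    /* Interim digests are stored transposed: word w of segment s sits at
     * digests[w][s], so one row of 16 segment words is exactly 64 bytes,
     * matching the ld4/st4 stride in the assembly. */
    static inline uint32_t digest_word(const uint32_t d[SHA256_DIGEST_WORDS][HASH_SEGS],
                                       unsigned w, unsigned s)
    {
        return d[w][s];
    }

    /* Input is interleaved the same way: message word i of segment s lives
     * at byte offset (i * HASH_SEGS + s) * 4 inside each 1024-byte block. */
    static inline uint32_t msg_word(const uint8_t *block, unsigned i, unsigned s)
    {
        const uint32_t *w = (const uint32_t *)block;
        return w[i * HASH_SEGS + s];
    }
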
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c
new file mode 100644
index 000000000..c42333ed5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c
@@ -0,0 +1,53 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+void mh_sha256_block_ce(const uint8_t * input_data,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+/***************mh_sha256_update***********/
+// mh_sha256_update_ce.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_ce
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_ce
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************mh_sha256_finalize AND mh_sha256_tail***********/
+// mh_sha256_tail processes the last, incomplete block of src data
+// mh_sha256_finalize is an mh_sha256_ctx wrapper around mh_sha256_tail
+// mh_sha256_finalize_ce.c and mh_sha256_tail_ce.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_ce
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_ce
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_ce
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
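
[annotation] The #define/#include/#undef sequence above is a C "template" idiom: the shared
base sources are compiled once per architecture, with the block kernel bound by macro at
inclusion time. A hedged sketch of the shape of the included template (the real
mh_sha256_update_base.c appears later in this diff; the body here is illustrative only):

    #ifdef MH_SHA256_UPDATE_FUNCTION
    int MH_SHA256_UPDATE_FUNCTION(struct mh_sha256_ctx *ctx, const void *buffer,
                                  uint32_t len)
    {
        /* Buffer any partial 1024-byte block, then hand whole blocks to the
         * architecture kernel bound at inclusion time, conceptually:
         *
         *   MH_SHA256_BLOCK_FUNCTION(whole_blocks,
         *                            ctx->mh_sha256_interim_digests,
         *                            frame_buffer, num_blocks);
         */
        return MH_SHA256_CTX_ERROR_NONE;
    }
    #endif
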
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S
new file mode 100644
index 000000000..54eece175
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S
@@ -0,0 +1,35 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface mh_sha256_update
+mbin_interface mh_sha256_finalize
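
[annotation] mbin_interface declares each public symbol as a self-resolving stub: on first
call it runs the matching DEFINE_INTERFACE_DISPATCHER body (see the dispatcher file above),
caches the returned provider, and calls straight through thereafter. A rough C analogue of
that mechanism, hedged since the real expansion lives in aarch64_multibinary.h:

    #include <stdint.h>

    typedef int (*mh_update_fn)(void *ctx, const void *buf, uint32_t len);

    /* Conceptually, the DEFINE_INTERFACE_DISPATCHER body from above: */
    extern mh_update_fn mh_sha256_update_dispatcher(void);

    static mh_update_fn resolved;   /* patched on first use */

    int mh_sha256_update_stub(void *ctx, const void *buf, uint32_t len)
    {
        if (!resolved)
            resolved = mh_sha256_update_dispatcher();
        return resolved(ctx, buf, len);   /* later calls skip the probe */
    }
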
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c
new file mode 100644
index 000000000..242c3e218
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c
@@ -0,0 +1,143 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+int mh_sha256_init(struct mh_sha256_ctx *ctx)
+{
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS];
+ uint32_t i;
+
+ if (ctx == NULL)
+ return MH_SHA256_CTX_ERROR_NULL;
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests;
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha256_segs_digests[0][i] = MH_SHA256_H0;
+ mh_sha256_segs_digests[1][i] = MH_SHA256_H1;
+ mh_sha256_segs_digests[2][i] = MH_SHA256_H2;
+ mh_sha256_segs_digests[3][i] = MH_SHA256_H3;
+ mh_sha256_segs_digests[4][i] = MH_SHA256_H4;
+ mh_sha256_segs_digests[5][i] = MH_SHA256_H5;
+ mh_sha256_segs_digests[6][i] = MH_SHA256_H6;
+ mh_sha256_segs_digests[7][i] = MH_SHA256_H7;
+ }
+
+ return MH_SHA256_CTX_ERROR_NONE;
+}
+
+#if (!defined(NOARCH)) && (defined(__i386__) || defined(__x86_64__) \
+ || defined( _M_X64) || defined(_M_IX86))
+/***************mh_sha256_update***********/
+// mh_sha256_update_sse.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_sse
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_sse
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+// mh_sha256_update_avx.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+// mh_sha256_update_avx2.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx2
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx2
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************mh_sha256_finalize AND mh_sha256_tail***********/
+// mh_sha256_tail processes the last, incomplete block of src data
+// mh_sha256_finalize is an mh_sha256_ctx wrapper around mh_sha256_tail
+
+// mh_sha256_finalize_sse.c and mh_sha256_tail_sse.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_sse
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_sse
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_sse
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+// mh_sha256_finalize_avx.c and mh_sha256_tail_avx.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+// mh_sha256_finalize_avx2.c and mh_sha256_tail_avx2.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx2
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx2
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx2
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************version info***********/
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+// Version info
+struct slver mh_sha256_init_slver_000002b1;
+struct slver mh_sha256_init_slver = { 0x02b1, 0x00, 0x00 };
+
+// mh_sha256_update version info
+struct slver mh_sha256_update_sse_slver_000002b4;
+struct slver mh_sha256_update_sse_slver = { 0x02b4, 0x00, 0x00 };
+
+struct slver mh_sha256_update_avx_slver_020002b6;
+struct slver mh_sha256_update_avx_slver = { 0x02b6, 0x00, 0x02 };
+
+struct slver mh_sha256_update_avx2_slver_040002b8;
+struct slver mh_sha256_update_avx2_slver = { 0x02b8, 0x00, 0x04 };
+
+// mh_sha256_finalize version info
+struct slver mh_sha256_finalize_sse_slver_000002b5;
+struct slver mh_sha256_finalize_sse_slver = { 0x02b5, 0x00, 0x00 };
+
+struct slver mh_sha256_finalize_avx_slver_020002b7;
+struct slver mh_sha256_finalize_avx_slver = { 0x02b7, 0x00, 0x02 };
+
+struct slver mh_sha256_finalize_avx2_slver_040002b9;
+struct slver mh_sha256_finalize_avx2_slver = { 0x02b9, 0x00, 0x04 };
+#endif
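
[annotation] Taken together with the multibinary glue, the public call sequence is
init -> update -> finalize. A minimal caller-side sketch, assuming only the declarations
shipped in include/mh_sha256.h (the update/finalize signatures are visible in
mh_sha256_base_aliases.c later in this diff):

    #include <stdint.h>
    #include "mh_sha256.h"

    /* Hash `len` bytes of `data` into an 8-word (32-byte) mh_sha256 digest. */
    int hash_buffer(const void *data, uint32_t len, uint32_t digest[8])
    {
        struct mh_sha256_ctx ctx;

        if (mh_sha256_init(&ctx) != MH_SHA256_CTX_ERROR_NONE)
            return -1;
        if (mh_sha256_update(&ctx, data, len) != MH_SHA256_CTX_ERROR_NONE)
            return -1;
        if (mh_sha256_finalize(&ctx, digest) != MH_SHA256_CTX_ERROR_NONE)
            return -1;
        return 0;
    }
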
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c
new file mode 100644
index 000000000..35fb0fbad
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c
@@ -0,0 +1,70 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+/***************mh_sha256_update***********/
+// mh_sha256_update_avx512.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx512
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx512
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************mh_sha256_finalize AND mh_sha256_tail***********/
+// mh_sha256_tail processes the last, incomplete block of src data
+// mh_sha256_finalize is an mh_sha256_ctx wrapper around mh_sha256_tail
+// mh_sha256_finalize_avx512.c and mh_sha256_tail_avx512.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx512
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx512
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx512
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************version info***********/
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// mh_sha256_update version info
+struct slver mh_sha256_update_avx512_slver_060002bc;
+struct slver mh_sha256_update_avx512_slver = { 0x02bc, 0x00, 0x06 };
+
+// mh_sha256_finalize version info
+struct slver mh_sha256_finalize_avx512_slver_060002bd;
+struct slver mh_sha256_finalize_avx512_slver = { 0x02bd, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c
new file mode 100644
index 000000000..343ffb024
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c
@@ -0,0 +1,40 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "mh_sha256_internal.h"
+#include <string.h>
+int mh_sha256_update(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len)
+{
+ return mh_sha256_update_base(ctx, buffer, len);
+
+}
+
+int mh_sha256_finalize(struct mh_sha256_ctx *ctx, void *mh_sha256_digest)
+{
+ return mh_sha256_finalize_base(ctx, mh_sha256_digest);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm
new file mode 100644
index 000000000..c2eff350d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm
@@ -0,0 +1,557 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA-256 digest streams in parallel using AVX
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha256
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS
+
+; Common definitions
+%define ROUND tmp4
+%define TBL tmp5
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
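AVX provides no packed-rotate instruction, so the PRORD macros synthesize a 32-bit rotate-right per lane from two shifts and an OR. A scalar C model of what each dword lane computes (function name is mine):

    #include <stdint.h>

    /* Scalar model of PRORD: rotate a 32-bit lane right by imm, 0 < imm < 32. */
    static inline uint32_t prord32(uint32_t x, unsigned imm)
    {
            return (x >> imm) | (x << (32 - imm));
    }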
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_R 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa %%T1, [SZ4*(%%i&0xf) + %%data]
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
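Per lane, ROUND_00_15_R/W implement the standard FIPS 180-4 round exactly as the comments trace it; the split additions into h and the early update of d are just instruction scheduling. A scalar C model of one round (helper names are mine; the caller rotates a..h afterwards, which ROTATE_ARGS does by renaming registers):

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, unsigned r)
    {
            return (x >> r) | (x << (32 - r));
    }

    /* One scalar SHA-256 round: T1 = h + SIGMA1(e) + Ch(e,f,g) + Kt + Wt,
     * T2 = SIGMA0(a) + Maj(a,b,c); then d += T1 and h = T1 + T2. */
    static void sha256_round(uint32_t v[8], uint32_t wt, uint32_t kt)
    {
            uint32_t a = v[0], b = v[1], c = v[2], e = v[4], f = v[5], g = v[6];
            uint32_t ch   = (e & f) ^ (~e & g);
            uint32_t maj  = (a & b) ^ (a & c) ^ (b & c);
            uint32_t sig1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
            uint32_t sig0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
            uint32_t t1   = v[7] + sig1 + ch + kt + wt;

            v[3] += t1;              /* d += T1 */
            v[7]  = t1 + sig0 + maj; /* h  = T1 + T2 */
    }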
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_W 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ4*(%%i&0xf) + %%data], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + %%data]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + %%data]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + %%data]
+ vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + %%data]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15_W %%T1, %%i, %%data
+%endm
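ROUND_16_XX extends the message schedule in place over a 16-entry ring (index t & 0xf) before handing the new word to ROUND_00_15_W, which also writes it back to the frame. The per-lane recurrence, as a scalar C model:

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, unsigned r)
    {
            return (x >> r) | (x << (32 - r));
    }

    /* W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
     * over a 16-entry ring indexed by t & 0xf. */
    static uint32_t sha256_schedule(uint32_t w[16], unsigned t)
    {
            uint32_t w15 = w[(t - 15) & 0xf], w2 = w[(t - 2) & 0xf];
            uint32_t s0 = ror32(w15, 7) ^ ror32(w15, 18) ^ (w15 >> 3);
            uint32_t s1 = ror32(w2, 17) ^ ror32(w2, 19) ^ (w2 >> 10);

            w[t & 0xf] += s0 + w[(t - 7) & 0xf] + s1;   /* slot t&0xf held W[t-16] */
            return w[t & 0xf];
    }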
+
+;init hash digests
+; segs_digests: low addr -> high addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp |
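In C terms the diagram is a word-major layout: row n holds digest word n for all 16 segments, so a vector load at &digests[n][0] pulls the same working variable for a register's worth of adjacent segments. A sketch of the indexing, using the dimensions from the prototype below:

    #include <stdint.h>

    #define SHA256_DIGEST_WORDS 8
    #define HASH_SEGS 16

    /* digests[n][s] is word n of segment s; &digests[n][0] is a contiguous
     * run of HASH_SEGS lane values, ready for one wide load. */
    uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS];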
+
+align 32
+
+;void mh_sha256_block_avx(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[8][16])
+; arg 2 pointer to aligned_frame_buffer, used to hold the big-endian data
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha256_block_avx, function, internal
+func(mh_sha256_block_avx)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by avx
+ and rsp, ~0x0F
+ lea TBL,[TABLE]
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 8
+ VMOVPS a, [mh_digests_p + I*64 + 16*0]
+ VMOVPS b, [mh_digests_p + I*64 + 16*1]
+ VMOVPS c, [mh_digests_p + I*64 + 16*2]
+ VMOVPS d, [mh_digests_p + I*64 + 16*3]
+
+ vmovdqa [rsp + I*64 + 16*0], a
+ vmovdqa [rsp + I*64 + 16*1], b
+ vmovdqa [rsp + I*64 + 16*2], c
+ vmovdqa [rsp + I*64 + 16*3], d
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*8 to DWORD*4_SEGS*8*4
+ %assign I 0
+ %rep 16
+ VMOVPS TT0,[mh_in_p + I*64+0*16]
+ VMOVPS TT1,[mh_in_p + I*64+1*16]
+ VMOVPS TT2,[mh_in_p + I*64+2*16]
+ VMOVPS TT3,[mh_in_p + I*64+3*16]
+
+ vpshufb TT0, TMP
+ vmovdqa [mh_data_p +(I)*16 +0*256],TT0
+ vpshufb TT1, TMP
+ vmovdqa [mh_data_p +(I)*16 +1*256],TT1
+ vpshufb TT2, TMP
+ vmovdqa [mh_data_p +(I)*16 +2*256],TT2
+ vpshufb TT3, TMP
+ vmovdqa [mh_data_p +(I)*16 +3*256],TT3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+	mov	pref, 1024		;avoid prefetching repeatedly
+ .segs_loop:
+ xor ROUND, ROUND
+ ;; Initialize digests
+ vmovdqa a, [rsp + 0*64 + mh_segs]
+ vmovdqa b, [rsp + 1*64 + mh_segs]
+ vmovdqa c, [rsp + 2*64 + mh_segs]
+ vmovdqa d, [rsp + 3*64 + mh_segs]
+ vmovdqa e, [rsp + 4*64 + mh_segs]
+ vmovdqa f, [rsp + 5*64 + mh_segs]
+ vmovdqa g, [rsp + 6*64 + mh_segs]
+ vmovdqa h, [rsp + 7*64 + mh_segs]
+
+ %assign i 0
+ %rep 4
+ ROUND_00_15_R TT0, (i*4+0), mh_data_p
+ ROUND_00_15_R TT1, (i*4+1), mh_data_p
+ ROUND_00_15_R TT2, (i*4+2), mh_data_p
+ ROUND_00_15_R TT3, (i*4+3), mh_data_p
+ %assign i (i+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+
+ %assign i 16
+ %rep 48
+ %if i = 48
+ PREFETCH_X [mh_in_p + pref+128*1]
+ %endif
+ ROUND_16_XX T1, i, mh_data_p
+ %assign i (i+1)
+ %endrep
+
+ ;; add old digest
+ vpaddd a, a, [rsp + 0*64 + mh_segs]
+ vpaddd b, b, [rsp + 1*64 + mh_segs]
+ vpaddd c, c, [rsp + 2*64 + mh_segs]
+ vpaddd d, d, [rsp + 3*64 + mh_segs]
+ vpaddd e, e, [rsp + 4*64 + mh_segs]
+ vpaddd f, f, [rsp + 5*64 + mh_segs]
+ vpaddd g, g, [rsp + 6*64 + mh_segs]
+ vpaddd h, h, [rsp + 7*64 + mh_segs]
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], a
+ vmovdqa [rsp + 1*64 + mh_segs], b
+ vmovdqa [rsp + 2*64 + mh_segs], c
+ vmovdqa [rsp + 3*64 + mh_segs], d
+ vmovdqa [rsp + 4*64 + mh_segs], e
+ vmovdqa [rsp + 5*64 + mh_segs], f
+ vmovdqa [rsp + 6*64 + mh_segs], g
+ vmovdqa [rsp + 7*64 + mh_segs], h
+
+ add pref, 256
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 8
+ vmovdqa a, [rsp + I*64 + 16*0]
+ vmovdqa b, [rsp + I*64 + 16*1]
+ vmovdqa c, [rsp + I*64 + 16*2]
+ vmovdqa d, [rsp + I*64 + 16*3]
+
+ VMOVPS [mh_digests_p + I*64 + 16*0], a
+ VMOVPS [mh_digests_p + I*64 + 16*1], b
+ VMOVPS [mh_digests_p + I*64 + 16*2], c
+ VMOVPS [mh_digests_p + I*64 + 16*3], d
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
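The mask's byte indices read 3,2,1,0 / 7,6,5,4 / ..., so vpshufb reverses the four bytes of every dword, converting the little-endian input stream into the big-endian words SHA-256 operates on. Scalar equivalent for one lane (function name is mine):

    #include <stdint.h>

    /* What vpshufb with PSHUFFLE_BYTE_FLIP_MASK does to each 32-bit lane. */
    static inline uint32_t byte_flip32(uint32_t x)
    {
            return (x >> 24) | ((x >> 8) & 0x0000ff00u)
                 | ((x << 8) & 0x00ff0000u) | (x << 24);
    }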
+
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm
new file mode 100644
index 000000000..c2b3f2c59
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm
@@ -0,0 +1,616 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA256 digests in parallel using AVX2
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha256
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS
+
+; Common definitions
+%define ROUND tmp4
+%define TBL tmp5
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define SZ 4
+%define SZ8 8*SZ
+%define ROUNDS 64*SZ8
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define a0 ymm8
+%define a1 ymm9
+%define a2 ymm10
+
+%define TT0 ymm14
+%define TT1 ymm13
+%define TT2 ymm12
+%define TT3 ymm11
+%define TT4 ymm10
+%define TT5 ymm9
+
+%define T1 ymm14
+%define TMP ymm15
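With SZ8 each schedule entry is 8 dwords wide: a YMM register now carries 8 segments, so the 16 segments are covered in two passes of the .segs_loop below instead of the four passes the SSE/AVX kernels need (and the AVX-512 kernel needs no segment loop at all). A throwaway sketch of that relationship (names are mine):

    /* Passes of the per-block segment loop as a function of vector width. */
    enum { HASH_SEGS = 16 };

    static int seg_passes(int lanes_per_reg)
    {
            return HASH_SEGS / lanes_per_reg;  /* SSE/AVX: 4, AVX2: 2, AVX-512: 1 */
    }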
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_R 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa %%T1, [SZ8*(%%i&0xf) + %%data]
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ8 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_W 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ8*(%%i&0xf) + %%data], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ8 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + %%data]
+ vmovdqa a1, [SZ8*((%%i-2)&0xf) + %%data]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + %%data]
+ vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + %%data]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15_W %%T1, %%i, %%data
+%endm
+
+;init hash digests
+; segs_digests: low addr -> high addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp |
+
+align 32
+
+;void mh_sha256_block_avx2(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[8][16])
+; arg 2 pointer to aligned_frame_buffer, used to hold the big-endian data
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha256_block_avx2, function, internal
+func(mh_sha256_block_avx2)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 32 Bytes needed by avx2
+ and rsp, ~0x1F
+ lea TBL,[TABLE]
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 4
+ VMOVPS a, [mh_digests_p + I*64*2 + 32*0]
+ VMOVPS b, [mh_digests_p + I*64*2 + 32*1]
+ VMOVPS c, [mh_digests_p + I*64*2 + 32*2]
+ VMOVPS d, [mh_digests_p + I*64*2 + 32*3]
+
+ vmovdqa [rsp + I*64*2 + 32*0], a
+ vmovdqa [rsp + I*64*2 + 32*1], b
+ vmovdqa [rsp + I*64*2 + 32*2], c
+ vmovdqa [rsp + I*64*2 + 32*3], d
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*8 to DWORD*8_SEGS*8*2
+ %assign I 0
+ %rep 16
+ VMOVPS TT0,[mh_in_p + I*64+0*32]
+ VMOVPS TT1,[mh_in_p + I*64+1*32]
+
+ vpshufb TT0, TT0, TMP
+ vmovdqa [mh_data_p +I*32 +0*512],TT0
+ vpshufb TT1, TT1, TMP
+ vmovdqa [mh_data_p +I*32 +1*512],TT1
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 8 segments
+	mov	pref, 1024		;avoid prefetching repeatedly
+ .segs_loop:
+ xor ROUND, ROUND
+ ;; Initialize digests
+ vmovdqa a, [rsp + 0*64 + mh_segs]
+ vmovdqa b, [rsp + 1*64 + mh_segs]
+ vmovdqa c, [rsp + 2*64 + mh_segs]
+ vmovdqa d, [rsp + 3*64 + mh_segs]
+ vmovdqa e, [rsp + 4*64 + mh_segs]
+ vmovdqa f, [rsp + 5*64 + mh_segs]
+ vmovdqa g, [rsp + 6*64 + mh_segs]
+ vmovdqa h, [rsp + 7*64 + mh_segs]
+
+ %assign i 0
+ %rep 4
+ ROUND_00_15_R TT0, (i*4+0), mh_data_p
+ ROUND_00_15_R TT1, (i*4+1), mh_data_p
+ ROUND_00_15_R TT2, (i*4+2), mh_data_p
+ ROUND_00_15_R TT3, (i*4+3), mh_data_p
+ %assign i (i+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+
+ %assign i 16
+ %rep 48
+ ROUND_16_XX T1, i, mh_data_p
+ %if i % 16 = 8
+ PREFETCH_X [mh_in_p + pref+128*(i/16)]
+ %endif
+ %assign i (i+1)
+ %endrep
+
+ ;; add old digest
+ vpaddd a, a, [rsp + 0*64 + mh_segs]
+ vpaddd b, b, [rsp + 1*64 + mh_segs]
+ vpaddd c, c, [rsp + 2*64 + mh_segs]
+ vpaddd d, d, [rsp + 3*64 + mh_segs]
+ vpaddd e, e, [rsp + 4*64 + mh_segs]
+ vpaddd f, f, [rsp + 5*64 + mh_segs]
+ vpaddd g, g, [rsp + 6*64 + mh_segs]
+ vpaddd h, h, [rsp + 7*64 + mh_segs]
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], a
+ vmovdqa [rsp + 1*64 + mh_segs], b
+ vmovdqa [rsp + 2*64 + mh_segs], c
+ vmovdqa [rsp + 3*64 + mh_segs], d
+ vmovdqa [rsp + 4*64 + mh_segs], e
+ vmovdqa [rsp + 5*64 + mh_segs], f
+ vmovdqa [rsp + 6*64 + mh_segs], g
+ vmovdqa [rsp + 7*64 + mh_segs], h
+
+ add pref, 512
+ add mh_data_p, 512
+ add mh_segs, 32
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 4
+ vmovdqa a, [rsp + I*64*2 + 32*0]
+ vmovdqa b, [rsp + I*64*2 + 32*1]
+ vmovdqa c, [rsp + I*64*2 + 32*2]
+ vmovdqa d, [rsp + I*64*2 + 32*3]
+
+ VMOVPS [mh_digests_p + I*64*2 + 32*0], a
+ VMOVPS [mh_digests_p + I*64*2 + 32*1], b
+ VMOVPS [mh_digests_p + I*64*2 + 32*2], c
+ VMOVPS [mh_digests_p + I*64*2 + 32*3], d
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm
new file mode 100644
index 000000000..1ee76ddfc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm
@@ -0,0 +1,682 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA256 digests in parallel using AVX-512
+;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha256
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS
+; Common definitions
+%define ROUND tmp4
+%define TBL tmp5
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define APPEND(a,b) a %+ b
+;; CH(E, F, G) = (E&F) ^ (~E&G)
+;; MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C)
+;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22
+;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25
+;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3
+;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
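The AVX-512 path gets native rotates (vprord) and folds each three-input Boolean into a single vpternlogd, whose 8-bit immediate is the truth table of the function indexed by the (dst, src1, src2) bit triple. A scalar C model showing why 0xCA, 0xE8 and 0x96 are the immediates used below (function name is mine):

    #include <stdint.h>

    /* Scalar model of vpternlogd: per bit position, look up the (a,b,c)
     * bit triple in the 8-bit truth table imm. */
    static inline uint32_t ternlog32(uint32_t a, uint32_t b, uint32_t c, uint8_t imm)
    {
            uint32_t r = 0;
            for (int bit = 0; bit < 32; bit++) {
                    unsigned idx = (((a >> bit) & 1) << 2)
                                 | (((b >> bit) & 1) << 1)
                                 |  ((c >> bit) & 1);
                    r |= (uint32_t)((imm >> idx) & 1) << bit;
            }
            return r;
    }

    /* ternlog32(e, f, g, 0xCA) == (e & f) ^ (~e & g)           CH   */
    /* ternlog32(a, b, c, 0xE8) == (a & b) ^ (a & c) ^ (b & c)  MAJ  */
    /* ternlog32(x, y, z, 0x96) == x ^ y ^ z                    XOR3 */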
+
+; Main processing loop per round
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddd T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, T1, %%WT ; T1 = T1 + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpaddd D, D, T1 ; D = D + T1
+
+ vprord H, A, 2 ; ROR_2(A)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vmovdqa32 TMP0, A
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_63 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+
+	vpaddd	%%WT, %%WT, TMP4	; Wt = Wt-16 + sigma1(Wt-2) +
+					;      Wt-7 + sigma0(Wt-15)
+%endmacro
+
+; Note this is reading in a block of data for one lane
+; When all 16 are read, the data must be transposed to build the msg schedule
+; (Carried over from the multi-buffer SHA256 code: this kernel never invokes it
+; and loads W0-W15 directly in .block_loop, so IN/IDX/inp0 are undefined here.)
+%macro MSG_SCHED_ROUND_00_15 2
+%define %%WT %1
+%define %%OFFSET %2
+ mov inp0, [IN + (%%OFFSET*8)]
+ vmovups %%WT, [inp0+IDX]
+%endmacro
+
+;init hash digests
+; segs_digests: low addr -> high addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp |
+
+[bits 64]
+section .text
+align 32
+
+;void mh_sha256_block_avx512(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[8][16])
+; arg 2 pointer to aligned_frame_buffer, used to hold the big-endian data
+; arg 3 number of 1KB blocks
+;
+global mh_sha256_block_avx512
+func(mh_sha256_block_avx512)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 64 Bytes needed by avx512
+ and rsp, ~0x3F
+ lea TBL,[TABLE]
+
+ ; copy segs_digests into stack and ZMM
+ VMOVPS A, [mh_digests_p + 64*0]
+ VMOVPS B, [mh_digests_p + 64*1]
+ VMOVPS C, [mh_digests_p + 64*2]
+ VMOVPS D, [mh_digests_p + 64*3]
+ VMOVPS E, [mh_digests_p + 64*4]
+ VMOVPS F, [mh_digests_p + 64*5]
+ VMOVPS G, [mh_digests_p + 64*6]
+ VMOVPS H, [mh_digests_p + 64*7]
+
+.block_loop:
+ ; Save digests for later addition
+ vmovdqa32 [rsp + 64*0], A
+ vmovdqa32 [rsp + 64*1], B
+ vmovdqa32 [rsp + 64*2], C
+ vmovdqa32 [rsp + 64*3], D
+ vmovdqa32 [rsp + 64*4], E
+ vmovdqa32 [rsp + 64*5], F
+ vmovdqa32 [rsp + 64*6], G
+ vmovdqa32 [rsp + 64*7], H
+
+ vmovdqa32 TMP3, [TBL] ; First K
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+	;use the extra 16 ZMM registers for the message schedule instead of memory
+%assign I 0
+%rep 8
+%assign J (I+1)
+ VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64]
+ VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64]
+
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+ vpshufb APPEND(W,J), APPEND(W,J), TMP2
+%assign I (I+2)
+%endrep
+
+	; MSG Schedule for W0-W15 is now complete in registers
+	; Process all 64 rounds; for rounds 0-47 also calculate Wt+16
+	; once processing is complete and Wt is no longer needed
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep 64
+ PROCESS_LOOP APPEND(W,J), I
+ %if I < 48
+ MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %endif
+ %if I % 8 = 4
+ PREFETCH_X [mh_in_p + 1024+128*(I / 8)]
+ %endif
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+
+ ;; add old digest
+ vpaddd A, A, [rsp + 0*64]
+ vpaddd B, B, [rsp + 1*64]
+ vpaddd C, C, [rsp + 2*64]
+ vpaddd D, D, [rsp + 3*64]
+ vpaddd E, E, [rsp + 4*64]
+ vpaddd F, F, [rsp + 5*64]
+ vpaddd G, G, [rsp + 6*64]
+ vpaddd H, H, [rsp + 7*64]
+
+ add mh_in_p, 1024
+ sub loops, 1
+ jne .block_loop
+
+ ; copy segs_digests back to mh_digests_p
+
+ VMOVPS [mh_digests_p + 64*0], A
+ VMOVPS [mh_digests_p + 64*1], B
+ VMOVPS [mh_digests_p + 64*2], C
+ VMOVPS [mh_digests_p + 64*3], D
+ VMOVPS [mh_digests_p + 64*4], E
+ VMOVPS [mh_digests_p + 64*5], F
+ VMOVPS [mh_digests_p + 64*6], G
+ VMOVPS [mh_digests_p + 64*7], H
+
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_mh_sha256_block_avx512
+no_mh_sha256_block_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
+
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c
new file mode 100644
index 000000000..8d9a828c6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c
@@ -0,0 +1,188 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha256_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Base multi-hash SHA256 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// store_w is only used for steps 0 ~ 15
+#define store_w(s, i, w, ww) (w[i][s] = to_be32(ww[i*HASH_SEGS+s]))
+#define Ws(x, s) w[(x) & 15][s]
+// update_w is used for steps 16 and above
+#define update_w(s, i, w) \
+ Ws(i, s) = Ws(i-16, s) + S0(Ws(i-15, s)) + Ws(i-7, s) + S1(Ws(i-2, s))
+#define update_t2(s, a, b, c) t2[s] = s0(a[s]) + maj(a[s],b[s],c[s])
+#define update_t1(s, h, e, f, g, i, k) \
+ t1[s] = h[s] + s1(e[s]) + ch(e[s],f[s],g[s]) + k + Ws(i, s);
+#define update_d(s) d[s] += t1[s]
+#define update_h(s) h[s] = t1[s] + t2[s]
+
+// s is an iterator
+#define STORE_W(s, i, w, ww) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ store_w(s, i, w, ww);
+#define UPDATE_W(s, i, w) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_w(s, i, w);
+#define UPDATE_T2(s, a, b, c) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_t2(s, a, b, c);
+#define UPDATE_T1(s, h, e, f, g, i, k) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_t1(s, h, e, f, g, i, k);
+#define UPDATE_D(s) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_d(s);
+#define UPDATE_H(s) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_h(s);
+
+static inline void step(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t * f,
+ uint32_t * g, uint32_t * h, uint32_t k,
+ uint32_t * t1, uint32_t * t2, uint32_t(*w)[HASH_SEGS], uint32_t * ww)
+{
+ uint8_t s;
+ if (i < 16) {
+ STORE_W(s, i, w, ww);
+ } else {
+ UPDATE_W(s, i, w);
+ }
+ UPDATE_T2(s, a, b, c);
+ UPDATE_T1(s, h, e, f, g, i, k);
+ UPDATE_D(s);
+ UPDATE_H(s);
+}
+
+static inline void init_abcdefgh(uint32_t * xx, uint32_t n,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
+{
+ uint8_t s;
+ for (s = 0; s < HASH_SEGS; s++)
+ xx[s] = digests[n][s];
+}
+
+static inline void add_abcdefgh(uint32_t * xx, uint32_t n,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
+{
+ uint8_t s;
+ for (s = 0; s < HASH_SEGS; s++)
+ digests[n][s] += xx[s];
+}
+
+/*
+ * API to perform the 64 steps of the multi-hash algorithm on
+ * a single block of data. The caller is responsible for providing
+ * a full block of input data.
+ *
+ * Arguments:
+ *	input        - pointer to the input data
+ *	digests      - the space holding the digests of all segments
+ *	frame_buffer - scratch space for the message schedule
+ *
+ * Return:
+ * N/A
+ */
+void mh_sha256_single(const uint8_t * input, uint32_t(*digests)[HASH_SEGS],
+ uint8_t * frame_buffer)
+{
+ uint8_t i;
+ uint32_t aa[HASH_SEGS], bb[HASH_SEGS], cc[HASH_SEGS], dd[HASH_SEGS];
+ uint32_t ee[HASH_SEGS], ff[HASH_SEGS], gg[HASH_SEGS], hh[HASH_SEGS];
+ uint32_t t1[HASH_SEGS], t2[HASH_SEGS];
+ uint32_t *ww = (uint32_t *) input;
+ uint32_t(*w)[HASH_SEGS];
+
+	static const uint32_t k[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+ };
+
+ w = (uint32_t(*)[HASH_SEGS]) frame_buffer;
+
+ init_abcdefgh(aa, 0, digests);
+ init_abcdefgh(bb, 1, digests);
+ init_abcdefgh(cc, 2, digests);
+ init_abcdefgh(dd, 3, digests);
+ init_abcdefgh(ee, 4, digests);
+ init_abcdefgh(ff, 5, digests);
+ init_abcdefgh(gg, 6, digests);
+ init_abcdefgh(hh, 7, digests);
+
+ for (i = 0; i < 64; i += 8) {
+ step(i, aa, bb, cc, dd, ee, ff, gg, hh, k[i], t1, t2, w, ww);
+ step(i + 1, hh, aa, bb, cc, dd, ee, ff, gg, k[i + 1], t1, t2, w, ww);
+ step(i + 2, gg, hh, aa, bb, cc, dd, ee, ff, k[i + 2], t1, t2, w, ww);
+ step(i + 3, ff, gg, hh, aa, bb, cc, dd, ee, k[i + 3], t1, t2, w, ww);
+ step(i + 4, ee, ff, gg, hh, aa, bb, cc, dd, k[i + 4], t1, t2, w, ww);
+ step(i + 5, dd, ee, ff, gg, hh, aa, bb, cc, k[i + 5], t1, t2, w, ww);
+ step(i + 6, cc, dd, ee, ff, gg, hh, aa, bb, k[i + 6], t1, t2, w, ww);
+ step(i + 7, bb, cc, dd, ee, ff, gg, hh, aa, k[i + 7], t1, t2, w, ww);
+ }
+
+ add_abcdefgh(aa, 0, digests);
+ add_abcdefgh(bb, 1, digests);
+ add_abcdefgh(cc, 2, digests);
+ add_abcdefgh(dd, 3, digests);
+ add_abcdefgh(ee, 4, digests);
+ add_abcdefgh(ff, 5, digests);
+ add_abcdefgh(gg, 6, digests);
+ add_abcdefgh(hh, 7, digests);
+}
+
+void mh_sha256_block_base(const uint8_t * input_data,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks)
+{
+ uint32_t i;
+
+ for (i = 0; i < num_blocks; i++) {
+ mh_sha256_single(input_data, digests, frame_buffer);
+ input_data += MH_SHA256_BLOCK_SIZE;
+ }
+
+ return;
+}
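
As a usage note for the base block routine above, the sketch below shows how a caller might initialize the 16 segment digests and feed whole 1KB blocks to mh_sha256_block_base. The initial values MH_SHA256_H0..H7 come from mh_sha256_internal.h; the wrapper itself (name and buffer handling) is illustrative only, since in the library this is done by mh_sha256_init() and mh_sha256_update().

    #include <stdint.h>
    #include "mh_sha256_internal.h"

    /* Hash num_blocks whole 1KB blocks with the base (scalar) kernel. */
    static void hash_blocks_base(const uint8_t *data, uint32_t num_blocks,
                                 uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
    {
            uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE];   /* schedule scratch */
            const uint32_t h[SHA256_DIGEST_WORDS] = {
                    MH_SHA256_H0, MH_SHA256_H1, MH_SHA256_H2, MH_SHA256_H3,
                    MH_SHA256_H4, MH_SHA256_H5, MH_SHA256_H6, MH_SHA256_H7
            };
            int i, j;

            /* Every one of the 16 segments starts from the standard state. */
            for (i = 0; i < SHA256_DIGEST_WORDS; i++)
                    for (j = 0; j < HASH_SEGS; j++)
                            digests[i][j] = h[i];

            mh_sha256_block_base(data, digests, frame_buffer, num_blocks);
    }
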
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm
new file mode 100644
index 000000000..b1d6fd9ea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm
@@ -0,0 +1,557 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Code to compute a 16-segment multi-hash SHA256 using SSE
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha256
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS
+
+; Common definitions
+%define ROUND tmp4
+%define TBL tmp5
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define MOVPS movups
+
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psrld %%reg, %%imm
+ pslld %%tmp, (32-(%%imm))
+ por %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_R 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ movdqa a0, e ; sig1: a0 = e
+	movdqa	a1, e		; sig1: a1 = e
+ PRORD a0, (11-6) ; sig1: a0 = (e >> 5)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORD a1, 25 ; sig1: a1 = (e >> 25)
+ movdqa %%T1,[SZ4*(%%i&0xf) + %%data]
+ paddd %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ paddd h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORD a2, (13-2) ; sig0: a2 = (a >> 11)
+ paddd h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORD a1, 22 ; sig0: a1 = (a >> 22)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddd h, a0
+
+ paddd d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddd h, a1 ; h = h + ch + W + K + maj
+ paddd h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_W 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ movdqa a0, e ; sig1: a0 = e
+	movdqa	a1, e		; sig1: a1 = e
+ PRORD a0, (11-6) ; sig1: a0 = (e >> 5)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORD a1, 25 ; sig1: a1 = (e >> 25)
+ movdqa [SZ4*(%%i&0xf) + %%data], %%T1
+ paddd %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ paddd h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORD a2, (13-2) ; sig0: a2 = (a >> 11)
+ paddd h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORD a1, 22 ; sig0: a1 = (a >> 22)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddd h, a0
+
+ paddd d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddd h, a1 ; h = h + ch + W + K + maj
+ paddd h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ movdqa %%T1, [SZ4*((%%i-15)&0xf) + %%data]
+ movdqa a1, [SZ4*((%%i-2)&0xf) + %%data]
+ movdqa a0, %%T1
+ PRORD %%T1, 18-7
+ movdqa a2, a1
+ PRORD a1, 19-17
+ pxor %%T1, a0
+ PRORD %%T1, 7
+ pxor a1, a2
+ PRORD a1, 17
+ psrld a0, 3
+ pxor %%T1, a0
+ psrld a2, 10
+ pxor a1, a2
+ paddd %%T1, [SZ4*((%%i-16)&0xf) + %%data]
+ paddd a1, [SZ4*((%%i-7)&0xf) + %%data]
+ paddd %%T1, a1
+
+ ROUND_00_15_W %%T1, %%i, %%data
+
+%endm
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp |
+
+align 32
+
+;void mh_sha256_block_sse(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], i.e. [8][16])
+; arg 2 pointer to aligned_frame_buffer, which is used to save the big-endian data.
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha256_block_sse, function, internal
+func(mh_sha256_block_sse)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by sse
+ and rsp, ~0x0F
+ lea TBL,[TABLE]
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 8
+ MOVPS a, [mh_digests_p + I*64 + 16*0]
+ MOVPS b, [mh_digests_p + I*64 + 16*1]
+ MOVPS c, [mh_digests_p + I*64 + 16*2]
+ MOVPS d, [mh_digests_p + I*64 + 16*3]
+
+ movdqa [rsp + I*64 + 16*0], a
+ movdqa [rsp + I*64 + 16*1], b
+ movdqa [rsp + I*64 + 16*2], c
+ movdqa [rsp + I*64 + 16*3], d
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+	;transform to big-endian data and store in the aligned frame buffer
+ movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*8 to DWORD*4_SEGS*8*4
+ %assign I 0
+ %rep 16
+ MOVPS TT0,[mh_in_p + I*64+0*16]
+ MOVPS TT1,[mh_in_p + I*64+1*16]
+ MOVPS TT2,[mh_in_p + I*64+2*16]
+ MOVPS TT3,[mh_in_p + I*64+3*16]
+
+ pshufb TT0, TMP
+ movdqa [mh_data_p +(I)*16 +0*256],TT0
+ pshufb TT1, TMP
+ movdqa [mh_data_p +(I)*16 +1*256],TT1
+ pshufb TT2, TMP
+ movdqa [mh_data_p +(I)*16 +2*256],TT2
+ pshufb TT3, TMP
+ movdqa [mh_data_p +(I)*16 +3*256],TT3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+	mov	pref, 1024	;avoid prefetching repeatedly
+ .segs_loop:
+ xor ROUND, ROUND
+ ;; Initialize digests
+ movdqa a, [rsp + 0*64 + mh_segs]
+ movdqa b, [rsp + 1*64 + mh_segs]
+ movdqa c, [rsp + 2*64 + mh_segs]
+ movdqa d, [rsp + 3*64 + mh_segs]
+ movdqa e, [rsp + 4*64 + mh_segs]
+ movdqa f, [rsp + 5*64 + mh_segs]
+ movdqa g, [rsp + 6*64 + mh_segs]
+ movdqa h, [rsp + 7*64 + mh_segs]
+
+ %assign i 0
+ %rep 4
+ ROUND_00_15_R TT0, (i*4+0), mh_data_p
+ ROUND_00_15_R TT1, (i*4+1), mh_data_p
+ ROUND_00_15_R TT2, (i*4+2), mh_data_p
+ ROUND_00_15_R TT3, (i*4+3), mh_data_p
+ %assign i (i+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+
+ %assign i 16
+ %rep 48
+ %if i = 48
+ PREFETCH_X [mh_in_p + pref+128*1]
+ %endif
+ ROUND_16_XX T1, i, mh_data_p
+ %assign i (i+1)
+ %endrep
+
+ ;; add old digest
+ paddd a, [rsp + 0*64 + mh_segs]
+ paddd b, [rsp + 1*64 + mh_segs]
+ paddd c, [rsp + 2*64 + mh_segs]
+ paddd d, [rsp + 3*64 + mh_segs]
+ paddd e, [rsp + 4*64 + mh_segs]
+ paddd f, [rsp + 5*64 + mh_segs]
+ paddd g, [rsp + 6*64 + mh_segs]
+ paddd h, [rsp + 7*64 + mh_segs]
+
+ ; write out digests
+ movdqa [rsp + 0*64 + mh_segs], a
+ movdqa [rsp + 1*64 + mh_segs], b
+ movdqa [rsp + 2*64 + mh_segs], c
+ movdqa [rsp + 3*64 + mh_segs], d
+ movdqa [rsp + 4*64 + mh_segs], e
+ movdqa [rsp + 5*64 + mh_segs], f
+ movdqa [rsp + 6*64 + mh_segs], g
+ movdqa [rsp + 7*64 + mh_segs], h
+
+ add pref, 256
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 8
+ movdqa a, [rsp + I*64 + 16*0]
+ movdqa b, [rsp + I*64 + 16*1]
+ movdqa c, [rsp + I*64 + 16*2]
+ movdqa d, [rsp + I*64 + 16*3]
+
+ MOVPS [mh_digests_p + I*64 + 16*0], a
+ MOVPS [mh_digests_p + I*64 + 16*1], b
+ MOVPS [mh_digests_p + I*64 + 16*2], c
+ MOVPS [mh_digests_p + I*64 + 16*3], d
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
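A note on the data section above: every SHA256 round constant is replicated four times into a 128-bit lane (each pair of dq entries holds one dword four times), so a single paddd adds K[t] to four segment words at once, and PSHUFFLE_BYTE_FLIP_MASK byte-swaps each dword for the big-endian message load. The sketch below shows how such a table relates to the scalar constants; the function name is illustrative.

    #include <stdint.h>

    /* Broadcast each 32-bit round constant across a 4-lane vector, matching
     * the layout of TABLE in the SSE kernel above. */
    static void build_k_table(const uint32_t k[64], uint32_t table[64][4])
    {
            for (int t = 0; t < 64; t++)
                    for (int lane = 0; lane < 4; lane++)
                            table[t][lane] = k[t];
    }
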
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c
new file mode 100644
index 000000000..6abb20688
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c
@@ -0,0 +1,121 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha256_finalize_base.c contains the prototypes of mh_sha256_finalize_XXX
+ * and mh_sha256_tail_XXX. The default definitions are of the base type, which
+ * generates mh_sha256_finalize_base and mh_sha256_tail_base. Other types are
+ * generated through different predefined macros by mh_sha256.c.
+ * mh_sha256_tail is used to calculate the last incomplete block of input
+ * data. mh_sha256_finalize is the mh_sha256_ctx wrapper of mh_sha256_tail.
+ */
+#ifndef MH_SHA256_FINALIZE_FUNCTION
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_base
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_base
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_base
+#define MH_SHA256_FINALIZE_SLVER
+#endif
+
+void MH_SHA256_TAIL_FUNCTION(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t * frame_buffer, uint32_t digests[SHA256_DIGEST_WORDS])
+{
+ uint64_t partial_buffer_len, len_in_bit;
+
+ partial_buffer_len = total_len % MH_SHA256_BLOCK_SIZE;
+
+	// Pad the first block
+ partial_buffer[partial_buffer_len] = 0x80;
+ partial_buffer_len++;
+ memset(partial_buffer + partial_buffer_len, 0,
+ MH_SHA256_BLOCK_SIZE - partial_buffer_len);
+
+	// Process the first block without the total length if the padding needs two blocks
+	if (partial_buffer_len > (MH_SHA256_BLOCK_SIZE - 8)) {
+		MH_SHA256_BLOCK_FUNCTION(partial_buffer, mh_sha256_segs_digests, frame_buffer,
+					 1);
+		// Pad the second block
+		memset(partial_buffer, 0, MH_SHA256_BLOCK_SIZE);
+	}
+	// Append the total length in bits and process the final block
+	len_in_bit = to_be64((uint64_t) total_len * 8);
+	*(uint64_t *) (partial_buffer + MH_SHA256_BLOCK_SIZE - 8) = len_in_bit;
+	MH_SHA256_BLOCK_FUNCTION(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1);
+
+	// Calculate the multi-hash SHA256 digest (segment digests as the input message)
+ sha256_for_mh_sha256((uint8_t *) mh_sha256_segs_digests, digests,
+ 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+int MH_SHA256_FINALIZE_FUNCTION(struct mh_sha256_ctx *ctx, void *mh_sha256_digest)
+{
+ uint8_t i;
+ uint8_t *partial_block_buffer;
+ uint64_t total_len;
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+
+ if (ctx == NULL)
+ return MH_SHA256_CTX_ERROR_NULL;
+
+ total_len = ctx->total_length;
+ partial_block_buffer = ctx->partial_block_buffer;
+
+ /* mh_sha256 tail */
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests;
+
+ MH_SHA256_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha256_segs_digests,
+ aligned_frame_buffer, ctx->mh_sha256_digest);
+
+ /* Output the digests of mh_sha256 */
+ if (mh_sha256_digest != NULL) {
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++)
+ ((uint32_t *) mh_sha256_digest)[i] = ctx->mh_sha256_digest[i];
+ }
+
+ return MH_SHA256_CTX_ERROR_NONE;
+}
+
+#ifdef MH_SHA256_FINALIZE_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver mh_sha256_finalize_base_slver_000002bb;
+struct slver mh_sha256_finalize_base_slver = { 0x02bb, 0x00, 0x00 };
+#endif
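The core decision in MH_SHA256_TAIL_FUNCTION above is whether the padding fits in one block: after appending the 0x80 marker there must still be 8 bytes left for the bit-length field, otherwise a second block is processed. A standalone restatement of that test, with illustrative names:

    #include <stdint.h>

    /* Returns 1 or 2: how many padding blocks the tail needs. */
    static int padding_blocks_needed(uint32_t total_len, uint32_t block_size)
    {
            uint32_t tail = total_len % block_size;
            /* tail bytes + the 0x80 marker must leave 8 bytes for the length */
            return (tail + 1 > block_size - 8) ? 2 : 1;
    }
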
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h
new file mode 100644
index 000000000..8051e3f36
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h
@@ -0,0 +1,318 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA256_INTERNAL_H_
+#define _MH_SHA256_INTERNAL_H_
+
+/**
+ * @file mh_sha256_internal.h
+ * @brief mh_sha256 internal function prototypes and macros
+ *
+ * Interface for mh_sha256 internal functions
+ *
+ */
+#include <stdint.h>
+#include "mh_sha256.h"
+#include "endian_helper.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+	// Round a pointer up to the next 64-byte boundary
+#define ALIGN_64(pointer) ( ((uint64_t)(pointer) + 0x3F)&(~0x3F) )
+
+ /*******************************************************************
+  * mh_sha256 constants and macros
+ ******************************************************************/
+ /* mh_sha256 constants */
+#define MH_SHA256_H0 0x6a09e667UL
+#define MH_SHA256_H1 0xbb67ae85UL
+#define MH_SHA256_H2 0x3c6ef372UL
+#define MH_SHA256_H3 0xa54ff53aUL
+#define MH_SHA256_H4 0x510e527fUL
+#define MH_SHA256_H5 0x9b05688cUL
+#define MH_SHA256_H6 0x1f83d9abUL
+#define MH_SHA256_H7 0x5be0cd19UL
+
+ /* mh_sha256 macros */
+#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r))))
+
+#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3))
+#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10))
+
+#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22))
+#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+
+ /*******************************************************************
+ * SHA256 API internal function prototypes
+ ******************************************************************/
+
+ /**
+ * @brief Performs the complete SHA256 algorithm.
+ *
+ * @param input Pointer to buffer containing the input message.
+ * @param digest Pointer to digest to update.
+ * @param len Length of buffer.
+ * @returns None
+ */
+ void sha256_for_mh_sha256(const uint8_t * input_data, uint32_t * digest, const uint32_t len);
+
+ /**
+ * @brief Calculate the sha256 digest of blocks whose size is SHA256_BLOCK_SIZE.
+ *
+ * @param data Pointer to data buffer containing the input message.
+ * @param digest Pointer to sha256 digest.
+ * @returns None
+ */
+ void sha256_single_for_mh_sha256(const uint8_t * data, uint32_t digest[]);
+
+ /*******************************************************************
+ * mh_sha256 API internal function prototypes
+ * Multiple versions of Update and Finalize functions are supplied which use
+ * multiple versions of block and tail process subfunctions.
+ ******************************************************************/
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Process the remainder of the input data, which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param partial_buffer Pointer to the start address of the remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to a buffer used as a temporary working area
+ * @returns none
+ *
+ */
+ void mh_sha256_tail(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Process the remainder of the input data, which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @param partial_buffer Pointer to the start address of the remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to a buffer used as a temporary working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_base(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Process the remainder of the input data, which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @requires SSE
+ *
+ * @param partial_buffer Pointer to the start address of the remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to a buffer used as a temporary working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_sse(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Process the remainder of the input data, which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @requires AVX
+ *
+ * @param partial_buffer Pointer to the start address of the remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to a buffer used as a temporary working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_avx(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Process the remainder of the input data, which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @requires AVX2
+ *
+ * @param partial_buffer Pointer to the start address of the remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to a buffer used as a temporary working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_avx2(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Process the remainder of the input data, which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @requires AVX512
+ *
+ * @param partial_buffer Pointer to the start address of the remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to a buffer used as a temporary working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_avx512(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segment digests
+ * @param frame_buffer Pointer to a buffer used as a temporary working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segment digests
+ * @param frame_buffer Pointer to a buffer used as a temporary working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_base(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @requires SSE
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segment digests
+ * @param frame_buffer Pointer to a buffer used as a temporary working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_sse(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @requires AVX
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segment digests
+ * @param frame_buffer Pointer to a buffer used as a temporary working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_avx(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @requires AVX2
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segment digests
+ * @param frame_buffer Pointer to a buffer used as a temporary working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_avx2(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @requires AVX512
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segment digests
+ * @param frame_buffer Pointer to a buffer used as a temporary working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_avx512(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
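One detail worth calling out in the header above is ALIGN_64, which rounds a pointer up to the next 64-byte boundary. The SIMD kernels expect an aligned frame buffer, so the context reserves slack and aligns at the point of use (as mh_sha256_finalize_base.c does). A minimal usage sketch; the 63-byte slack and the function name are this sketch's assumptions:

    #include <stdint.h>
    #include "mh_sha256_internal.h"

    /* Carve a 64-byte aligned frame buffer out of an oversized raw buffer. */
    static uint8_t *aligned_frame(uint8_t raw[MH_SHA256_BLOCK_SIZE + 63])
    {
            return (uint8_t *) ALIGN_64(raw);
    }
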
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm
new file mode 100644
index 000000000..e14fc7eb1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm
@@ -0,0 +1,77 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ [bits 32]
+%else
+ default rel
+ [bits 64]
+
+ extern mh_sha256_update_sse
+ extern mh_sha256_update_avx
+ extern mh_sha256_update_avx2
+ extern mh_sha256_finalize_sse
+ extern mh_sha256_finalize_avx
+ extern mh_sha256_finalize_avx2
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ extern mh_sha256_update_avx512
+ extern mh_sha256_finalize_avx512
+ %endif
+
+%endif
+
+extern mh_sha256_update_base
+extern mh_sha256_finalize_base
+
+mbin_interface mh_sha256_update
+mbin_interface mh_sha256_finalize
+
+%ifidn __OUTPUT_FORMAT__, elf64
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 mh_sha256_update, mh_sha256_update_base, mh_sha256_update_sse, mh_sha256_update_avx, mh_sha256_update_avx2, mh_sha256_update_avx512
+ mbin_dispatch_init6 mh_sha256_finalize, mh_sha256_finalize_base, mh_sha256_finalize_sse, mh_sha256_finalize_avx, mh_sha256_finalize_avx2, mh_sha256_finalize_avx512
+ %else
+ mbin_dispatch_init5 mh_sha256_update, mh_sha256_update_base, mh_sha256_update_sse, mh_sha256_update_avx, mh_sha256_update_avx2
+ mbin_dispatch_init5 mh_sha256_finalize, mh_sha256_finalize_base, mh_sha256_finalize_sse, mh_sha256_finalize_avx, mh_sha256_finalize_avx2
+ %endif
+
+%else
+ mbin_dispatch_init2 mh_sha256_update, mh_sha256_update_base
+ mbin_dispatch_init2 mh_sha256_finalize, mh_sha256_finalize_base
+%endif
+
+;;; func core, ver, snum
+slversion mh_sha256_update, 00, 00, 02b2
+slversion mh_sha256_finalize, 00, 00, 02b3
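In C terms, mbin_interface/mbin_dispatch_init6 above amount to a function pointer that is resolved once from CPU feature bits and then called directly. The sketch below conveys the idea only: have_avx512()/have_avx2()/have_avx()/have_sse() are hypothetical feature probes, the update signature is taken from how the perf and test code calls mh_sha256_update, and the real dispatcher lives in the assembly above.

    #include <stdint.h>

    struct mh_sha256_ctx;       /* defined in mh_sha256.h */
    typedef int (*mh_update_fn)(struct mh_sha256_ctx *ctx,
                                const void *buffer, uint32_t len);

    /* Pick the best available implementation once, at first call. */
    static mh_update_fn resolve_mh_sha256_update(void)
    {
            if (have_avx512()) return mh_sha256_update_avx512;
            if (have_avx2())   return mh_sha256_update_avx2;
            if (have_avx())    return mh_sha256_update_avx;
            if (have_sse())    return mh_sha256_update_sse;
            return mh_sha256_update_base;
    }
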
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c
new file mode 100644
index 000000000..8095e4f05
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c
@@ -0,0 +1,180 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha256.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Loop many times over the same buffer
+# define TEST_LEN 16*1024
+# define TEST_LOOPS 20000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define TEST_LEN 16*1024*1024
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#define TEST_MEM TEST_LEN
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA256_FUNC_TYPE
+#define MH_SHA256_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA256_CTX_ERROR_NONE){ \
+		printf("The mh_sha256 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 32 == 0)
+ printf("\n");
+ }
+ if (i % 32 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA256_DIGEST_WORDS],
+ uint32_t hash_test[SHA256_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha256_fail = 0;
+
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha256_fail++;
+ }
+
+ if (mh_sha256_fail) {
+		printf("mh_sha256 test failed\n");
+ printf("base: ");
+ dump((char *)hash_base, 32);
+		printf("test: ");
+ dump((char *)hash_test, 32);
+ }
+
+ return mh_sha256_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int i, fail = 0;
+ uint32_t hash_test[SHA256_DIGEST_WORDS], hash_base[SHA256_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ struct mh_sha256_ctx *update_ctx_test = NULL, *update_ctx_base = NULL;
+ struct perf start, stop;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n");
+
+ buff = malloc(TEST_LEN);
+ update_ctx_test = malloc(sizeof(*update_ctx_test));
+ update_ctx_base = malloc(sizeof(*update_ctx_base));
+
+ if (buff == NULL || update_ctx_base == NULL || update_ctx_test == NULL) {
+		printf("malloc failed, test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ // mh_sha256 base version
+ mh_sha256_init(update_ctx_base);
+ mh_sha256_update_base(update_ctx_base, buff, TEST_LEN);
+ mh_sha256_finalize_base(update_ctx_base, hash_base);
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS / 10; i++) {
+ mh_sha256_init(update_ctx_base);
+ mh_sha256_update_base(update_ctx_base, buff, TEST_LEN);
+ mh_sha256_finalize_base(update_ctx_base, hash_base);
+ }
+ perf_stop(&stop);
+ printf("mh_sha256_update_base" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ //Update feature test
+ CHECK_RETURN(mh_sha256_init(update_ctx_test));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ CHECK_RETURN(mh_sha256_init(update_ctx_test));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+ }
+ perf_stop(&stop);
+ printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ // Check results
+ fail = compare_digests(hash_base, hash_test);
+
+	if (fail) {
+		printf("Fail size=%d\n", TEST_LEN);
+		return -1;
+	}
+
+	printf("Pass func check\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c
new file mode 100644
index 000000000..2aaefecb0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c
@@ -0,0 +1,410 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+	// The macros and sub-functions below already exist in the source file
+	//  sha256_for_mh_sha256.c, which is part of the ISA-L library as internal
+	//  functions. They are written twice because of a linking issue caused by
+	//  mh_sha256_ref(): mh_sha256_ref() needs these macros and sub-functions
+	//  without linking against the ISA-L library, so it includes them in
+	//  order to keep the essential sub-functions in its own object file.
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define W(x) w[(x) & 15]
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be32(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
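The W()/step() macros above keep only a 16-word ring buffer for the message schedule, since W[t] depends only on W[t-2], W[t-7], W[t-15] and W[t-16]. The same recurrence written out as a plain function for clarity; S0/S1 are the macros from mh_sha256_internal.h and the function name is illustrative:

    /* Compute W[t] (t >= 16) in place in the 16-word ring buffer. */
    static uint32_t schedule_word(uint32_t w[16], int t)
    {
            w[t & 15] = w[(t - 16) & 15] + S0(w[(t - 15) & 15])
                      + w[(t - 7) & 15] + S1(w[(t - 2) & 15]);
            return w[t & 15];
    }
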
+void sha256_single_for_mh_sha256_ref(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
+
+void sha256_for_mh_sha256_ref(const uint8_t * input_data, uint32_t * digest,
+ const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+
+ digest[0] = MH_SHA256_H0;
+ digest[1] = MH_SHA256_H1;
+ digest[2] = MH_SHA256_H2;
+ digest[3] = MH_SHA256_H3;
+ digest[4] = MH_SHA256_H4;
+ digest[5] = MH_SHA256_H5;
+ digest[6] = MH_SHA256_H6;
+ digest[7] = MH_SHA256_H7;
+
+ i = len;
+ while (i >= SHA256_BLOCK_SIZE) {
+ sha256_single_for_mh_sha256_ref(input_data, digest);
+ input_data += SHA256_BLOCK_SIZE;
+ i -= SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - 8); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - 8)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha256_single_for_mh_sha256_ref(buf, digest);
+ if (i == (2 * SHA256_BLOCK_SIZE))
+ sha256_single_for_mh_sha256_ref(buf + SHA256_BLOCK_SIZE, digest);
+}
+
+/*
+ * Rearrange one segment's data from one block into the new_data buffer.
+ *
+ * Layout of new_data:
+ * segment
+ * -------------------------
+ * w0 | w1 | ... | w15
+ *
+ */
+static inline void transform_input_single(uint32_t * new_data, uint32_t * input,
+ uint32_t segment)
+{
+ new_data[16 * segment + 0] = input[16 * 0 + segment];
+ new_data[16 * segment + 1] = input[16 * 1 + segment];
+ new_data[16 * segment + 2] = input[16 * 2 + segment];
+ new_data[16 * segment + 3] = input[16 * 3 + segment];
+ new_data[16 * segment + 4] = input[16 * 4 + segment];
+ new_data[16 * segment + 5] = input[16 * 5 + segment];
+ new_data[16 * segment + 6] = input[16 * 6 + segment];
+ new_data[16 * segment + 7] = input[16 * 7 + segment];
+ new_data[16 * segment + 8] = input[16 * 8 + segment];
+ new_data[16 * segment + 9] = input[16 * 9 + segment];
+ new_data[16 * segment + 10] = input[16 * 10 + segment];
+ new_data[16 * segment + 11] = input[16 * 11 + segment];
+ new_data[16 * segment + 12] = input[16 * 12 + segment];
+ new_data[16 * segment + 13] = input[16 * 13 + segment];
+ new_data[16 * segment + 14] = input[16 * 14 + segment];
+ new_data[16 * segment + 15] = input[16 * 15 + segment];
+}
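+
+/*
+ * Equivalently, the unrolled copies above reduce to the loop below
+ * (a sketch; segment words lie 16 words apart in the interleaved input):
+ *
+ *	for (i = 0; i < 16; i++)
+ *		new_data[16 * segment + i] = input[16 * i + segment];
+ */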
+
+// Adapt parameters to sha256_single_for_mh_sha256_ref
+#define sha256_update_one_seg(data, digest) \
+ sha256_single_for_mh_sha256_ref((const uint8_t *)(data), (uint32_t *)(digest))
+
+/*
+ * Rearrange all segments' data from one block into a contiguous buffer.
+ *
+ * Layout of new_data:
+ * segment
+ * -------------------------
+ * seg0: | w0 | w1 | ... | w15
+ * seg1: | w0 | w1 | ... | w15
+ * seg2: | w0 | w1 | ... | w15
+ * ....
+ * seg15: | w0 | w1 | ... | w15
+ *
+ */
+static inline void transform_input(uint32_t * new_data, uint32_t * input, uint32_t block)
+{
+ uint32_t *current_input = input + block * MH_SHA256_BLOCK_SIZE / 4;
+
+ transform_input_single(new_data, current_input, 0);
+ transform_input_single(new_data, current_input, 1);
+ transform_input_single(new_data, current_input, 2);
+ transform_input_single(new_data, current_input, 3);
+ transform_input_single(new_data, current_input, 4);
+ transform_input_single(new_data, current_input, 5);
+ transform_input_single(new_data, current_input, 6);
+ transform_input_single(new_data, current_input, 7);
+ transform_input_single(new_data, current_input, 8);
+ transform_input_single(new_data, current_input, 9);
+ transform_input_single(new_data, current_input, 10);
+ transform_input_single(new_data, current_input, 11);
+ transform_input_single(new_data, current_input, 12);
+ transform_input_single(new_data, current_input, 13);
+ transform_input_single(new_data, current_input, 14);
+ transform_input_single(new_data, current_input, 15);
+
+}
+
+/*
+ * Calculate all segments' digests from one block.
+ *
+ * Layout of seg_digest:
+ * segment
+ * -------------------------
+ * seg0: | H0 | H1 | ... | H7
+ * seg1: | H0 | H1 | ... | H7
+ * seg2: | H0 | H1 | ... | H7
+ * ....
+ * seg15: | H0 | H1 | ... | H7
+ *
+ */
+static inline void sha256_update_all_segs(uint32_t * new_data, uint32_t(*mh_sha256_seg_digests)
+ [SHA256_DIGEST_WORDS])
+{
+ sha256_update_one_seg(&(new_data)[16 * 0], mh_sha256_seg_digests[0]);
+ sha256_update_one_seg(&(new_data)[16 * 1], mh_sha256_seg_digests[1]);
+ sha256_update_one_seg(&(new_data)[16 * 2], mh_sha256_seg_digests[2]);
+ sha256_update_one_seg(&(new_data)[16 * 3], mh_sha256_seg_digests[3]);
+ sha256_update_one_seg(&(new_data)[16 * 4], mh_sha256_seg_digests[4]);
+ sha256_update_one_seg(&(new_data)[16 * 5], mh_sha256_seg_digests[5]);
+ sha256_update_one_seg(&(new_data)[16 * 6], mh_sha256_seg_digests[6]);
+ sha256_update_one_seg(&(new_data)[16 * 7], mh_sha256_seg_digests[7]);
+ sha256_update_one_seg(&(new_data)[16 * 8], mh_sha256_seg_digests[8]);
+ sha256_update_one_seg(&(new_data)[16 * 9], mh_sha256_seg_digests[9]);
+ sha256_update_one_seg(&(new_data)[16 * 10], mh_sha256_seg_digests[10]);
+ sha256_update_one_seg(&(new_data)[16 * 11], mh_sha256_seg_digests[11]);
+ sha256_update_one_seg(&(new_data)[16 * 12], mh_sha256_seg_digests[12]);
+ sha256_update_one_seg(&(new_data)[16 * 13], mh_sha256_seg_digests[13]);
+ sha256_update_one_seg(&(new_data)[16 * 14], mh_sha256_seg_digests[14]);
+ sha256_update_one_seg(&(new_data)[16 * 15], mh_sha256_seg_digests[15]);
+}
+
+void mh_sha256_block_ref(const uint8_t * input_data, uint32_t(*digests)[HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks)
+{
+ uint32_t i, j;
+ uint32_t *temp_buffer = (uint32_t *) frame_buffer;
+ uint32_t(*trans_digests)[SHA256_DIGEST_WORDS];
+
+ trans_digests = (uint32_t(*)[SHA256_DIGEST_WORDS]) digests;
+
+	// Re-structure seg_digests from 8*16 to 16*8
+ for (j = 0; j < HASH_SEGS; j++) {
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ temp_buffer[j * SHA256_DIGEST_WORDS + i] = digests[i][j];
+ }
+ }
+ memcpy(trans_digests, temp_buffer, 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+	// Calculate digests for all segments, leveraging the sha256 API
+ for (i = 0; i < num_blocks; i++) {
+ transform_input(temp_buffer, (uint32_t *) input_data, i);
+ sha256_update_all_segs(temp_buffer, trans_digests);
+ }
+
+	// Re-structure seg_digests from 16*8 to 8*16
+ for (j = 0; j < HASH_SEGS; j++) {
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ temp_buffer[i * HASH_SEGS + j] = trans_digests[j][i];
+ }
+ }
+ memcpy(digests, temp_buffer, 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
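+
+/*
+ * The two copy loops above are plain matrix transposes; a sketch of the
+ * shared pattern (illustration only, with rows/cols being 8 and 16 in
+ * either order):
+ */
+static inline void mh_sha256_ref_transpose(uint32_t * dst, const uint32_t * src,
+					   uint32_t rows, uint32_t cols)
+{
+	uint32_t r, c;
+	for (r = 0; r < rows; r++)
+		for (c = 0; c < cols; c++)
+			dst[c * rows + r] = src[r * cols + c];
+}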
+
+void mh_sha256_tail_ref(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS], uint8_t * frame_buffer,
+ uint32_t digests[SHA256_DIGEST_WORDS])
+{
+ uint64_t partial_buffer_len, len_in_bit;
+
+ partial_buffer_len = total_len % MH_SHA256_BLOCK_SIZE;
+
+	// Pad the first block
+ partial_buffer[partial_buffer_len] = 0x80;
+ partial_buffer_len++;
+ memset(partial_buffer + partial_buffer_len, 0,
+ MH_SHA256_BLOCK_SIZE - partial_buffer_len);
+
+	// Process the first block without the total length if the padding needs 2 blocks
+ if (partial_buffer_len > (MH_SHA256_BLOCK_SIZE - 8)) {
+ mh_sha256_block_ref(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1);
+		// Pad the second block
+ memset(partial_buffer, 0, MH_SHA256_BLOCK_SIZE);
+ }
+	// Append the total length in bits and process the final block
+ len_in_bit = to_be64((uint64_t) total_len * 8);
+ *(uint64_t *) (partial_buffer + MH_SHA256_BLOCK_SIZE - 8) = len_in_bit;
+ mh_sha256_block_ref(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1);
+
+	// Calculate multi-hash SHA256 digests (segment digests as input message)
+ sha256_for_mh_sha256_ref((uint8_t *) mh_sha256_segs_digests, digests,
+ 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
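+
+/*
+ * Reading of the construction above (hedged summary): the final digest is
+ * a plain SHA256 over the 4 * SHA256_DIGEST_WORDS * HASH_SEGS = 512-byte
+ * matrix of segment digests, stored word-major (H0 of all 16 segments,
+ * then H1, and so on).
+ */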
+
+void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest)
+{
+ uint64_t total_len;
+ uint64_t num_blocks;
+ uint32_t mh_sha256_segs_digests[SHA256_DIGEST_WORDS][HASH_SEGS];
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE];
+ uint8_t partial_block_buffer[MH_SHA256_BLOCK_SIZE * 2];
+ uint32_t mh_sha256_hash_dword[SHA256_DIGEST_WORDS];
+ uint32_t i;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ /* Initialize digests of all segments */
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha256_segs_digests[0][i] = MH_SHA256_H0;
+ mh_sha256_segs_digests[1][i] = MH_SHA256_H1;
+ mh_sha256_segs_digests[2][i] = MH_SHA256_H2;
+ mh_sha256_segs_digests[3][i] = MH_SHA256_H3;
+ mh_sha256_segs_digests[4][i] = MH_SHA256_H4;
+ mh_sha256_segs_digests[5][i] = MH_SHA256_H5;
+ mh_sha256_segs_digests[6][i] = MH_SHA256_H6;
+ mh_sha256_segs_digests[7][i] = MH_SHA256_H7;
+ }
+
+ total_len = len;
+
+ // Calculate blocks
+ num_blocks = len / MH_SHA256_BLOCK_SIZE;
+ if (num_blocks > 0) {
+		// Process num_blocks blocks
+ mh_sha256_block_ref(input_data, mh_sha256_segs_digests, frame_buffer,
+ num_blocks);
+ len -= num_blocks * MH_SHA256_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA256_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ /* Finalize */
+ mh_sha256_tail_ref(partial_block_buffer, total_len, mh_sha256_segs_digests,
+ frame_buffer, mh_sha256_hash_dword);
+
+ // Output the digests of mh_sha256
+ if (mh_sha256_digest != NULL) {
+ mh_sha256_digest[0] = mh_sha256_hash_dword[0];
+ mh_sha256_digest[1] = mh_sha256_hash_dword[1];
+ mh_sha256_digest[2] = mh_sha256_hash_dword[2];
+ mh_sha256_digest[3] = mh_sha256_hash_dword[3];
+ mh_sha256_digest[4] = mh_sha256_hash_dword[4];
+ mh_sha256_digest[5] = mh_sha256_hash_dword[5];
+ mh_sha256_digest[6] = mh_sha256_hash_dword[6];
+ mh_sha256_digest[7] = mh_sha256_hash_dword[7];
+ }
+
+ return;
+}
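+
+/*
+ * Usage sketch (hypothetical caller; msg and msg_len are assumptions):
+ *
+ *	uint32_t digest[SHA256_DIGEST_WORDS];
+ *	mh_sha256_ref(msg, msg_len, digest);
+ */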
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c
new file mode 100644
index 000000000..13ab91c16
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c
@@ -0,0 +1,217 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha256.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA256_FUNC_TYPE
+#define MH_SHA256_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA256_CTX_ERROR_NONE){ \
+		printf("The mh_sha256 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest);
+#define MH_SHA256_REF mh_sha256_ref
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 32 == 0)
+ printf("\n");
+ }
+ if (i % 32 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_ref[SHA256_DIGEST_WORDS],
+ uint32_t hash_test[SHA256_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha256_fail = 0;
+
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_ref[i])
+ mh_sha256_fail++;
+ }
+
+ if (mh_sha256_fail) {
+ printf("mh_sha256 fail test\n");
+ printf("ref: ");
+ dump((char *)hash_ref, 32);
+ printf("test: ");
+ dump((char *)hash_test, 32);
+ }
+
+ return mh_sha256_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0;
+ uint32_t hash_test[SHA256_DIGEST_WORDS], hash_ref[SHA256_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int size, offset;
+ struct mh_sha256_ctx *update_ctx = NULL;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:\n");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ MH_SHA256_REF(buff, TEST_LEN, hash_ref);
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages
+ for (size = TEST_LEN; size >= 0; size--) {
+
+ // Fill with rand data
+ rand_buffer(buff, size);
+
+ MH_SHA256_REF(buff, size, hash_ref);
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various buffer offsets and sizes
+ printf("offset tests");
+ for (size = TEST_LEN - 256; size > 256; size -= 11) {
+ for (offset = 0; offset < 256; offset++) {
+ MH_SHA256_REF(buff + offset, size, hash_ref);
+
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ }
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Run efence tests
+ printf("efence tests");
+ for (size = TEST_SIZE; size > 0; size--) {
+ offset = TEST_LEN - size;
+
+ MH_SHA256_REF(buff + offset, size, hash_ref);
+
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+ printf(" %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c
new file mode 100644
index 000000000..024ae2b91
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c
@@ -0,0 +1,110 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha256_update_base.c contains the template for mh_sha256_update_XXX.
+ * The default definitions generate mh_sha256_update_base; other variants
+ * are generated by mh_sha256.c through different predefined macros.
+ */
+#ifndef MH_SHA256_UPDATE_FUNCTION
+#include "mh_sha256_internal.h"
+#include <string.h>
+
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_base
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_base
+#define MH_SHA256_UPDATE_SLVER
+#endif
+
+int MH_SHA256_UPDATE_FUNCTION(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len)
+{
+
+ uint8_t *partial_block_buffer;
+ uint64_t partial_block_len;
+ uint64_t num_blocks;
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ if (ctx == NULL)
+ return MH_SHA256_CTX_ERROR_NULL;
+
+ if (len == 0)
+ return MH_SHA256_CTX_ERROR_NONE;
+
+ partial_block_len = ctx->total_length % MH_SHA256_BLOCK_SIZE;
+ partial_block_buffer = ctx->partial_block_buffer;
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests;
+
+ ctx->total_length += len;
+	// Not enough input data for an mh_sha256 block calculation
+ if (len + partial_block_len < MH_SHA256_BLOCK_SIZE) {
+ memcpy(partial_block_buffer + partial_block_len, input_data, len);
+ return MH_SHA256_CTX_ERROR_NONE;
+ }
+ // mh_sha256 calculation for the previous partial block
+ if (partial_block_len != 0) {
+ memcpy(partial_block_buffer + partial_block_len, input_data,
+ MH_SHA256_BLOCK_SIZE - partial_block_len);
+		// Process one block
+ MH_SHA256_BLOCK_FUNCTION(partial_block_buffer, mh_sha256_segs_digests,
+ aligned_frame_buffer, 1);
+ input_data += MH_SHA256_BLOCK_SIZE - partial_block_len;
+ len -= MH_SHA256_BLOCK_SIZE - partial_block_len;
+ memset(partial_block_buffer, 0, MH_SHA256_BLOCK_SIZE);
+ }
+ // Calculate mh_sha256 for the current blocks
+ num_blocks = len / MH_SHA256_BLOCK_SIZE;
+ if (num_blocks > 0) {
+		// Process num_blocks blocks
+ MH_SHA256_BLOCK_FUNCTION(input_data, mh_sha256_segs_digests,
+ aligned_frame_buffer, num_blocks);
+ len -= num_blocks * MH_SHA256_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA256_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ return MH_SHA256_CTX_ERROR_NONE;
+
+}
+
+#ifdef MH_SHA256_UPDATE_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver mh_sha256_update_base_slver_000002ba;
+struct slver mh_sha256_update_base_slver = { 0x02ba, 0x00, 0x00 };
+#endif
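+
+/*
+ * A sketch of the streaming property this update path is meant to provide
+ * (names follow the base variants generated here and by
+ * mh_sha256_finalize_base.c; any split point is assumed valid):
+ *
+ *	mh_sha256_init(ctx);
+ *	mh_sha256_update_base(ctx, data, n1);
+ *	mh_sha256_update_base(ctx, data + n1, n2);
+ *	mh_sha256_finalize_base(ctx, digest);	// same digest as one update of n1 + n2
+ */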
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c
new file mode 100644
index 000000000..f5b28bba7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c
@@ -0,0 +1,240 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha256.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA256_FUNC_TYPE
+#define MH_SHA256_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA256_CTX_ERROR_NONE){ \
+		printf("The mh_sha256 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest);
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+		if (i % 32 == 0)
+ printf("\n");
+ }
+	if (i % 32 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_ref[SHA256_DIGEST_WORDS],
+ uint32_t hash_test[SHA256_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha256_fail = 0;
+
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_ref[i])
+ mh_sha256_fail++;
+ }
+
+ if (mh_sha256_fail) {
+ printf("mh_sha256 fail test\n");
+ printf("ref: ");
+		dump((char *)hash_ref, 32);
+		printf("test: ");
+		dump((char *)hash_test, 32);
+ }
+
+ return mh_sha256_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0, i;
+ uint32_t hash_test[SHA256_DIGEST_WORDS], hash_ref[SHA256_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int update_count;
+ int size1, size2, offset, addr_offset;
+ struct mh_sha256_ctx *update_ctx = NULL;
+ uint8_t *mem_addr = NULL;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+	// Test various size messages, split across two update calls
+	printf("\n various size messages by two updates tests");
+ for (size1 = TEST_LEN; size1 >= 0; size1--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+ // subsequent update
+		size2 = TEST_LEN - size1;	// size2 differs from size1
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+	// Test various update counts
+ printf("\n various update count tests");
+ for (update_count = 1; update_count <= TEST_LEN; update_count++) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+ // subsequent update
+ size1 = TEST_LEN / update_count;
+		size2 = TEST_LEN - size1 * (update_count - 1);	// size2 differs from size1
+
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ for (i = 1, offset = 0; i < update_count; i++) {
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1));
+ offset += size1;
+ }
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+	// Test various start addresses of ctx
+	printf("\n various start addresses of ctx test");
+ free(update_ctx);
+ mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10);
+ for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+		// an unaligned offset
+ update_ctx = (struct mh_sha256_ctx *)(mem_addr + addr_offset);
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail addr_offset=%d\n", addr_offset);
+ return -1;
+ }
+
+ if ((addr_offset & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c
new file mode 100644
index 000000000..ea8c9f436
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c
@@ -0,0 +1,176 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha256_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA256 Functions for mh_sha256
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define W(x) w[(x) & 15]
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be32(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
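+
+/*
+ * The helpers used by step() come from mh_sha256_internal.h; for
+ * reference, the standard FIPS 180-4 definitions they are expected to
+ * match (a sketch, with ror32(x, n) a 32-bit right-rotate):
+ *
+ *	s0(a) = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22)	// big sigma 0
+ *	s1(e) = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25)	// big sigma 1
+ *	S0(w) = ror32(w, 7) ^ ror32(w, 18) ^ (w >> 3)		// small sigma 0
+ *	S1(w) = ror32(w, 17) ^ ror32(w, 19) ^ (w >> 10)		// small sigma 1
+ *	ch(e, f, g) = (e & f) ^ (~e & g)
+ *	maj(a, b, c) = (a & b) ^ (a & c) ^ (b & c)
+ */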
+
+void sha256_single_for_mh_sha256(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
+
+void sha256_for_mh_sha256(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+
+ digest[0] = MH_SHA256_H0;
+ digest[1] = MH_SHA256_H1;
+ digest[2] = MH_SHA256_H2;
+ digest[3] = MH_SHA256_H3;
+ digest[4] = MH_SHA256_H4;
+ digest[5] = MH_SHA256_H5;
+ digest[6] = MH_SHA256_H6;
+ digest[7] = MH_SHA256_H7;
+
+ i = len;
+ while (i >= SHA256_BLOCK_SIZE) {
+ sha256_single_for_mh_sha256(input_data, digest);
+ input_data += SHA256_BLOCK_SIZE;
+ i -= SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - 8); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - 8)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha256_single_for_mh_sha256(buf, digest);
+ if (i == (2 * SHA256_BLOCK_SIZE))
+ sha256_single_for_mh_sha256(buf + SHA256_BLOCK_SIZE, digest);
+}
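+
+/*
+ * Sanity-check sketch (hypothetical, not part of this file): digest words
+ * are stored as native uint32_t values, so the FIPS 180-4 vector for
+ * SHA256("abc") can be checked word-by-word:
+ *
+ *	uint32_t d[SHA256_DIGEST_WORDS];
+ *	sha256_for_mh_sha256((const uint8_t *)"abc", d, 3);
+ *	// expect d[0] == 0xba7816bf and d[7] == 0xf20015ad
+ */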