Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/sm3_mb')
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am                            121
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c     65
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S              387
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S              576
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c     246
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c        241
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c     188
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c        250
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S    36
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S                 237
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S                 344
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S                 368
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S                 440
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c                         284
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c                       292
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c                         314
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c                  54
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm                              65
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c                    145
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm                77
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm              258
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm            276
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm             247
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm           273
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c                 160
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c                     206
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c              298
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c                          250
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c                  128
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c         133
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm                 1035
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm                     711
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm                     81
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c                         207
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c                       45
35 files changed, 9038 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am
new file mode 100644
index 000000000..8f8a3f4a6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am
@@ -0,0 +1,121 @@
+########################################################################
+# Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += sm3_mb/sm3_ctx_base.c \
+ sm3_mb/sm3_multibinary.asm
+
+lsrc_base_aliases += sm3_mb/sm3_ctx_base.c \
+ sm3_mb/sm3_ctx_base_aliases.c
+
+lsrc_aarch64 += sm3_mb/sm3_ctx_base.c \
+ sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c \
+ sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S \
+ sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c \
+ sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c \
+ sm3_mb/aarch64/sm3_mb_sm_x1.S \
+ sm3_mb/aarch64/sm3_mb_sm_x2.S \
+ sm3_mb/aarch64/sm3_mb_sm_x3.S \
+ sm3_mb/aarch64/sm3_mb_sm_x4.S \
+ sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c \
+ sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c \
+ sm3_mb/aarch64/sm3_mb_asimd_x1.S \
+ sm3_mb/aarch64/sm3_mb_asimd_x4.S
+
+
+src_include += -I $(srcdir)/sm3_mb
+
+extern_hdrs += include/sm3_mb.h \
+ include/multi_buffer.h
+
+lsrc_x86_64 += sm3_mb/sm3_ctx_avx512.c \
+ sm3_mb/sm3_mb_mgr_submit_avx512.asm \
+ sm3_mb/sm3_mb_mgr_flush_avx512.asm \
+ sm3_mb/sm3_mb_x16_avx512.asm
+
+lsrc_x86_64 += sm3_mb/sm3_ctx_avx2.c \
+ sm3_mb/sm3_mb_mgr_submit_avx2.asm \
+ sm3_mb/sm3_mb_mgr_flush_avx2.asm \
+ sm3_mb/sm3_mb_x8_avx2.asm
+
+other_src += include/datastruct.asm \
+ include/multibinary.asm \
+ include/reg_sizes.asm \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h \
+ sm3_mb/sm3_job.asm \
+ sm3_mb/sm3_mb_mgr_datastruct.asm \
+ sm3_mb/sm3_test_helper.c
+
+check_tests += sm3_mb/sm3_ref_test
+
+unit_tests += sm3_mb/sm3_mb_rand_ssl_test \
+ sm3_mb/sm3_mb_rand_test \
+ sm3_mb/sm3_mb_rand_update_test \
+ sm3_mb/sm3_mb_flush_test \
+ sm3_mb/sm3_mb_test
+
+perf_tests += sm3_mb/sm3_mb_vs_ossl_perf \
+ sm3_mb/sm3_mb_vs_ossl_shortage_perf
+
+sm3_mb_rand_ssl_test: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sm3_mb_rand_ssl_test: sm3_test_helper.o
+sm3_mb_sm3_mb_rand_ssl_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_rand_update_test: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_rand_update_test_LDFLAGS = -lcrypto
+
+sm3_mb_rand_update_test: sm3_test_helper.o
+sm3_mb_sm3_mb_rand_update_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_flush_test: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_flush_test_LDFLAGS = -lcrypto
+
+sm3_mb_flush_test: sm3_test_helper.o
+sm3_mb_sm3_mb_flush_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_rand_test: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_rand_test_LDFLAGS = -lcrypto
+
+sm3_mb_rand_test: sm3_test_helper.o
+sm3_mb_sm3_mb_rand_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
+sm3_mb_vs_ossl_perf: sm3_test_helper.o
+sm3_mb_sm3_mb_vs_ossl_perf_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto
+
+sm3_mb_vs_ossl_shortage_perf: sm3_test_helper.o
+sm3_mb_sm3_mb_vs_ossl_shortage_perf_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..208a7414e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c
@@ -0,0 +1,65 @@
+/**********************************************************************
+ Copyright(c) 2019-2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_submit)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SM3)
+ return PROVIDER_INFO(sm3_ctx_mgr_submit_sm);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(sm3_ctx_mgr_submit_asimd);
+
+ return PROVIDER_BASIC(sm3_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SM3)
+ return PROVIDER_INFO(sm3_ctx_mgr_init_sm);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(sm3_ctx_mgr_init_asimd);
+
+ return PROVIDER_BASIC(sm3_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SM3)
+ return PROVIDER_INFO(sm3_ctx_mgr_flush_sm);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(sm3_ctx_mgr_flush_asimd);
+
+ return PROVIDER_BASIC(sm3_ctx_mgr_flush);
+
+}
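
Editor's note: the three dispatchers above bind sm3_ctx_mgr_init/submit/flush to the SM3-extension, ASIMD, or generic base code once, from the HWCAP bits reported by getauxval(). For orientation, here is a minimal caller sketch of the multi-buffer API these symbols serve; it is not part of the patch, and it assumes the public declarations in include/sm3_mb.h and include/multi_buffer.h (SM3_HASH_CTX_MGR, hash_ctx_init(), HASH_ENTIRE), which is how the unit tests listed in the diffstat drive the code.

/* Hedged usage sketch, not part of this patch. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "sm3_mb.h"

int main(void)
{
	SM3_HASH_CTX_MGR *mgr = NULL;
	SM3_HASH_CTX ctxpool[2];
	const char *msgs[2] = { "abc", "abcd" };

	// The manager is touched by aligned vector loads; keep it 16-byte aligned.
	if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
		return 1;
	sm3_ctx_mgr_init(mgr);

	// Submit independent jobs; each occupies one lane until it completes.
	for (int i = 0; i < 2; i++) {
		hash_ctx_init(&ctxpool[i]);
		sm3_ctx_mgr_submit(mgr, &ctxpool[i], msgs[i],
				   (uint32_t)strlen(msgs[i]), HASH_ENTIRE);
	}

	// Flush until every lane has drained.
	while (sm3_ctx_mgr_flush(mgr) != NULL)
		;

	for (int i = 0; i < 2; i++)
		printf("job %d digest word 0 = %08x\n", i,
		       ctxpool[i].job.result_digest[0]);
	free(mgr);
	return 0;
}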
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S
new file mode 100644
index 000000000..c7362de90
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S
@@ -0,0 +1,387 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+ job .req x0
+ len .req x1
+ data .req x2
+ digest .req x0
+
+ msg0 .req w3
+ msg1 .req w4
+ msg2 .req w5
+ msg3 .req w6
+ msg4 .req w7
+
+ msg .req w9
+ msgP .req w10
+ SS1 .req w11
+ SS2 .req w12
+ TT1 .req w13
+ TT2 .req w14
+ Tj .req w15
+ tmp0 .req w19
+ tmp1 .req w20
+ dig_A .req w21
+ dig_B .req w22
+ dig_C .req w23
+ dig_D .req w24
+ dig_E .req w25
+ dig_F .req w26
+ dig_G .req w27
+ dig_H .req w28
+
+ declare_var_vector_reg dig0,0
+ declare_var_vector_reg dig1,1
+ declare_var_vector_reg dig0_bak,2
+ declare_var_vector_reg dig1_bak,3
+ declare_var_vector_reg vect_msg0,4
+ declare_var_vector_reg vect_msg1,5
+ declare_var_vector_reg vect_msg2,6
+ declare_var_vector_reg vect_msg3,7
+
+ declare_var_vector_reg vect_msgP0,16
+ declare_var_vector_reg vect_msgP1,17
+ declare_var_vector_reg vect_msgP2,18
+
+
+
+
+
+
+// round 0-11
+.macro sm3_round_0 round:req
+ ldr msg, [sp,msg_off+4*\round\()]
+ ldr msgP,[sp,wp_off +4*\round\()]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+ eor TT1,dig_A,dig_B
+ eor TT2,dig_E,dig_F
+ add SS2,SS2,msgP
+ eor TT2,TT2,dig_G
+ add SS1,SS1,msg
+ eor TT1,TT1,dig_C
+ add SS2,SS2,dig_D
+ add SS1,SS1,dig_H
+ add TT1,TT1,SS2
+ add TT2,TT2,SS1
+ mov dig_D,dig_C
+ ror dig_C,dig_B,32-9
+ mov dig_B,dig_A
+ mov dig_A,TT1
+ eor TT1,TT2,TT2,ror (32-17)
+ mov dig_H,dig_G
+ ror dig_G,dig_F,32-19
+ mov dig_F,dig_E
+ eor dig_E,TT1,TT2,ror(32-9)
+ ror Tj,Tj,(32-1)
+.endm
+
+//round 12-15
+.macro sm3_round_12 round:req
+ ldr msg, [sp,msg_off+4*((\round\())%17)]
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+
+ eor msg0,msg0,msg1
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
+ eor TT1,dig_A,dig_B
+ eor TT2,dig_E,dig_F
+ add SS2,SS2,dig_D
+ eor TT2,TT2,dig_G
+ add SS1,SS1,msg
+ eor msg0,msg0,msg2,ror (32-15)
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
+ eor msg1,msg0,msg0,ror (32 -15)
+ eor TT1,TT1,dig_C
+ add TT1,TT1,SS2
+ eor msg4,msg4,msg3, ror (32-7)
+ eor msg0,msg1,msg0, ror (32-23)
+ add SS1,SS1,dig_H
+ eor msg0,msg0,msg4
+ add TT2,TT2,SS1
+ mov dig_D,dig_C
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
+ eor msgP,msg,msg0
+ add TT1,TT1,msgP
+ ror dig_C,dig_B,32-9
+ mov dig_B,dig_A
+ mov dig_A,TT1
+ eor TT1,TT2,TT2,ror (32-17)
+ mov dig_H,dig_G
+ ror dig_G,dig_F,32-19
+ mov dig_F,dig_E
+ eor dig_E,TT1,TT2,ror(32-9)
+ ror Tj,Tj,32-1
+.endm
+
+// round 16-62
+.macro sm3_round_16 round:req
+ ldr msg, [sp,msg_off+4*((\round\())%17)]
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+
+ eor msg0,msg0,msg1
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
+ orr TT1,dig_B,dig_C
+ and tmp0,dig_B,dig_C
+
+ eor TT2,dig_F,dig_G
+ and TT1,TT1,dig_A
+ add SS2,SS2,dig_D
+ orr TT1,TT1,tmp0
+ and TT2,TT2,dig_E
+ add SS1,SS1,msg
+ eor TT2,TT2,dig_G
+
+ eor msg0,msg0,msg2,ror (32-15)
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
+ eor msg1,msg0,msg0,ror (32 -15)
+ add TT1,TT1,SS2
+ eor msg4,msg4,msg3, ror (32-7)
+ eor msg0,msg1,msg0, ror (32-23)
+ add SS1,SS1,dig_H
+ eor msg0,msg0,msg4
+ add TT2,TT2,SS1
+ mov dig_D,dig_C
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
+ eor msgP,msg,msg0
+ add TT1,TT1,msgP
+ ror dig_C,dig_B,32-9
+ mov dig_B,dig_A
+ mov dig_A,TT1
+ eor TT1,TT2,TT2,ror (32-17)
+ mov dig_H,dig_G
+ ror dig_G,dig_F,32-19
+ mov dig_F,dig_E
+ eor dig_E,TT1,TT2,ror(32-9)
+ ror Tj,Tj,32-1
+.endm
+
+//round 63
+.macro sm3_round_63 round:req
+ ldr msg, [sp,msg_off+4*((\round\())%17)]
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+ eor msg0,msg0,msg1
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
+ orr TT1,dig_B,dig_C
+ and tmp0,dig_B,dig_C
+ eor TT2,dig_F,dig_G
+ and TT1,TT1,dig_A
+ add SS2,SS2,dig_D
+ orr TT1,TT1,tmp0
+ and TT2,TT2,dig_E
+ add SS1,SS1,msg
+ eor TT2,TT2,dig_G
+ eor msg0,msg0,msg2,ror (32-15)
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
+ eor msg1,msg0,msg0,ror (32 -15)
+ add TT1,TT1,SS2
+ eor msg4,msg4,msg3, ror (32-7)
+ eor msg0,msg1,msg0, ror (32-23)
+ add SS1,SS1,dig_H
+ eor msg0,msg0,msg4
+ add TT2,TT2,SS1
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
+ eor msgP,msg,msg0
+ add TT1,TT1,msgP
+ ins vdig0_bak.s[3],dig_C
+ ror dig_C,dig_B,32-9
+ ins vdig0_bak.s[1],dig_A
+ ins vdig0_bak.s[0],TT1
+ ins vdig0_bak.s[2],dig_C
+ eor TT1,TT2,TT2,ror (32-17)
+ ins vdig1_bak.s[3],dig_G
+ ror dig_G,dig_F,32-19
+ ins vdig1_bak.s[1],dig_E
+ ins vdig1_bak.s[2],dig_G
+ eor dig_E,TT1,TT2,ror(32-9)
+ ins vdig1_bak.s[0],dig_E
+.endm
+
+ .set wp_off , 96
+ .set msg_off, 96 + 12*4
+#define STACK_SIZE 224
+ .global sm3_mb_asimd_x1
+ .type sm3_mb_asimd_x1, %function
+sm3_mb_asimd_x1:
+ stp x29,x30, [sp,-STACK_SIZE]!
+ cmp len,0
+ ldr data,[job],64
+ ldp qdig0,qdig1,[digest]
+ stp x19, x20, [sp, 16]
+ stp x21, x22, [sp, 32]
+ rev32 vdig0.16b,vdig0.16b
+ stp x23, x24, [sp, 48]
+ rev32 vdig1.16b,vdig1.16b
+ stp x25, x26, [sp, 64]
+ stp x27, x28, [sp, 80]
+ ble .exit_func
+
+.start_loop:
+
+ /** prepare first 12 round data **/
+ ld1 {vvect_msg0.16b-vvect_msg3.16b},[data],64
+ mov Tj, 17689
+ umov dig_A,vdig0.s[0]
+ movk Tj, 0x79cc, lsl 16
+ rev32 vvect_msg0.16b,vvect_msg0.16b
+ umov dig_B,vdig0.s[1]
+ rev32 vvect_msg1.16b,vvect_msg1.16b
+ umov dig_C,vdig0.s[2]
+ rev32 vvect_msg2.16b,vvect_msg2.16b
+ umov dig_D,vdig0.s[3]
+ rev32 vvect_msg3.16b,vvect_msg3.16b
+ umov dig_E,vdig1.s[0]
+ stp qvect_msg0,qvect_msg1,[sp,msg_off]
+ umov dig_F,vdig1.s[1]
+ stp qvect_msg2,qvect_msg3,[sp,msg_off+32]
+ umov dig_G,vdig1.s[2]
+ eor vvect_msgP0.16b,vvect_msg0.16b,vvect_msg1.16b
+ eor vvect_msgP1.16b,vvect_msg1.16b,vvect_msg2.16b
+ umov dig_H,vdig1.s[3]
+ stp qvect_msgP0,qvect_msgP1,[sp,wp_off]
+ eor vvect_msgP2.16b,vvect_msg2.16b,vvect_msg3.16b
+ str qvect_msgP2,[sp,wp_off+32]
+
+ sm3_round_0 0
+ sm3_round_0 1
+ sm3_round_0 2
+ sm3_round_0 3
+ sm3_round_0 4
+ sm3_round_0 5
+ sm3_round_0 6
+ sm3_round_0 7
+ sm3_round_0 8
+ sm3_round_0 9
+ sm3_round_0 10
+ sm3_round_0 11
+
+ sm3_round_12 12
+ sm3_round_12 13
+ sm3_round_12 14
+ sm3_round_12 15
+ mov Tj, 0x7a87
+ movk Tj, 0x9d8a, lsl 16
+ sm3_round_16 16
+ sm3_round_16 17
+ sm3_round_16 18
+ sm3_round_16 19
+ sm3_round_16 20
+ sm3_round_16 21
+ sm3_round_16 22
+ sm3_round_16 23
+ sm3_round_16 24
+ sm3_round_16 25
+ sm3_round_16 26
+ sm3_round_16 27
+ sm3_round_16 28
+ sm3_round_16 29
+ sm3_round_16 30
+ sm3_round_16 31
+ sm3_round_16 32
+ sm3_round_16 33
+ sm3_round_16 34
+ sm3_round_16 35
+ sm3_round_16 36
+ sm3_round_16 37
+ sm3_round_16 38
+ sm3_round_16 39
+ sm3_round_16 40
+ sm3_round_16 41
+ sm3_round_16 42
+ sm3_round_16 43
+ sm3_round_16 44
+ sm3_round_16 45
+ sm3_round_16 46
+ sm3_round_16 47
+ sm3_round_16 48
+ sm3_round_16 49
+ sm3_round_16 50
+ sm3_round_16 51
+ sm3_round_16 52
+ sm3_round_16 53
+ sm3_round_16 54
+ sm3_round_16 55
+ sm3_round_16 56
+ sm3_round_16 57
+ sm3_round_16 58
+ sm3_round_16 59
+ sm3_round_16 60
+ sm3_round_16 61
+ sm3_round_16 62
+ sm3_round_63 63
+ subs len,len,1
+ eor vdig0.16b,vdig0.16b,vdig0_bak.16b
+ eor vdig1.16b,vdig1.16b,vdig1_bak.16b
+ bne .start_loop
+.exit_func:
+ ldp x19, x20, [sp, 16]
+ rev32 vdig0.16b,vdig0.16b
+ ldp x21, x22, [sp, 32]
+ rev32 vdig1.16b,vdig1.16b
+ ldp x23, x24, [sp, 48]
+ stp qdig0,qdig1,[digest]
+ ldp x25, x26, [sp, 64]
+ ldp x27, x28, [sp, 80]
+ ldp x29, x30, [sp], STACK_SIZE
+ ret
+ .size sm3_mb_asimd_x1, .-sm3_mb_asimd_x1
+
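Editor's note, as a reading aid for the register-level rounds above: sm3_round_0/sm3_round_12 use the XOR forms of the SM3 boolean functions (rounds 0-15), sm3_round_16 uses the majority/choose forms (rounds 16-63), and every variant ends by applying P0 to TT2 to produce the new E. The Tj register holds a running constant pre-rotated by the round number, which is why round 16 reloads it with 0x9d8a7a87 (0x7a879d8a rotated left by 16). A scalar C sketch of one round, mirroring the SS1/SS2/TT1/TT2 names — an illustration, not shipped code:

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	n &= 31;
	return n ? (v << n) | (v >> (32 - n)) : v;
}

static inline uint32_t p0(uint32_t x)
{
	return x ^ rotl32(x, 9) ^ rotl32(x, 17);
}

/* One SM3 compression round j; d[0..7] hold A..H, W is W[j], Wp is W[j] ^ W[j+4]. */
static void sm3_round(uint32_t d[8], uint32_t W, uint32_t Wp, int j)
{
	uint32_t Tj = (j < 16) ? 0x79cc4519 : 0x7a879d8a;
	uint32_t SS1 = rotl32(rotl32(d[0], 12) + d[4] + rotl32(Tj, j), 7);
	uint32_t SS2 = SS1 ^ rotl32(d[0], 12);
	uint32_t FF = (j < 16) ? (d[0] ^ d[1] ^ d[2])	/* sm3_round_0/_12 */
	    : ((d[0] & (d[1] | d[2])) | (d[1] & d[2]));	/* majority, as in sm3_round_16 */
	uint32_t GG = (j < 16) ? (d[4] ^ d[5] ^ d[6])
	    : (((d[5] ^ d[6]) & d[4]) ^ d[6]);	/* choose, computed the way the asm does */
	uint32_t TT1 = FF + d[3] + SS2 + Wp;
	uint32_t TT2 = GG + d[7] + SS1 + W;

	d[3] = d[2];
	d[2] = rotl32(d[1], 9);
	d[1] = d[0];
	d[0] = TT1;
	d[7] = d[6];
	d[6] = rotl32(d[5], 19);
	d[5] = d[4];
	d[4] = p0(TT2);
}
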
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S
new file mode 100644
index 000000000..975a07c7a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S
@@ -0,0 +1,576 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ len .req x4
+
+ job0_data .req x5
+ job1_data .req x6
+ job2_data .req x7
+ job3_data .req x9
+
+ job0_digest .req x0
+ job1_digest .req x1
+ job2_digest .req x2
+ job3_digest .req x3
+ job0_tmp .req x10
+ job1_tmp .req x11
+ job2_tmp .req x12
+ job3_tmp .req x13
+ const_adr .req x14
+
+
+ declare_var_vector_reg msg0,0
+ declare_var_vector_reg msg1,1
+ declare_var_vector_reg msg2,2
+ declare_var_vector_reg msg3,3
+ declare_var_vector_reg msg4,4
+ declare_var_vector_reg msg5,5
+ declare_var_vector_reg msg6,6
+ declare_var_vector_reg msg7,7
+ declare_var_vector_reg msg8,8
+ declare_var_vector_reg msg9,9
+ declare_var_vector_reg msg10,10
+ declare_var_vector_reg msg11,11
+ declare_var_vector_reg msg12,12
+ declare_var_vector_reg msg13,13
+ declare_var_vector_reg msg14,14
+ declare_var_vector_reg msg15,15
+ declare_var_vector_reg msg16,16
+
+
+ declare_var_vector_reg dig_A,24
+ declare_var_vector_reg dig_B,25
+ declare_var_vector_reg dig_C,26
+ declare_var_vector_reg dig_D,27
+ declare_var_vector_reg dig_E,28
+ declare_var_vector_reg dig_F,29
+ declare_var_vector_reg dig_G,30
+ declare_var_vector_reg dig_H,31
+
+ declare_var_vector_reg TT1,17
+ declare_var_vector_reg TT2,18
+ declare_var_vector_reg SS1,19
+ declare_var_vector_reg SS2,20
+ declare_var_vector_reg tmp0,21
+ declare_var_vector_reg word_pair,23
+ declare_var_vector_reg Tj,22
+
+
+.macro rol32 target:req,reg:req,bit:req
+ ushr v\target\().4s,v\reg\().4s,32 - \bit
+ sli v\target\().4s,v\reg\().4s,\bit
+.endm
+
+// round 0-11
+.macro sm3_round_0 round:req,wp:req
+
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ sli vtmp0.4s,vdig_A.4s,12
+ rev32 vmsg\round\().16b,vmsg\round\().16b
+ rev32 vmsg\wp\().16b,vmsg\wp\().16b
+ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,TT1,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
+
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+
+.endm
+
+
+.macro sm3_round_4 round:req,wp:req
+
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ sli vtmp0.4s,vdig_A.4s,12
+ rev32 vmsg\wp\().16b,vmsg\wp\().16b
+ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,TT1,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+
+.endm
+
+//round 12-15
+.macro sm3_round_12 round:req,plus_4:req,m0,m1,m2,m3,m4
+ rol32 msg\plus_4,msg\m2,15
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+ rol32 tmp0,msg\plus_4,15
+ rol32 word_pair,msg\plus_4,23
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+ rol32 tmp0,msg\m3,7
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ sli vtmp0.4s,vdig_A.4s,12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,SS2,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+.endm
+
+// round 16-62
+.macro sm3_round_16 round:req,plus_4:req,m0,m1,m2,m3,m4
+ rol32 msg\plus_4,msg\m2,15
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+ rol32 tmp0,msg\plus_4,15
+ rol32 word_pair,msg\plus_4,23
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+ rol32 tmp0,msg\m3,7
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ sli vtmp0.4s,vdig_A.4s,12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,SS2,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+ mov vTT2.16b,vdig_E.16b
+ orr vTT1.16b,vdig_B.16b,vdig_C.16b
+ and vtmp0.16b,vdig_B.16b,vdig_C.16b
+ bsl vTT2.16b,vdig_F.16b,vdig_G.16b
+ and vTT1.16b,vTT1.16b,vdig_A.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ orr vTT1.16b,vTT1.16b,vtmp0.16b
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+.endm
+
+//round 63
+.macro sm3_round_63 round:req,plus_4:req,m0,m1,m2,m3,m4
+ rol32 msg\plus_4,msg\m2,15
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+ rol32 tmp0,msg\plus_4,15
+ rol32 word_pair,msg\plus_4,23
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+ rol32 tmp0,msg\m3,7
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ sli vtmp0.4s,vdig_A.4s,12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,SS2,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+
+ ldp qmsg0,qmsg1,[sp,dig_off+ 0]
+ mov vTT2.16b,vdig_E.16b
+ ldp qmsg2,qmsg3,[sp,dig_off+ 32]
+ orr vTT1.16b,vdig_B.16b,vdig_C.16b
+ ldp qmsg4,qmsg5,[sp,dig_off+ 64]
+ and vtmp0.16b,vdig_B.16b,vdig_C.16b
+ bsl vTT2.16b,vdig_F.16b,vdig_G.16b
+ ldp qmsg6,qmsg7,[sp,dig_off+ 96]
+ and vTT1.16b,vTT1.16b,vdig_A.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ orr vTT1.16b,vTT1.16b,vtmp0.16b
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ //D=C
+ eor vdig_D.16b,vdig_C.16b,vmsg3.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ eor vdig_C.16b,vdig_C.16b,vmsg2.16b
+ //B=A
+ eor vdig_B.16b,vdig_A.16b,vmsg1.16b
+ stp qdig_C,qdig_D,[sp,dig_off+ 32]
+ //A=TT1
+ eor vdig_A.16b,vTT1.16b,vmsg0.16b
+ // H=G
+ eor vdig_H.16b,vdig_G.16b,vmsg7.16b
+ stp qdig_A,qdig_B,[sp,dig_off+ 0]
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ eor vdig_G.16b,vdig_G.16b,vmsg6.16b
+ //F = E
+ eor vdig_F.16b,vdig_E.16b,vmsg5.16b
+ stp qdig_G,qdig_H,[sp,dig_off+ 96]
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+ eor vdig_E.16b, vdig_E.16b, vmsg4.16b
+ stp qdig_E,qdig_F,[sp,dig_off+ 64]
+.endm
+
+ .set dig_off , 80
+
+#define STACK_SIZE 224
+ .global sm3_mb_asimd_x4
+ .type sm3_mb_asimd_x4, %function
+sm3_mb_asimd_x4:
+ stp x29,x30, [sp,-STACK_SIZE]!
+ cmp len,0
+ //push d8~d15
+ ldr job0_data, [job0],64
+ stp d8,d9, [sp,16]
+ ldr job1_data, [job1],64
+ stp d10,d11,[sp,32]
+ ldr job2_data, [job2],64
+ stp d12,d13,[sp,48]
+ ldr job3_data, [job3],64
+ stp d14,d15,[sp,64]
+ ble .exit_func
+
+ mov job0_tmp,job0_digest
+ mov job1_tmp,job1_digest
+ mov job2_tmp,job2_digest
+ mov job3_tmp,job3_digest
+ //load digests
+ ld4 {vdig_A.s-vdig_D.s}[0],[job0_tmp],16
+ ld4 {vdig_A.s-vdig_D.s}[1],[job1_tmp],16
+ ld4 {vdig_A.s-vdig_D.s}[2],[job2_tmp],16
+ adrp const_adr, .consts
+ ld4 {vdig_A.s-vdig_D.s}[3],[job3_tmp],16
+ add const_adr, const_adr, #:lo12:.consts
+ ld4 {vdig_E.s-vdig_H.s}[0],[job0_tmp]
+ rev32 vdig_A.16b,vdig_A.16b
+ ld4 {vdig_E.s-vdig_H.s}[1],[job1_tmp]
+ rev32 vdig_B.16b,vdig_B.16b
+ ld4 {vdig_E.s-vdig_H.s}[2],[job2_tmp]
+ rev32 vdig_C.16b,vdig_C.16b
+ ld4 {vdig_E.s-vdig_H.s}[3],[job3_tmp]
+ rev32 vdig_D.16b,vdig_D.16b
+ stp qdig_A,qdig_B,[sp,dig_off+ 0]
+ rev32 vdig_E.16b,vdig_E.16b
+ rev32 vdig_F.16b,vdig_F.16b
+ stp qdig_C,qdig_D,[sp,dig_off+ 32]
+ rev32 vdig_G.16b,vdig_G.16b
+ rev32 vdig_H.16b,vdig_H.16b
+ stp qdig_E,qdig_F,[sp,dig_off+ 64]
+ stp qdig_G,qdig_H,[sp,dig_off+ 96]
+
+.start_loop:
+ ld4 {vmsg0.s-vmsg3.s}[0],[job0_data],16
+ ld4 {vmsg0.s-vmsg3.s}[1],[job1_data],16
+ ld4 {vmsg0.s-vmsg3.s}[2],[job2_data],16
+ ld4 {vmsg0.s-vmsg3.s}[3],[job3_data],16
+ ld4 {vmsg4.s-vmsg7.s}[0],[job0_data],16
+ ld4 {vmsg4.s-vmsg7.s}[1],[job1_data],16
+ ld4 {vmsg4.s-vmsg7.s}[2],[job2_data],16
+ ld4 {vmsg4.s-vmsg7.s}[3],[job3_data],16
+	ld4 {vmsg8.s-vmsg11.s}[0],[job0_data],16
+ ldr qTj,[const_adr]
+
+ sm3_round_0 0, 4
+
+ ld4 {vmsg8.s-vmsg11.s}[1],[job1_data],16
+ sm3_round_0 1, 5
+
+ ld4 {vmsg8.s-vmsg11.s}[2],[job2_data],16
+ sm3_round_0 2, 6
+ ld4 {vmsg8.s-vmsg11.s}[3],[job3_data],16
+ sm3_round_0 3, 7
+
+ ld4 {vmsg12.s-vmsg15.s}[0],[job0_data],16
+
+ sm3_round_4 4, 8
+ ld4 {vmsg12.s-vmsg15.s}[1],[job1_data],16
+ sm3_round_4 5, 9
+ ld4 {vmsg12.s-vmsg15.s}[2],[job2_data],16
+ sm3_round_4 6,10
+ ld4 {vmsg12.s-vmsg15.s}[3],[job3_data],16
+ sm3_round_4 7,11
+ sm3_round_4 8,12
+ sm3_round_4 9,13
+ sm3_round_4 10,14
+ sm3_round_4 11,15
+
+ sm3_round_12 12,16, 0, 7,13, 3,10 //12
+ sm3_round_12 13, 0, 1, 8,14, 4,11 //13
+ sm3_round_12 14, 1, 2, 9,15, 5,12 //14
+ sm3_round_12 15, 2, 3,10,16, 6,13 //15
+
+ ldr qTj,[const_adr,16]
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //16
+#if 0
+ stp sdig_A,sdig_B,[job0_digest]
+ stp sdig_C,sdig_D,[job0_digest,8]
+ stp sdig_E,sdig_F,[job0_digest,16]
+ stp sdig_G,sdig_H,[job0_digest,24]
+ b .exit_func
+#endif
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //17
+
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //18
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //19
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //20
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //21
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //22
+ sm3_round_16 6,10,11, 1, 7,14, 4 //23
+ sm3_round_16 7,11,12, 2, 8,15, 5 //24
+ sm3_round_16 8,12,13, 3, 9,16, 6 //25
+ sm3_round_16 9,13,14, 4,10, 0, 7 //26
+ sm3_round_16 10,14,15, 5,11, 1, 8 //27
+ sm3_round_16 11,15,16, 6,12, 2, 9 //28
+ sm3_round_16 12,16, 0, 7,13, 3,10 //29
+ sm3_round_16 13, 0, 1, 8,14, 4,11 //30
+ sm3_round_16 14, 1, 2, 9,15, 5,12 //31
+ sm3_round_16 15, 2, 3,10,16, 6,13 //32
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //33
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //34
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //35
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //36
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //37
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //38
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //39
+ sm3_round_16 6,10,11, 1, 7,14, 4 //40
+ sm3_round_16 7,11,12, 2, 8,15, 5 //41
+ sm3_round_16 8,12,13, 3, 9,16, 6 //42
+ sm3_round_16 9,13,14, 4,10, 0, 7 //43
+ sm3_round_16 10,14,15, 5,11, 1, 8 //44
+ sm3_round_16 11,15,16, 6,12, 2, 9 //45
+ sm3_round_16 12,16, 0, 7,13, 3,10 //46
+ sm3_round_16 13, 0, 1, 8,14, 4,11 //47
+ sm3_round_16 14, 1, 2, 9,15, 5,12 //48
+ sm3_round_16 15, 2, 3,10,16, 6,13 //49
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //50
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //51
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //52
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //53
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //54
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //55
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //56
+ sm3_round_16 6,10,11, 1, 7,14, 4 //57
+ sm3_round_16 7,11,12, 2, 8,15, 5 //58
+ sm3_round_16 8,12,13, 3, 9,16, 6 //59
+ sm3_round_16 9,13,14, 4,10, 0, 7 //60
+ sm3_round_16 10,14,15, 5,11, 1, 8 //61
+ sm3_round_16 11,15,16, 6,12, 2, 9 //62
+ sm3_round_63 12,16, 0, 7,13, 3,10 //63
+
+ subs len,len,1
+ bne .start_loop
+
+ //save digests with big endian
+ rev32 vdig_A.16b,vdig_A.16b
+ rev32 vdig_B.16b,vdig_B.16b
+ rev32 vdig_C.16b,vdig_C.16b
+ rev32 vdig_D.16b,vdig_D.16b
+ st4 {vdig_A.s-vdig_D.s}[0],[job0_digest],16
+ rev32 vdig_E.16b,vdig_E.16b
+ rev32 vdig_F.16b,vdig_F.16b
+ st4 {vdig_A.s-vdig_D.s}[1],[job1_digest],16
+ rev32 vdig_G.16b,vdig_G.16b
+ rev32 vdig_H.16b,vdig_H.16b
+ st4 {vdig_A.s-vdig_D.s}[2],[job2_digest],16
+ st4 {vdig_A.s-vdig_D.s}[3],[job3_digest],16
+ st4 {vdig_E.s-vdig_H.s}[0],[job0_digest]
+ st4 {vdig_E.s-vdig_H.s}[1],[job1_digest]
+ st4 {vdig_E.s-vdig_H.s}[2],[job2_digest]
+ st4 {vdig_E.s-vdig_H.s}[3],[job3_digest]
+
+.exit_func:
+ ldp d8, d9, [sp,16]
+ ldp d10,d11,[sp,32]
+ ldp d12,d13,[sp,48]
+ ldp d14,d15,[sp,64]
+ ldp x29, x30, [sp], STACK_SIZE
+ ret
+.consts:
+ .word 0x79cc4519
+ .word 0x79cc4519
+ .word 0x79cc4519
+ .word 0x79cc4519
+ .word 0x9d8a7a87
+ .word 0x9d8a7a87
+ .word 0x9d8a7a87
+ .word 0x9d8a7a87
+ .size sm3_mb_asimd_x4, .-sm3_mb_asimd_x4
+
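Editor's note: the x4 kernel never holds all 68 schedule words at once. The vector registers msg0..msg16 form a 17-entry ring, and each sm3_round_12/sm3_round_16 invocation overwrites slot (round + 4) mod 17 — exactly the rotation the long argument lists above encode. A one-lane scalar sketch of that schedule update (illustration only):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	n &= 31;
	return n ? (v << n) | (v >> (32 - n)) : v;
}

/*
 * Round j overwrites ring slot (j + 4) % 17 with
 *   W[j+4] = P1(W[j-12] ^ W[j-5] ^ ROTL(W[j+1], 15))
 *            ^ ROTL(W[j-9], 7) ^ W[j-2]
 * which is the same index pattern as the macro arguments above.
 */
static uint32_t sm3_expand(uint32_t w[17], int j)
{
	uint32_t x = w[(j + 4 - 16) % 17] ^ w[(j + 4 - 9) % 17]
	    ^ rotl32(w[(j + 4 - 3) % 17], 15);
	x = x ^ rotl32(x, 15) ^ rotl32(x, 23);	/* P1 */
	x ^= rotl32(w[(j + 4 - 13) % 17], 7) ^ w[(j + 4 - 6) % 17];
	return w[(j + 4) % 17] = x;
}
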
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c
new file mode 100644
index 000000000..6e1dff45e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c
@@ -0,0 +1,246 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+#define SM3_LOG2_BLOCK_SIZE 6
+void sm3_mb_mgr_init_asimd(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_asimd(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_asimd(SM3_MB_JOB_MGR * state);
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_ctx_mgr_init_asimd(SM3_HASH_CTX_MGR * mgr)
+{
+ sm3_mb_mgr_init_asimd(&mgr->mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_asimd(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_asimd(SM3_HASH_CTX_MGR * mgr)
+{
+ SM3_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_asimd(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SM3_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SM3_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define cpu_to_be32(v) ((((v) & 0xff000000) >> 24) | (((v) & 0xff0000) >> 8) | (((v) & 0xff00) << 8) | (((v) & 0xff) << 24))
+#else
+#define cpu_to_be32(v) (v)
+#endif
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { cpu_to_be32(0x7380166f), cpu_to_be32(0x4914b2b9),
+ cpu_to_be32(0x172442d7), cpu_to_be32(0xda8a0600),
+ cpu_to_be32(0xa96f30bc), cpu_to_be32(0x163138aa),
+ cpu_to_be32(0xe38dee4d), cpu_to_be32(0xb0fb0e4e)
+ };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
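Editor's note: hash_pad() above is branchless. It writes the 0x80 marker, zero-fills, appends the 64-bit big-endian bit count, and returns how many whole blocks the padded tail spans — two whenever fewer than 9 spare bytes remain in the final block. A self-contained check of that arithmetic, assuming SM3_BLOCK_SIZE is 64 and SM3_PADLENGTHFIELD_SIZE is 8, as the 64-byte block format implies (sketch only):

#include <assert.h>
#include <stdint.h>

/* Mirrors the index arithmetic in hash_pad() with the constants inlined. */
static uint32_t pad_blocks(uint64_t total_len)
{
	uint32_t i = (uint32_t)(total_len & 63);	/* bytes used in last block */
	i += (uint32_t)((63 & (0 - (total_len + 8 + 1))) + 1 + 8);
	return i >> 6;	/* number of extra blocks to hash */
}

int main(void)
{
	assert(pad_blocks(55) == 1);	/* 0x80 and the length field still fit */
	assert(pad_blocks(56) == 2);	/* length field spills into a 2nd block */
	assert(pad_blocks(64) == 1);	/* fresh block: pad occupies exactly one */
	return 0;
}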
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c
new file mode 100644
index 000000000..5af9ead38
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c
@@ -0,0 +1,241 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+#define SM3_LOG2_BLOCK_SIZE 6
+void sm3_mb_mgr_init_sm(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_sm(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_sm(SM3_MB_JOB_MGR * state);
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_ctx_mgr_init_sm(SM3_HASH_CTX_MGR * mgr)
+{
+ sm3_mb_mgr_init_sm(&mgr->mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_sm(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_sm(SM3_HASH_CTX_MGR * mgr)
+{
+ SM3_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_sm(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SM3_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SM3_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { to_be32(0x7380166f), to_be32(0x4914b2b9),
+ to_be32(0x172442d7), to_be32(0xda8a0600),
+ to_be32(0xa96f30bc), to_be32(0x163138aa),
+ to_be32(0xe38dee4d), to_be32(0xb0fb0e4e)
+ };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c
new file mode 100644
index 000000000..48a0d4d0e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c
@@ -0,0 +1,188 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sm3_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SM3_MB_CE_MAX_LANES 4
+void sm3_mb_asimd_x4(SM3_JOB *, SM3_JOB *, SM3_JOB *, SM3_JOB *, int);
+void sm3_mb_asimd_x1(SM3_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
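+
+/*
+ * Each lens[i] packs (blocks_remaining << 4) | lane_index; the low nibble
+ * names the lane, so a lane is finished once its upper bits reach zero.
+ */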
+void sm3_mb_mgr_init_asimd(SM3_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= SM3_MB_CE_MAX_LANES - 1 - i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	// Lanes beyond SM3_MB_CE_MAX_LANES are marked invalid
+ for (; i < SM3_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sm3_mb_mgr_do_jobs(SM3_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i;
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+ if (state->num_lanes_inuse == 4) {
+ len = min(min(state->lens[0], state->lens[1]),
+ min(state->lens[2], state->lens[3]));
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+ sm3_mb_asimd_x4(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane,
+ state->ldata[3].job_in_lane, len >> 4);
+		// Advance every unfinished lane; only the minimum-length job completes
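+		// len is still in packed (blocks << 4) units, so len << 2 below
+		// advances each buffer by blocks * SM3_BLOCK_SIZE bytes.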
+ for (i = 0; i < SM3_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+ } else {
+ for (i = 0; i < SM3_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ len = state->lens[i] & (~0xf);
+ sm3_mb_asimd_x1(state->ldata[i].job_in_lane, len >> 4);
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ return i;
+ }
+ }
+ }
+ return -1;
+
+}
+
+static SM3_JOB *sm3_mb_mgr_free_lane(SM3_MB_JOB_MGR * state)
+{
+ int i;
+ SM3_JOB *ret = NULL;
+
+ for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sm3_mb_mgr_insert_job(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+	// A free lane must exist here; running out of valid lanes is a fatal error
+ assert(lane_idx < SM3_MB_CE_MAX_LANES);
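+	// Pack (blocks << 4) | lane_index so the minimum search can recover
+	// which lane holds the shortest job from the low nibble.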
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SM3_JOB *sm3_mb_mgr_submit_asimd(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SM3_JOB *ret;
+
+ //add job into lanes
+ sm3_mb_mgr_insert_job(state, job);
+
+ ret = sm3_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	// Submit only starts processing once every lane holds a job
+ if (state->num_lanes_inuse < SM3_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sm3_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sm3_mb_mgr_do_jobs(state);
+#endif
+
+ ret = sm3_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SM3_JOB *sm3_mb_mgr_flush_asimd(SM3_MB_JOB_MGR * state)
+{
+ SM3_JOB *ret;
+ ret = sm3_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sm3_mb_mgr_do_jobs(state);
+ return sm3_mb_mgr_free_lane(state);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c
new file mode 100644
index 000000000..a7178e0be
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sm3_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SM3_MB_CE_MAX_LANES 4
+#if SM3_MB_CE_MAX_LANES >=4
+void sm3_mb_sm_x4(SM3_JOB *, SM3_JOB *, SM3_JOB *, SM3_JOB *, int);
+#endif
+#if SM3_MB_CE_MAX_LANES >=3
+void sm3_mb_sm_x3(SM3_JOB *, SM3_JOB *, SM3_JOB *, int);
+#endif
+#if SM3_MB_CE_MAX_LANES >=2
+void sm3_mb_sm_x2(SM3_JOB *, SM3_JOB *, int);
+#endif
+void sm3_mb_sm_x1(SM3_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
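+
+/*
+ * Each lens[i] packs (blocks_remaining << 4) | lane_index; the low nibble
+ * names the lane, so a lane is finished once its upper bits reach zero.
+ */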
+void sm3_mb_mgr_init_sm(SM3_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= SM3_MB_CE_MAX_LANES - 1 - i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	// Lanes beyond SM3_MB_CE_MAX_LANES are marked invalid
+ for (; i < SM3_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sm3_mb_mgr_do_jobs(SM3_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes;
+
+ int lane_idx_array[SM3_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+#if SM3_MB_CE_MAX_LANES == 4
+ if (state->num_lanes_inuse == 4) {
+ len = min(min(state->lens[0], state->lens[1]),
+ min(state->lens[2], state->lens[3]));
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sm3_mb_sm_x4(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane,
+ state->ldata[3].job_in_lane, len >> 4);
+
+ } else
+#elif SM3_MB_CE_MAX_LANES == 3
+ if (state->num_lanes_inuse == 3) {
+ len = min(min(state->lens[0], state->lens[1]), state->lens[2]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sm3_mb_sm_x3(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane, len >> 4);
+
+ } else
+#elif SM3_MB_CE_MAX_LANES == 2
+ if (state->num_lanes_inuse == 2) {
+ len = min(state->lens[0], state->lens[1]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+ sm3_mb_sm_x2(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane, len >> 4);
+
+ } else
+#endif
+ {
+ lanes = 0, len = 0;
+ for (i = 0; i < SM3_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+ if (lanes == 0)
+ return -1;
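+		// The minimum packed value carries both the smallest block count
+		// (upper bits) and the lane that owns it (low nibble).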
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+#if SM3_MB_CE_MAX_LANES >=4
+ if (lanes == 4) {
+ sm3_mb_sm_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane,
+ state->ldata[lane_idx_array[3]].job_in_lane, len >> 4);
+ } else
+#endif
+#if SM3_MB_CE_MAX_LANES >=3
+ if (lanes == 3) {
+ sm3_mb_sm_x3(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane, len >> 4);
+ } else
+#endif
+#if SM3_MB_CE_MAX_LANES >=2
+ if (lanes == 2) {
+ sm3_mb_sm_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+ } else
+#endif
+ {
+ sm3_mb_sm_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+ }
+ }
+	// Advance every unfinished lane; only the minimum-length job completes
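+	// len is still in packed (blocks << 4) units, so len << 2 below
+	// advances each buffer by blocks * SM3_BLOCK_SIZE bytes.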
+ for (i = 0; i < SM3_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+
+}
+
+static SM3_JOB *sm3_mb_mgr_free_lane(SM3_MB_JOB_MGR * state)
+{
+ int i;
+ SM3_JOB *ret = NULL;
+
+ for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sm3_mb_mgr_insert_job(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+	// A free lane must exist here; running out of valid lanes is a fatal error
+ assert(lane_idx < SM3_MB_CE_MAX_LANES);
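+	// Pack (blocks << 4) | lane_index so the minimum search can recover
+	// which lane holds the shortest job from the low nibble.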
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SM3_JOB *sm3_mb_mgr_submit_sm(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SM3_JOB *ret;
+
+ //add job into lanes
+ sm3_mb_mgr_insert_job(state, job);
+
+ ret = sm3_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	// Submit only starts processing once every lane holds a job
+ if (state->num_lanes_inuse < SM3_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sm3_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sm3_mb_mgr_do_jobs(state);
+#endif
+
+ ret = sm3_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SM3_JOB *sm3_mb_mgr_flush_sm(SM3_MB_JOB_MGR * state)
+{
+ SM3_JOB *ret;
+ ret = sm3_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sm3_mb_mgr_do_jobs(state);
+ return sm3_mb_mgr_free_lane(state);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S
new file mode 100644
index 000000000..836bd9ccc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface sm3_ctx_mgr_submit
+mbin_interface sm3_ctx_mgr_init
+mbin_interface sm3_ctx_mgr_flush
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S
new file mode 100644
index 000000000..f92ac5e9f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S
@@ -0,0 +1,237 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+sm4
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
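+// message_expand derives the next four expanded message words from the
+// previous sixteen using the sm3partw1/sm3partw2 instructions.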
+.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ ext v\msg4\().16b, v\msg1\().16b, v\msg2\().16b, #12
+ ext v\tmp0\().16b, v\msg0\().16b, v\msg1\().16b, #12
+ ext v\tmp1\().16b, v\msg2\().16b, v\msg3\().16b, #8
+ sm3partw1 v\msg4\().4s, v\msg0\().4s, v\msg3\().4s
+ sm3partw2 v\msg4\().4s, v\tmp1\().4s, v\tmp0\().4s
+
+.endm
+
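+// quad_round runs four SM3 compression rounds: tmp0 = msg0 ^ msg1 forms the
+// W' words, sm3ss1 computes the SS1 value, and sm3tt1/sm3tt2 update the two
+// digest halves, rotating the round-constant vector by one word each round.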
+.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req
+ eor v\tmp0\().16b, v\msg0\().16b, v\msg1\().16b
+
+
+ sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[0]
+ sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[0]
+
+ sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[1]
+ sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[1]
+
+ sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[2]
+ sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[2]
+
+ sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[3]
+ sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[3]
+
+.endm
+
+.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1
+ quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1
+.endm
+ job .req x0
+ len .req x1
+ data .req x2
+ digest .req x0
+ end_ptr .req x1
+
+
+ declare_var_vector_reg msg0,0
+ declare_var_vector_reg msg1,1
+ declare_var_vector_reg msg2,2
+ declare_var_vector_reg msg3,3
+ declare_var_vector_reg msg4,4
+ declare_var_vector_reg dig0,5
+ declare_var_vector_reg dig1,6
+ declare_var_vector_reg backup_dig0, 7
+
+ declare_var_vector_reg tmp0,16
+ declare_var_vector_reg tmp1,17
+ declare_var_vector_reg backup_dig1, 18
+
+ declare_var_vector_reg const0,19
+ declare_var_vector_reg const1,20
+ declare_var_vector_reg const2,21
+ declare_var_vector_reg const3,22
+ declare_var_vector_reg const4,23
+ declare_var_vector_reg const5,24
+ declare_var_vector_reg const6,25
+ declare_var_vector_reg const7,26
+ declare_var_vector_reg const8,27
+ declare_var_vector_reg const9,28
+ declare_var_vector_reg const10,29
+ declare_var_vector_reg const11,30
+
+
+
+
+ .global sm3_mb_sm_x1
+ .type sm3_mb_sm_x1, %function
+sm3_mb_sm_x1:
+ adrp x3,.consts
+ ldr data, [job],64
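+	// SM3_JOB begins with the buffer pointer; the post-increment of 64
+	// leaves x0 at the result digest (assuming the SM3_JOB layout keeps
+	// the digest 64-byte aligned at offset 64).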
+ add x3,x3,:lo12:.consts
+ ldp qdig0,qdig1,[digest]
+ ld1 {vconst0.16b-vconst3.16b},[x3],64
+ add end_ptr,data,len,lsl 6
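+	// len counts 64-byte blocks, so end_ptr = data + len * 64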
+ ld1 {vconst4.16b-vconst7.16b},[x3],64
+ //rev128
+ ext vdig0.16b,vdig0.16b,vdig0.16b,#8
+ ext vdig1.16b,vdig1.16b,vdig1.16b,#8
+ ld1 {vconst8.16b-vconst11.16b},[x3],64
+ rev64 vdig0.16b,vdig0.16b
+ rev64 vdig1.16b,vdig1.16b
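+	// ext #8 plus rev64 byte-reverse each whole 128-bit vector ("rev128"),
+	// converting the stored big-endian digest words into the lane order
+	// the sm3 instructions consume.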
+
+
+start_loop:
+ mov vbackup_dig0.16b,vdig0.16b
+ mov vbackup_dig1.16b,vdig1.16b
+ ldp qmsg0,qmsg1,[data],32
+ ldp qmsg2,qmsg3,[data],32
+
+ // big-endian to little-endian
+ rev32 vmsg0.16b,vmsg0.16b
+ rev32 vmsg1.16b,vmsg1.16b
+ rev32 vmsg2.16b,vmsg2.16b
+ rev32 vmsg3.16b,vmsg3.16b
+
+ quad_round_expand a, const0, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand a, const1, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand a, const2, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand a, const3, dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const4, dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const5, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const6, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const7, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand b, const8, dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const9, dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const4, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+
+
+ quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1
+ cmp data,end_ptr
+ quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1
+ quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1
+
+ eor vdig0.16b,vdig0.16b,vbackup_dig0.16b
+ eor vdig1.16b,vdig1.16b,vbackup_dig1.16b
+
+
+ bcc start_loop
+
+ //rev128
+ ext vdig0.16b,vdig0.16b,vdig0.16b,#8
+ ext vdig1.16b,vdig1.16b,vdig1.16b,#8
+ rev64 vdig0.16b,vdig0.16b
+ rev64 vdig1.16b,vdig1.16b
+ str qdig0,[digest]
+ str qdig1,[digest,16]
+ ret
+ dsb ish
+ isb
+ .align 2
+.consts:
+ .word 0xce6228cb // 3
+ .word 0xe7311465 // 2
+ .word 0xf3988a32 // 1
+ .word 0x79cc4519 // 0
+ .word 0xe6228cbc // 7
+ .word 0x7311465e // 6
+ .word 0x3988a32f // 5
+ .word 0x9cc45197 // 4
+ .word 0x6228cbce //11
+ .word 0x311465e7 //10
+ .word 0x988a32f3 // 9
+ .word 0xcc451979 // 8
+ .word 0x228cbce6 //15
+ .word 0x11465e73 //14
+ .word 0x88a32f39 //13
+ .word 0xc451979c //12
+ .word 0xec53d43c //19
+ .word 0x7629ea1e //18
+ .word 0x3b14f50f //17
+ .word 0x9d8a7a87 //16
+ .word 0xc53d43ce //23
+ .word 0x629ea1e7 //22
+ .word 0xb14f50f3 //21
+ .word 0xd8a7a879 //20
+ .word 0x53d43cec //27
+ .word 0x29ea1e76 //26
+ .word 0x14f50f3b //25
+ .word 0x8a7a879d //24
+ .word 0x3d43cec5 //31
+ .word 0x9ea1e762 //30
+ .word 0x4f50f3b1 //29
+ .word 0xa7a879d8 //28
+ .word 0xd43cec53 //35
+ .word 0xea1e7629 //34
+ .word 0xf50f3b14 //33
+ .word 0x7a879d8a //32
+ .word 0x43cec53d //39
+ .word 0xa1e7629e //38
+ .word 0x50f3b14f //37
+ .word 0xa879d8a7 //36
+ .word 0x3cec53d4 //43
+ .word 0x1e7629ea //42
+ .word 0x0f3b14f5 //41
+ .word 0x879d8a7a //40
+ .word 0xcec53d43 //47
+ .word 0xe7629ea1 //46
+ .word 0xf3b14f50 //45
+ .word 0x79d8a7a8 //44
+ .word 0xec53d43c //51
+ .word 0x7629ea1e //50
+ .word 0x3b14f50f //49
+
+
+ .size sm3_mb_sm_x1, .-sm3_mb_sm_x1
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S
new file mode 100644
index 000000000..4e4a6e738
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S
@@ -0,0 +1,344 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+sm4
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+.macro do_ext job,arg0,arg1,arg2,arg3
+ ext v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b,\arg3
+.endm
+.macro do_sm3partw1 job,msg4,msg0,msg3
+ sm3partw1 v\job\()_\msg4\().4s, v\job\()_\msg0\().4s, v\job\()_\msg3\().4s
+.endm
+.macro do_sm3partw2 job,msg4,tmp1,tmp0
+ sm3partw2 v\job\()_\msg4\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s
+.endm
+
+.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ .irp j,0,1
+ do_ext job\j,\msg4,\msg1,\msg2,#12
+ .endr
+ .irp j,0,1
+ do_ext job\j,\tmp0,\msg0,\msg1,#12
+ .endr
+ .irp j,0,1
+ do_ext job\j,\tmp1,\msg2,\msg3,#8
+ .endr
+
+ .irp j,0,1
+ do_sm3partw1 job\j,\msg4, \msg0, \msg3
+ .endr
+ .irp j,0,1
+ do_sm3partw2 job\j,\msg4, \tmp1, \tmp0
+ .endr
+
+.endm
+
+.macro do_eor job,arg0,arg1,arg2
+ eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b
+.endm
+.macro do_sm3ss1 job,tmp1,dig0,dig1,const
+ sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s
+.endm
+
+.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane
+ sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane]
+
+.endm
+.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane
+ sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane]
+.endm
+
+.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req
+ .irp j,0,1
+ do_eor job\j,\tmp0,\msg0,\msg1
+ .endr
+ .irp lane,0,1,2,3
+ .irp j,0,1
+ do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const
+ .endr
+
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ .irp j,0,1
+ do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane
+ .endr
+ .irp j,0,1
+ do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane
+ .endr
+ .endr
+.endm
+
+.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1
+ quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1
+.endm
+
+/*
+ Variables
+*/
+ job0 .req x0
+ job1 .req x1
+ len .req x2
+
+ job0_data .req x3
+ job1_data .req x4
+ job0_digest .req x0
+ job1_digest .req x1
+
+ const_adr .req x5
+ end_ptr .req x2
+
+ declare_var_vector_reg job0_msg0, 0
+ declare_var_vector_reg job0_msg1, 1
+ declare_var_vector_reg job0_msg2, 2
+ declare_var_vector_reg job0_msg3, 3
+ declare_var_vector_reg job0_msg4, 4
+ declare_var_vector_reg job0_dig0, 5
+ declare_var_vector_reg job0_dig1, 6
+ declare_var_vector_reg job0_tmp0, 7
+ declare_var_vector_reg job0_tmp1, 8
+ declare_var_vector_reg job0_backup_dig0, 9
+ declare_var_vector_reg job0_backup_dig1, 10
+
+ declare_var_vector_reg job1_msg0, 11
+ declare_var_vector_reg job1_msg1, 12
+ declare_var_vector_reg job1_msg2, 13
+ declare_var_vector_reg job1_msg3, 14
+ declare_var_vector_reg job1_msg4, 15
+ declare_var_vector_reg job1_dig0, 16
+ declare_var_vector_reg job1_dig1, 17
+ declare_var_vector_reg job1_tmp0, 18
+ declare_var_vector_reg job1_tmp1, 19
+ declare_var_vector_reg job1_backup_dig0, 20
+ declare_var_vector_reg job1_backup_dig1, 21
+
+ declare_var_vector_reg const0, 22
+ declare_var_vector_reg const1, 23
+ declare_var_vector_reg const2, 24
+ declare_var_vector_reg const3, 25
+ declare_var_vector_reg const4, 26
+ declare_var_vector_reg const5, 27
+ declare_var_vector_reg const6, 28
+ declare_var_vector_reg const7, 29
+ declare_var_vector_reg const8, 30
+ declare_var_vector_reg const9, 31
+ declare_var_vector_reg const10, 22
+ declare_var_vector_reg const11, 23
+
+.macro do_rev32_msg job:req,msg:req
+ rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b
+.endm
+.macro do_rev32_job job:req
+ .irp m,0,1,2,3
+ do_rev32_msg \job,msg\m
+ .endr
+.endm
+.macro rev32_msgs
+ .irp j,0,1
+ do_rev32_job job\j
+ .endr
+.endm
+
+
+ .global sm3_mb_sm_x2
+ .type sm3_mb_sm_x2, %function
+sm3_mb_sm_x2:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+
+ adrp const_adr,.consts
+ ldr job0_data, [job0],64
+ add const_adr,const_adr,:lo12:.consts
+ ldr job1_data, [job1],64
+ ldp qjob0_dig0,qjob0_dig1,[job0_digest]
+ ldp qjob1_dig0,qjob1_dig1,[job1_digest]
+
+ ldp qconst2,qconst3,[const_adr,32]
+ ldp qconst4,qconst5,[const_adr,64]
+ ldp qconst6,qconst7,[const_adr,96]
+ ldp qconst8,qconst9,[const_adr,128]
+ add end_ptr,job0_data,len,lsl 6
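+	// len counts 64-byte blocks; end_ptr tracks job0 since all lanes
+	// advance in lockstep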
+
+ //rev128
+ ext vjob0_dig0.16b,vjob0_dig0.16b,vjob0_dig0.16b,#8
+ ext vjob0_dig1.16b,vjob0_dig1.16b,vjob0_dig1.16b,#8
+ rev64 vjob0_dig0.16b,vjob0_dig0.16b
+ rev64 vjob0_dig1.16b,vjob0_dig1.16b
+ ext vjob1_dig0.16b,vjob1_dig0.16b,vjob1_dig0.16b,#8
+ ext vjob1_dig1.16b,vjob1_dig1.16b,vjob1_dig1.16b,#8
+ rev64 vjob1_dig0.16b,vjob1_dig0.16b
+ rev64 vjob1_dig1.16b,vjob1_dig1.16b
+
+
+
+
+
+start_loop:
+
+ ld1 {vjob0_msg0.16b-vjob0_msg3.16b},[job0_data],64
+ ld1 {vjob1_msg0.16b-vjob1_msg3.16b},[job1_data],64
+
+ mov vjob0_backup_dig0.16b,vjob0_dig0.16b
+ mov vjob0_backup_dig1.16b,vjob0_dig1.16b
+ mov vjob1_backup_dig0.16b,vjob1_dig0.16b
+ mov vjob1_backup_dig1.16b,vjob1_dig1.16b
+
+ // const10,const11,const0,const1 share registers
+ ldp qconst0,qconst1,[const_adr]
+
+ // big-endian to little-endian
+ rev32_msgs
+
+ cmp job0_data,end_ptr
+ quad_round_expand a, const0 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+
+
+ quad_round_expand a, const1 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ // const10,const11,const0,const1 share registers
+ ldp qconst10,qconst11,[const_adr,160]
+ quad_round_expand a, const2 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand a, const3 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const4 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const5 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const6 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const7 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand b, const8 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const9 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const4 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+
+
+ quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1
+
+ quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1
+ quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1
+
+ eor vjob0_dig0.16b,vjob0_dig0.16b,vjob0_backup_dig0.16b
+ eor vjob0_dig1.16b,vjob0_dig1.16b,vjob0_backup_dig1.16b
+ eor vjob1_dig0.16b,vjob1_dig0.16b,vjob1_backup_dig0.16b
+ eor vjob1_dig1.16b,vjob1_dig1.16b,vjob1_backup_dig1.16b
+
+
+ bcc start_loop
+
+ //rev128
+ ext vjob0_dig0.16b,vjob0_dig0.16b,vjob0_dig0.16b,#8
+ ext vjob0_dig1.16b,vjob0_dig1.16b,vjob0_dig1.16b,#8
+ rev64 vjob0_dig0.16b,vjob0_dig0.16b
+ rev64 vjob0_dig1.16b,vjob0_dig1.16b
+ stp qjob0_dig0,qjob0_dig1,[job0_digest]
+
+ ext vjob1_dig0.16b,vjob1_dig0.16b,vjob1_dig0.16b,#8
+ ext vjob1_dig1.16b,vjob1_dig1.16b,vjob1_dig1.16b,#8
+ rev64 vjob1_dig0.16b,vjob1_dig0.16b
+ rev64 vjob1_dig1.16b,vjob1_dig1.16b
+ stp qjob1_dig0,qjob1_dig1,[job1_digest]
+
+#if 1
+ mov v0.16b,vjob1_dig0.16b
+ mov v1.16b,vjob1_dig1.16b
+ b exit_ret
+#endif
+
+exit_ret:
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .align 2
+.consts:
+ .word 0xce6228cb // 3
+ .word 0xe7311465 // 2
+ .word 0xf3988a32 // 1
+ .word 0x79cc4519 // 0
+ .word 0xe6228cbc // 7
+ .word 0x7311465e // 6
+ .word 0x3988a32f // 5
+ .word 0x9cc45197 // 4
+ .word 0x6228cbce //11
+ .word 0x311465e7 //10
+ .word 0x988a32f3 // 9
+ .word 0xcc451979 // 8
+ .word 0x228cbce6 //15
+ .word 0x11465e73 //14
+ .word 0x88a32f39 //13
+ .word 0xc451979c //12
+ .word 0xec53d43c //19
+ .word 0x7629ea1e //18
+ .word 0x3b14f50f //17
+ .word 0x9d8a7a87 //16
+ .word 0xc53d43ce //23
+ .word 0x629ea1e7 //22
+ .word 0xb14f50f3 //21
+ .word 0xd8a7a879 //20
+ .word 0x53d43cec //27
+ .word 0x29ea1e76 //26
+ .word 0x14f50f3b //25
+ .word 0x8a7a879d //24
+ .word 0x3d43cec5 //31
+ .word 0x9ea1e762 //30
+ .word 0x4f50f3b1 //29
+ .word 0xa7a879d8 //28
+ .word 0xd43cec53 //35
+ .word 0xea1e7629 //34
+ .word 0xf50f3b14 //33
+ .word 0x7a879d8a //32
+ .word 0x43cec53d //39
+ .word 0xa1e7629e //38
+ .word 0x50f3b14f //37
+ .word 0xa879d8a7 //36
+ .word 0x3cec53d4 //43
+ .word 0x1e7629ea //42
+ .word 0x0f3b14f5 //41
+ .word 0x879d8a7a //40
+ .word 0xcec53d43 //47
+ .word 0xe7629ea1 //46
+ .word 0xf3b14f50 //45
+ .word 0x79d8a7a8 //44
+ .word 0xec53d43c //51
+ .word 0x7629ea1e //50
+ .word 0x3b14f50f //49
+
+
+ .size sm3_mb_sm_x2, .-sm3_mb_sm_x2
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S
new file mode 100644
index 000000000..58758f98d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S
@@ -0,0 +1,368 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+sm4
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+.macro do_ext job,arg0,arg1,arg2,arg3
+ ext vjob\job\()_\arg0\().16b,vjob\job\()_\arg1\().16b,vjob\job\()_\arg2\().16b,\arg3
+.endm
+.macro do_sm3partw1 job,msg4,msg0,msg3
+ sm3partw1 vjob\job\()_\msg4\().4s, vjob\job\()_\msg0\().4s, vjob\job\()_\msg3\().4s
+.endm
+.macro do_sm3partw2 job,msg4,tmp1,tmp0
+ sm3partw2 vjob\job\()_\msg4\().4s, vjob\job\()_\tmp1\().4s, vjob\job\()_\tmp0\().4s
+.endm
+
+.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ .irp j,0,1,2
+ do_ext \j,\msg4,\msg1,\msg2,#12
+ .endr
+ .irp j,0,1,2
+ do_ext \j,\tmp0,\msg0,\msg1,#12
+ .endr
+ .irp j,0,1,2
+ do_ext \j,\tmp1,\msg2,\msg3,#8
+ .endr
+
+ .irp j,0,1,2
+ do_sm3partw1 \j,\msg4, \msg0, \msg3
+ .endr
+ .irp j,0,1,2
+ do_sm3partw2 \j,\msg4, \tmp1, \tmp0
+ .endr
+
+.endm
+
+.macro do_eor job,arg0,arg1,arg2
+ eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b
+.endm
+.macro do_sm3ss1 job,tmp1,dig0,dig1,const
+ sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s
+.endm
+
+.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane
+ sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane]
+
+.endm
+.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane
+ sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane]
+.endm
+.macro do_ld_backup_digest job
+ ldp qjob\job\()_backup_dig0,qjob\job\()_backup_dig1,[sp,job\job\()_dig_off]
+.endm
+
+.macro do_st_digest job
+ stp qjob\job\()_dig0,qjob\job\()_dig1,[job\job\()_digest]
+.endm
+.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req,load_digest
+ .irp j,0,1,2
+ do_eor job\j,\tmp0,\msg0,\msg1
+ .ifnb \load_digest
+ do_ld_backup_digest \j
+ .endif
+ .endr
+ .irp lane,0,1,2,3
+ .irp j,0,1,2
+ do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const
+ .endr
+
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ .irp j,0,1,2
+ do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane
+ .endr
+ .irp j,0,1,2
+ do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane
+ .endr
+
+ .endr
+.endm
+
+.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1
+ quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1
+.endm
+
+/*
+ Variables
+*/
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ len .req x3
+
+ job0_data .req x4
+ job1_data .req x5
+ job2_data .req x6
+ job0_digest .req x0
+ job1_digest .req x1
+ job2_digest .req x2
+
+ const_adr .req x7
+ end_ptr .req x3
+
+ declare_var_vector_reg job0_msg0, 0
+ declare_var_vector_reg job0_msg1, 1
+ declare_var_vector_reg job0_msg2, 2
+ declare_var_vector_reg job0_msg3, 3
+ declare_var_vector_reg job0_msg4, 4
+ declare_var_vector_reg job0_dig0, 5
+ declare_var_vector_reg job0_dig1, 6
+ declare_var_vector_reg job0_tmp0, 7
+ declare_var_vector_reg job0_tmp1, 8
+ .set job0_dig_off, 64
+ declare_var_vector_reg job0_backup_dig0, 2
+ declare_var_vector_reg job0_backup_dig1, 3
+
+ declare_var_vector_reg job1_msg0, 9
+ declare_var_vector_reg job1_msg1, 10
+ declare_var_vector_reg job1_msg2, 11
+ declare_var_vector_reg job1_msg3, 12
+ declare_var_vector_reg job1_msg4, 13
+ declare_var_vector_reg job1_dig0, 14
+ declare_var_vector_reg job1_dig1, 15
+ declare_var_vector_reg job1_tmp0, 16
+ declare_var_vector_reg job1_tmp1, 17
+ .set job1_dig_off, 96
+ declare_var_vector_reg job1_backup_dig0, 11
+ declare_var_vector_reg job1_backup_dig1, 12
+
+ declare_var_vector_reg job2_msg0, 18
+ declare_var_vector_reg job2_msg1, 19
+ declare_var_vector_reg job2_msg2, 20
+ declare_var_vector_reg job2_msg3, 21
+ declare_var_vector_reg job2_msg4, 22
+ declare_var_vector_reg job2_dig0, 23
+ declare_var_vector_reg job2_dig1, 24
+ declare_var_vector_reg job2_tmp0, 25
+ declare_var_vector_reg job2_tmp1, 26
+ .set job2_dig_off, 128
+ declare_var_vector_reg job2_backup_dig0, 20
+ declare_var_vector_reg job2_backup_dig1, 21
+
+
+ declare_var_vector_reg const0, 27
+ declare_var_vector_reg const1, 28
+ declare_var_vector_reg const2, 29
+ declare_var_vector_reg const3, 30
+ declare_var_vector_reg const4, 27
+ declare_var_vector_reg const5, 28
+ declare_var_vector_reg const6, 29
+ declare_var_vector_reg const7, 30
+ declare_var_vector_reg const8, 27
+ declare_var_vector_reg const9, 28
+ declare_var_vector_reg const10, 29
+ declare_var_vector_reg const11, 30
+
+.macro do_rev32_msg job:req,msg:req
+ rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b
+.endm
+.macro do_rev32_job job:req
+ .irp m,0,1,2,3
+ do_rev32_msg \job,msg\m
+ .endr
+.endm
+.macro rev32_msgs
+ .irp j,0,1,2
+ do_rev32_job job\j
+ .endr
+.endm
+
+.macro do_rev64 job,regd,regn
+ rev64 vjob\job\()_\regd\().16b,vjob\job\()_\regd\().16b
+.endm
+
+ .global sm3_mb_sm_x3
+ .type sm3_mb_sm_x3, %function
+sm3_mb_sm_x3:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+
+ adrp const_adr,.consts
+ ldr job0_data, [job0],64
+ add const_adr,const_adr,:lo12:.consts
+ ldr job1_data, [job1],64
+ ldr job2_data, [job2],64
+
+ ldp qjob0_dig0,qjob0_dig1,[job0_digest]
+ ldp qjob1_dig0,qjob1_dig1,[job1_digest]
+ ldp qjob2_dig0,qjob2_dig1,[job2_digest]
+ ld1 {vconst0.16b-vconst3.16b},[const_adr]
+ add end_ptr,job0_data,len,lsl 6
+
+ //rev128
+ .irp j,0,1,2
+ do_ext \j,dig0,dig0,dig0,#8
+ do_ext \j,dig1,dig1,dig1,#8
+ do_rev64 \j,dig0,dig0
+ do_rev64 \j,dig1,dig1
+ .endr
+
+
+
+
+
+start_loop:
+
+ ld1 {vjob0_msg0.16b-vjob0_msg3.16b},[job0_data],64
+ stp qjob0_dig0,qjob0_dig1,[sp,job0_dig_off]
+ ld1 {vjob1_msg0.16b-vjob1_msg3.16b},[job1_data],64
+ stp qjob1_dig0,qjob1_dig1,[sp,job1_dig_off]
+ ld1 {vjob2_msg0.16b-vjob2_msg3.16b},[job2_data],64
+ stp qjob2_dig0,qjob2_dig1,[sp,job2_dig_off]
+
+ cmp job0_data,end_ptr
+
+ // big-endian to little-endian
+ rev32_msgs
+
+ quad_round_expand a, const0 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand a, const1 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+
+ ldp qconst4,qconst5,[const_adr,4*16]
+ quad_round_expand a, const2 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand a, const3 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+
+ ldp qconst6,qconst7,[const_adr,6*16]
+ quad_round_expand b, const4 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const5 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ ldp qconst8,qconst9,[const_adr,8*16]
+ quad_round_expand b, const6 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const7 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ ldp qconst10,qconst11,[const_adr,10*16]
+ quad_round_expand b, const8 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const9 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+
+ ldp qconst4,qconst5,[const_adr,4*16]
+ quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ ldp qconst6,qconst7,[const_adr,6*16]
+ quad_round_expand b, const4 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+
+ quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1
+ ldp qconst0,qconst1,[const_adr]
+ quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1
+
+ quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1,1
+ ldp qconst2,qconst3,[const_adr,2*16]
+
+ .irp j,0,1,2
+ do_eor job\j,dig0,dig0,backup_dig0
+ do_eor job\j,dig1,dig1,backup_dig1
+ .endr
+
+ bcc start_loop
+
+ //rev128
+ .irp j,0,1,2
+ do_ext \j,dig0,dig0,dig0,#8
+ do_ext \j,dig1,dig1,dig1,#8
+ do_rev64 \j,dig0,dig0
+ do_rev64 \j,dig1,dig1
+ do_st_digest \j
+ .endr
+
+
+
+exit_ret:
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .align 2
+.consts:
+ .word 0xce6228cb // 3
+ .word 0xe7311465 // 2
+ .word 0xf3988a32 // 1
+ .word 0x79cc4519 // 0
+ .word 0xe6228cbc // 7
+ .word 0x7311465e // 6
+ .word 0x3988a32f // 5
+ .word 0x9cc45197 // 4
+ .word 0x6228cbce //11
+ .word 0x311465e7 //10
+ .word 0x988a32f3 // 9
+ .word 0xcc451979 // 8
+ .word 0x228cbce6 //15
+ .word 0x11465e73 //14
+ .word 0x88a32f39 //13
+ .word 0xc451979c //12
+ .word 0xec53d43c //19
+ .word 0x7629ea1e //18
+ .word 0x3b14f50f //17
+ .word 0x9d8a7a87 //16
+ .word 0xc53d43ce //23
+ .word 0x629ea1e7 //22
+ .word 0xb14f50f3 //21
+ .word 0xd8a7a879 //20
+ .word 0x53d43cec //27
+ .word 0x29ea1e76 //26
+ .word 0x14f50f3b //25
+ .word 0x8a7a879d //24
+ .word 0x3d43cec5 //31
+ .word 0x9ea1e762 //30
+ .word 0x4f50f3b1 //29
+ .word 0xa7a879d8 //28
+ .word 0xd43cec53 //35
+ .word 0xea1e7629 //34
+ .word 0xf50f3b14 //33
+ .word 0x7a879d8a //32
+ .word 0x43cec53d //39
+ .word 0xa1e7629e //38
+ .word 0x50f3b14f //37
+ .word 0xa879d8a7 //36
+ .word 0x3cec53d4 //43
+ .word 0x1e7629ea //42
+ .word 0x0f3b14f5 //41
+ .word 0x879d8a7a //40
+ .word 0xcec53d43 //47
+ .word 0xe7629ea1 //46
+ .word 0xf3b14f50 //45
+ .word 0x79d8a7a8 //44
+ .word 0xec53d43c //51
+ .word 0x7629ea1e //50
+ .word 0x3b14f50f //49
+
+
+ .size sm3_mb_sm_x3, .-sm3_mb_sm_x3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S
new file mode 100644
index 000000000..7f3f1db66
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S
@@ -0,0 +1,440 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+sm4
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+.macro do_ext job,arg0,arg1,arg2,arg3
+ ext vjob\job\()_\arg0\().16b,vjob\job\()_\arg1\().16b,vjob\job\()_\arg2\().16b,\arg3
+.endm
+.macro do_sm3partw1 job,msg4,msg0,msg3
+ sm3partw1 vjob\job\()_\msg4\().4s, vjob\job\()_\msg0\().4s, vjob\job\()_\msg3\().4s
+.endm
+.macro do_sm3partw2 job,msg4,tmp1,tmp0
+ sm3partw2 vjob\job\()_\msg4\().4s, vjob\job\()_\tmp1\().4s, vjob\job\()_\tmp0\().4s
+.endm
+
+.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ .irp j,0,1,2,3
+ do_ext \j,\msg4,\msg1,\msg2,#12
+ .endr
+ .irp j,0,1,2,3
+ do_ext \j,\tmp0,\msg0,\msg1,#12
+ .endr
+ .irp j,0,1,2,3
+ do_ext \j,\tmp1,\msg2,\msg3,#8
+ .endr
+
+ .irp j,0,1,2,3
+ do_sm3partw1 \j,\msg4, \msg0, \msg3
+ .endr
+ .irp j,0,1,2,3
+ do_sm3partw2 \j,\msg4, \tmp1, \tmp0
+ .endr
+ st1 {vjob0_\msg4\().16b-vjob3_\msg4\().16b},[data_buf],64
+.endm
+
+.macro do_eor job,arg0,arg1,arg2
+ eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b
+.endm
+.macro do_sm3ss1 job,tmp1,dig0,dig1,const
+ sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s
+.endm
+
+.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane
+ sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane]
+
+.endm
+.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane
+ sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane]
+.endm
+.macro do_ld_backup_digest job
+ ldp qjob\job\()_backup_dig0,qjob\job\()_backup_dig1,[sp,job\job\()_dig_off]
+.endm
+
+.macro do_st_digest job
+ stp qjob\job\()_dig0,qjob\job\()_dig1,[job\job\()_digest]
+.endm
+
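+// quad_round interleaves all four jobs per round; when is_last is set it
+// reloads the saved starting digests from the stack buffer and xors them in
+// (the final fold of the SM3 compression) instead of prefetching the next
+// message words.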
+.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req,is_last
+ .ifnb \is_last
+ ld1 {vjob0_backup_dig0.16b-vjob3_backup_dig0.16b},[dig_buf],64
+ .endif
+
+ .irp j,0,1,2,3
+ do_eor job\j,\tmp0,\msg0,\msg1
+
+ .endr
+
+ .irp lane,0,1,2
+ .irp j,0,1,2,3
+ do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const
+ .endr
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ .irp j,0,1,2,3
+ do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane
+ .endr
+ .irp j,0,1,2,3
+ do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane
+ .endr
+
+
+ .endr
+ .irp j,0,1,2,3
+ do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const
+ .endr
+ .ifnb \is_last
+
+ ld1 {vjob0_backup_dig1.16b-vjob3_backup_dig1.16b},[dig_buf]
+ .else
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ .endif
+ .irp j,0,1,2,3
+ do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,3
+ .endr
+
+ .irp j,0,1,2,3
+ do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,3
+ .ifnb \is_last
+ do_eor job\j,dig1,dig1,backup_dig1
+ do_eor job\j,dig0,dig0,backup_dig0
+ .endif
+ .endr
+
+ .ifb \is_last
+ ld1 {vjob0_\msg0\().16b-vjob3_\msg0\().16b},[data_buf],64
+ .endif
+
+.endm
+
+
+
+/*
+ Variables
+*/
+ .set temp_buf_size,(68*4+32)*4
+ .set dig_buf_off,64
+ .set data_buf_off,64+32*4
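+	// Four interleaved lanes exceed the vector-register budget, so the
+	// expanded message words and saved digests spill to this stack area.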
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ len .req x4
+
+ job0_data .req x5
+ job1_data .req x6
+ job2_data .req x7
+ job3_data .req x9
+
+ job0_digest .req x0
+ job1_digest .req x1
+ job2_digest .req x2
+ job3_digest .req x3
+
+ const_adr .req x10
+ end_ptr .req x4
+ data_buf .req x11
+ dig_buf .req x12
+
+ declare_var_vector_reg job0_msg0, 0
+ declare_var_vector_reg job1_msg0, 1
+ declare_var_vector_reg job2_msg0, 2
+ declare_var_vector_reg job3_msg0, 3
+ declare_var_vector_reg job0_msg1, 4
+ declare_var_vector_reg job1_msg1, 5
+ declare_var_vector_reg job2_msg1, 6
+ declare_var_vector_reg job3_msg1, 7
+ declare_var_vector_reg job0_msg2, 8
+ declare_var_vector_reg job1_msg2, 9
+ declare_var_vector_reg job2_msg2, 10
+ declare_var_vector_reg job3_msg2, 11
+ declare_var_vector_reg job0_msg3, 12
+ declare_var_vector_reg job1_msg3, 13
+ declare_var_vector_reg job2_msg3, 14
+ declare_var_vector_reg job3_msg3, 15
+ declare_var_vector_reg job0_tmp0, 16
+ declare_var_vector_reg job1_tmp0, 17
+ declare_var_vector_reg job2_tmp0, 18
+ declare_var_vector_reg job3_tmp0, 19
+ declare_var_vector_reg job0_tmp1, 20
+ declare_var_vector_reg job1_tmp1, 21
+ declare_var_vector_reg job2_tmp1, 22
+ declare_var_vector_reg job3_tmp1, 23
+ declare_var_vector_reg job0_msg4, 24
+ declare_var_vector_reg job1_msg4, 25
+ declare_var_vector_reg job2_msg4, 26
+ declare_var_vector_reg job3_msg4, 27
+ declare_var_vector_reg job0_dig0, 8
+ declare_var_vector_reg job1_dig0, 9
+ declare_var_vector_reg job2_dig0, 10
+ declare_var_vector_reg job3_dig0, 11
+ declare_var_vector_reg job0_dig1, 12
+ declare_var_vector_reg job1_dig1, 13
+ declare_var_vector_reg job2_dig1, 14
+ declare_var_vector_reg job3_dig1, 15
+
+ declare_var_vector_reg job0_backup_dig0, 24
+ declare_var_vector_reg job1_backup_dig0, 25
+ declare_var_vector_reg job2_backup_dig0, 26
+ declare_var_vector_reg job3_backup_dig0, 27
+ declare_var_vector_reg job0_backup_dig1, 28
+ declare_var_vector_reg job1_backup_dig1, 29
+ declare_var_vector_reg job2_backup_dig1, 30
+ declare_var_vector_reg job3_backup_dig1, 31
+
+ declare_var_vector_reg const0, 24
+ declare_var_vector_reg const1, 25
+ declare_var_vector_reg const2, 26
+ declare_var_vector_reg const3, 27
+ declare_var_vector_reg const4, 28
+ declare_var_vector_reg const5, 29
+ declare_var_vector_reg const6, 30
+ declare_var_vector_reg const7, 31
+ declare_var_vector_reg const8, 24
+ declare_var_vector_reg const9, 25
+ declare_var_vector_reg const10, 26
+ declare_var_vector_reg const11, 27
+
+.macro do_rev32_msg job:req,msg:req
+ rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b
+.endm
+
+.macro do_rev32_job job:req
+ .irp m,0,1,2,3
+ do_rev32_msg \job,msg\m
+ .endr
+.endm
+
+.macro rev32_msgs
+ .irp j,0,1,2,3
+ do_rev32_job job\j
+ .endr
+.endm
+
+.macro do_rev64 job,regd,regn
+ rev64 vjob\job\()_\regd\().16b,vjob\job\()_\regd\().16b
+.endm
+
+.macro do_ldp_msg23 job
+ ldp qjob\job\()_msg2,qjob\job\()_msg3,[job\job\()_data],32
+.endm
+
+ .global sm3_mb_sm_x4
+ .type sm3_mb_sm_x4, %function
+sm3_mb_sm_x4:
+ //push d8~d15
+ sub sp,sp,temp_buf_size
+ stp d8,d9,[sp,-64]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+
+
+ ldr job0_data, [job0],64
+ ldr job1_data, [job1],64
+ ldr job2_data, [job2],64
+ ldr job3_data, [job3],64
+
+ ldp qjob0_dig0,qjob0_dig1,[job0_digest]
+ ldp qjob1_dig0,qjob1_dig1,[job1_digest]
+ ldp qjob2_dig0,qjob2_dig1,[job2_digest]
+ ldp qjob3_dig0,qjob3_dig1,[job3_digest]
+ add end_ptr,job0_data,len,lsl 6
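+	// len counts 64-byte blocks; end_ptr tracks job0 since all lanes
+	// advance in lockstep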
+	//rev128: change digest endianness
+ .irp j,0,1,2,3
+ do_ext \j,dig0,dig0,dig0,#8
+ do_ext \j,dig1,dig1,dig1,#8
+ do_rev64 \j,dig0,dig0
+ do_rev64 \j,dig1,dig1
+ .endr
+
+
+
+
+start_loop:
+ add dig_buf,sp,dig_buf_off
+ ldp qjob0_msg0,qjob0_msg1,[job0_data],32
+ add data_buf,sp,data_buf_off
+ ldp qjob1_msg0,qjob1_msg1,[job1_data],32
+ st1 {vjob0_dig0.16b-vjob3_dig0.16b},[dig_buf],64
+ ldp qjob2_msg0,qjob2_msg1,[job2_data],32
+ st1 {vjob0_dig1.16b-vjob3_dig1.16b},[dig_buf]
+ ldp qjob3_msg0,qjob3_msg1,[job3_data],32
+
+ .irp j,0,1,2,3
+ do_ldp_msg23 \j
+ do_rev32_msg job\j,msg0
+ do_rev32_msg job\j,msg1
+ .endr
+ st1 {vjob0_msg0.16b-vjob3_msg0.16b},[data_buf],64
+ st1 {vjob0_msg1.16b-vjob3_msg1.16b},[data_buf],64
+ .irp j,0,1,2,3
+ do_rev32_msg job\j,msg2
+ do_rev32_msg job\j,msg3
+ .endr
+ st1 {vjob0_msg2.16b-vjob3_msg2.16b},[data_buf],64
+ st1 {vjob0_msg3.16b-vjob3_msg3.16b},[data_buf],64
+
+ cmp job0_data,end_ptr
+
+ /** message expand **/
+ message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ message_expand msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ message_expand msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ message_expand msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ message_expand msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+
+	/** re-initialize variables for the sm3 rounds **/
+ add dig_buf,sp,dig_buf_off
+ ld1 {vjob0_dig0.16b-vjob3_dig0.16b},[dig_buf],64
+ add data_buf,sp,data_buf_off
+ ld1 {vjob0_dig1.16b-vjob3_dig1.16b},[dig_buf]
+ add dig_buf,sp,dig_buf_off
+ adrp const_adr,.consts
+ ld1 {vjob0_msg0.16b-vjob3_msg0.16b},[data_buf],64
+ add const_adr,const_adr,:lo12:.consts
+ ld1 {vjob0_msg1.16b-vjob3_msg1.16b},[data_buf],64
+ ld1 {vconst0.16b-vconst3.16b},[const_adr],64
+ ld1 {vconst4.16b-vconst7.16b},[const_adr],64
+	/** digest rounds **/
+ quad_round a, const0 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round a, const1 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round a, const2 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round a, const3 , dig0, dig1, msg1, msg0, tmp0, tmp1
+
+	/** const8-const11 share registers with const0-const3 **/
+ ld1 {vconst8.16b-vconst11.16b},[const_adr]
+
+ quad_round b, const4 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const5 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const6 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const7 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const8 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const9 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const10, dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const11, dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const4 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const5 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const6 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const7 , dig0, dig1, msg1, msg0, tmp0, tmp1,1
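+
+	// rounds 0-15 use the 'a' quad_round variant (XOR-based FF0/GG0
+	// boolean functions), rounds 16-63 the 'b' variant; the SM3 round
+	// constants repeat with period 32, so const4-const7 are reused
+	// here for rounds 48-63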
+
+ bcc start_loop
+
+ //rev128
+ .irp j,0,1,2,3
+ do_ext \j,dig0,dig0,dig0,#8
+ do_ext \j,dig1,dig1,dig1,#8
+ do_rev64 \j,dig0,dig0
+ do_rev64 \j,dig1,dig1
+ do_st_digest \j
+ .endr
+
+
+
+exit_ret:
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 64
+ add sp,sp,temp_buf_size
+ ret
+
+ .align 2
+.consts:
+ .word 0xce6228cb // 3
+ .word 0xe7311465 // 2
+ .word 0xf3988a32 // 1
+ .word 0x79cc4519 // 0
+ .word 0xe6228cbc // 7
+ .word 0x7311465e // 6
+ .word 0x3988a32f // 5
+ .word 0x9cc45197 // 4
+ .word 0x6228cbce //11
+ .word 0x311465e7 //10
+ .word 0x988a32f3 // 9
+ .word 0xcc451979 // 8
+ .word 0x228cbce6 //15
+ .word 0x11465e73 //14
+ .word 0x88a32f39 //13
+ .word 0xc451979c //12
+ .word 0xec53d43c //19
+ .word 0x7629ea1e //18
+ .word 0x3b14f50f //17
+ .word 0x9d8a7a87 //16
+ .word 0xc53d43ce //23
+ .word 0x629ea1e7 //22
+ .word 0xb14f50f3 //21
+ .word 0xd8a7a879 //20
+ .word 0x53d43cec //27
+ .word 0x29ea1e76 //26
+ .word 0x14f50f3b //25
+ .word 0x8a7a879d //24
+ .word 0x3d43cec5 //31
+ .word 0x9ea1e762 //30
+ .word 0x4f50f3b1 //29
+ .word 0xa7a879d8 //28
+ .word 0xd43cec53 //35
+ .word 0xea1e7629 //34
+ .word 0xf50f3b14 //33
+ .word 0x7a879d8a //32
+ .word 0x43cec53d //39
+ .word 0xa1e7629e //38
+ .word 0x50f3b14f //37
+ .word 0xa879d8a7 //36
+ .word 0x3cec53d4 //43
+ .word 0x1e7629ea //42
+ .word 0x0f3b14f5 //41
+ .word 0x879d8a7a //40
+ .word 0xcec53d43 //47
+ .word 0xe7629ea1 //46
+ .word 0xf3b14f50 //45
+ .word 0x79d8a7a8 //44
+	.word 0xec53d43c	//51
+	.word 0x7629ea1e	//50
+	.word 0x3b14f50f	//49
+	.word 0x9d8a7a87	//48
+
+
+ .size sm3_mb_sm_x4, .-sm3_mb_sm_x4
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c
new file mode 100644
index 000000000..b1c6ee26b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c
@@ -0,0 +1,284 @@
+/**********************************************************************
+ Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_mb_mgr_init_avx2(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_avx2(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_avx2(SM3_MB_JOB_MGR * state);
+
+void sm3_mb_mgr_init_avx2(SM3_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SM3_X8_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
+
+void sm3_ctx_mgr_init_avx2(SM3_HASH_CTX_MGR * mgr)
+{
+ sm3_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_avx2(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_avx2(SM3_HASH_CTX_MGR * mgr)
+{
+ SM3_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+ }
+}
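+
+/*
+ * Usage sketch (illustrative only; N, bufs and lens are placeholders
+ * for the caller's job set): whole messages are submitted with
+ * HASH_ENTIRE and the manager is then drained with flush, e.g.
+ *
+ *	SM3_HASH_CTX_MGR *mgr;
+ *	SM3_HASH_CTX ctx[N];
+ *	posix_memalign((void **)&mgr, 16, sizeof(*mgr));
+ *	sm3_ctx_mgr_init_avx2(mgr);
+ *	for (i = 0; i < N; i++) {
+ *		hash_ctx_init(&ctx[i]);
+ *		sm3_ctx_mgr_submit_avx2(mgr, &ctx[i], bufs[i], lens[i], HASH_ENTIRE);
+ *	}
+ *	while (sm3_ctx_mgr_flush_avx2(mgr) != NULL)
+ *		;	// each non-NULL return is a completed context
+ */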
+
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ unsigned int j;
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ ctx->job.result_digest[j] =
+ byteswap32(ctx->job.result_digest[j]);
+ }
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SM3_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SM3_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { SM3_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
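+
+/*
+ * Worked example for hash_pad() (assuming SM3_BLOCK_SIZE == 64 and
+ * SM3_PADLENGTHFIELD_SIZE == 8): for total_len == 100, i starts at
+ * 100 % 64 == 36; the 0x80 byte lands at offset 36, the 8-byte
+ * bit-length field still fits in the same block, so i becomes 64 and
+ * one extra block is hashed. For total_len == 120, i starts at 56;
+ * the length field no longer fits after the 0x80 byte, so i becomes
+ * 128 and two extra blocks are hashed.
+ */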
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+struct slver sm3_ctx_mgr_init_avx2_slver_0000;
+struct slver sm3_ctx_mgr_init_avx2_slver = { 0x2309, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_submit_avx2_slver_0000;
+struct slver sm3_ctx_mgr_submit_avx2_slver = { 0x230a, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_flush_avx2_slver_0000;
+struct slver sm3_ctx_mgr_flush_avx2_slver = { 0x230b, 0x00, 0x00 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c
new file mode 100644
index 000000000..8169aa170
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c
@@ -0,0 +1,292 @@
+/**********************************************************************
+ Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
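+
+/* Note: the pragmas above request only AVX2 code generation; the
+ * AVX512-specific work is done in the assembler manager and kernel
+ * routines this file calls. */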
+
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_mb_mgr_init_avx512(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_avx512(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_avx512(SM3_MB_JOB_MGR * state);
+
+void sm3_mb_mgr_init_avx512(SM3_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SM3_MAX_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
+
+void sm3_ctx_mgr_init_avx512(SM3_HASH_CTX_MGR * mgr)
+{
+ sm3_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_avx512(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+	// partial_block_buffer_length != 0 means the ctx already holds buffered data;
+	// len < SM3_BLOCK_SIZE means the incoming data is less than one block
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+
+ ctx->partial_block_buffer_length = 0;
+ ctx->job.buffer = ctx->partial_block_buffer;
+
+ ctx->job.len = 1;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+
+ }
+
+ return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ unsigned int j;
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ ctx->job.result_digest[j] =
+ byteswap32(ctx->job.result_digest[j]);
+ }
+ return ctx;
+ }
+		// If the extra block buffer is empty, hash the whole blocks remaining in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+			// copy_len is the tail that is not a whole block (len % SM3_BLOCK_SIZE)
+			uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+			// if len is not a multiple of SM3_BLOCK_SIZE, buffer the tail
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ // store the extra data
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+			// len is now a whole number of blocks
+			assert((len % SM3_BLOCK_SIZE) == 0);
+			// convert len from bytes to blocks (len /= SM3_BLOCK_SIZE)
+ len >>= SM3_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+			// loop again so the COMPLETE status set by submit is handled above
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_avx512(SM3_HASH_CTX_MGR * mgr)
+{
+
+ SM3_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+		// If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+		// Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+ }
+
+}
+
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { SM3_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+struct slver sm3_ctx_mgr_init_avx512_slver_0000;
+struct slver sm3_ctx_mgr_init_avx512_slver = { 0x2306, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_submit_avx512_slver_0000;
+struct slver sm3_ctx_mgr_submit_avx512_slver = { 0x2307, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_flush_avx512_slver_0000;
+struct slver sm3_ctx_mgr_flush_avx512_slver = { 0x2308, 0x00, 0x00 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c
new file mode 100644
index 000000000..e8fcfe08a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c
@@ -0,0 +1,314 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define rol32(x, r) (((x)<<(r)) | ((x)>>(32-(r))))
+
+static void sm3_init(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t OPT_FIX sm3_update(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void OPT_FIX sm3_final(SM3_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX sm3_single(const volatile void *data, uint32_t digest[]);
+static inline void hash_init_digest(SM3_WORD_T * digest);
+
+static inline uint32_t P0(uint32_t X)
+{
+ return (X ^ (rol32(X, 9)) ^ (rol32(X, 17)));
+}
+
+static inline uint32_t P1(uint32_t X)
+{
+ return (X ^ (rol32(X, 15)) ^ (rol32(X, 23)));
+}
+
+static inline uint32_t sm3_ff(int j, uint32_t x, uint32_t y, uint32_t z)
+{
+ return j < 16 ? (x ^ y ^ z) : ((x & y) | (x & z) | (y & z));
+}
+
+static inline uint32_t sm3_gg(int j, uint32_t x, uint32_t y, uint32_t z)
+{
+ return j < 16 ? (x ^ y ^ z) : ((x & y) | ((~x) & z));
+}
+
+static inline void sm3_message_schedule(uint32_t bi[], volatile uint32_t W[],
+ volatile uint32_t W_B[])
+{
+ int j;
+ volatile uint32_t tmp;
+
+ for (j = 0; j <= 15; j++) {
+ W[j] = to_be32(bi[j]);
+ }
+
+ for (; j <= 67; j++) {
+ tmp = W[j - 16] ^ W[j - 9] ^ rol32(W[j - 3], 15);
+ W[j] = P1(tmp) ^ (rol32(W[j - 13], 7)) ^ W[j - 6];
+ }
+
+ for (j = 0; j < 64; j++) {
+ W_B[j] = W[j] ^ W[j + 4];
+ }
+
+ tmp = 0;
+}
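+
+/* The loop above implements the SM3 message expansion
+ *   W[j] = P1(W[j-16] ^ W[j-9] ^ rol32(W[j-3], 15)) ^ rol32(W[j-13], 7) ^ W[j-6]
+ * for j = 16..67, and precomputes W'[j] = W[j] ^ W[j+4] into W_B.
+ * The volatile qualifiers and the trailing tmp = 0 appear intended to
+ * keep intermediate schedule words from lingering in memory after use.
+ */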
+
+static inline void sm3_compress_step_func(int j, volatile uint32_t * a_p,
+ volatile uint32_t * b_p, volatile uint32_t * c_p,
+ volatile uint32_t * d_p, volatile uint32_t * e_p,
+ volatile uint32_t * f_p, volatile uint32_t * g_p,
+ volatile uint32_t * h_p, volatile uint32_t W[],
+ volatile uint32_t W_B[])
+{
+ volatile uint32_t SS1, SS2, TT1, TT2;
+ uint32_t T = j < 16 ? 0x79cc4519 : 0x7a879d8a;
+
+ SS1 = rol32(rol32(*a_p, 12) + *e_p + rol32(T, (j % 32)), 7);
+ SS2 = SS1 ^ rol32(*a_p, 12);
+ TT1 = sm3_ff(j, *a_p, *b_p, *c_p) + *d_p + SS2 + W_B[j];
+ TT2 = sm3_gg(j, *e_p, *f_p, *g_p) + *h_p + SS1 + W[j];
+ *d_p = *c_p;
+ *c_p = rol32(*b_p, 9);
+ *b_p = *a_p;
+ *a_p = TT1;
+ *h_p = *g_p;
+ *g_p = rol32(*f_p, 19);
+ *f_p = *e_p;
+ *e_p = P0(TT2);
+
+ SS1 = 0;
+ SS2 = 0;
+ TT1 = 0;
+ TT2 = 0;
+}
+
+void sm3_ctx_mgr_init_base(SM3_HASH_CTX_MGR * mgr)
+{
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_base(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+ if (len % SM3_BLOCK_SIZE != 0) {
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+ sm3_init(ctx, buffer, len);
+ sm3_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ if (len % SM3_BLOCK_SIZE != 0) {
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+ sm3_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = sm3_update(ctx, buffer, len);
+ sm3_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ sm3_init(ctx, buffer, len);
+ remain_len = sm3_update(ctx, buffer, len);
+ sm3_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
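+
+/*
+ * Illustrative call pattern (a sketch, not part of this file): the base
+ * submit requires FIRST/UPDATE chunks to be whole blocks, while the
+ * LAST chunk may have any length, e.g. for a 150-byte message:
+ *
+ *	sm3_ctx_mgr_submit_base(mgr, ctx, msg, 128, HASH_FIRST);
+ *	sm3_ctx_mgr_submit_base(mgr, ctx, msg + 128, 22, HASH_LAST);
+ *
+ * or pass the whole message at once with HASH_ENTIRE.
+ */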
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_base(SM3_HASH_CTX_MGR * mgr)
+{
+ return NULL;
+}
+
+static void sm3_init(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t sm3_update(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint32_t *digest = ctx->job.result_digest;
+
+ while (remain_len >= SM3_BLOCK_SIZE) {
+ sm3_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + SM3_BLOCK_SIZE);
+ remain_len -= SM3_BLOCK_SIZE;
+ ctx->total_length += SM3_BLOCK_SIZE;
+ }
+
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void sm3_final(SM3_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len;
+ uint32_t j;
+ volatile uint8_t buf[2 * SM3_BLOCK_SIZE] = { 0 };
+ uint32_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy((void *)buf, buffer, i);
+ buf[i++] = 0x80;
+
+ i = (i > SM3_BLOCK_SIZE - SM3_PADLENGTHFIELD_SIZE ?
+ 2 * SM3_BLOCK_SIZE : SM3_BLOCK_SIZE);
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8);
+
+ sm3_single(buf, digest);
+ if (i == 2 * SM3_BLOCK_SIZE) {
+ sm3_single(buf + SM3_BLOCK_SIZE, digest);
+ }
+
+	/* convert digest words to little-endian */
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ digest[j] = byteswap32(digest[j]);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+ memset((void *)buf, 0, sizeof(buf));
+}
+
+static void sm3_single(const volatile void *data, uint32_t digest[])
+{
+ volatile uint32_t a, b, c, d, e, f, g, h;
+ volatile uint32_t W[68], W_bar[64];
+ int j;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ sm3_message_schedule((uint32_t *) data, W, W_bar);
+ for (j = 0; j < 64; j++) {
+ sm3_compress_step_func(j, &a, &b, &c, &d, &e, &f, &g, &h, W, W_bar);
+ }
+
+ digest[0] ^= a;
+ digest[1] ^= b;
+ digest[2] ^= c;
+ digest[3] ^= d;
+ digest[4] ^= e;
+ digest[5] ^= f;
+ digest[6] ^= g;
+ digest[7] ^= h;
+
+ memset((void *)W, 0, sizeof(W));
+ memset((void *)W_bar, 0, sizeof(W_bar));
+
+ a = 0;
+ b = 0;
+ c = 0;
+ d = 0;
+ e = 0;
+ f = 0;
+ g = 0;
+ h = 0;
+}
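+
+/* Note: SM3 folds the working variables into the chaining value with
+ * XOR (digest[i] ^= var) rather than the modular addition used by
+ * SHA-2; the memsets and zero-assignments above scrub the expanded
+ * message and working variables from the stack.
+ */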
+
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { SM3_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sm3_ctx_mgr_init_base_slver_0000;
+struct slver sm3_ctx_mgr_init_base_slver = { 0x2303, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_submit_base_slver_0000;
+struct slver sm3_ctx_mgr_submit_base_slver = { 0x2304, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_flush_base_slver_0000;
+struct slver sm3_ctx_mgr_flush_base_slver = { 0x2305, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c
new file mode 100644
index 000000000..d74a4c882
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+
+extern void sm3_ctx_mgr_init_base(SM3_HASH_CTX_MGR * mgr);
+extern SM3_HASH_CTX *sm3_ctx_mgr_submit_base(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags);
+extern SM3_HASH_CTX *sm3_ctx_mgr_flush_base(SM3_HASH_CTX_MGR * mgr);
+
+void sm3_ctx_mgr_init(SM3_HASH_CTX_MGR * mgr)
+{
+ return sm3_ctx_mgr_init_base(mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return sm3_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush(SM3_HASH_CTX_MGR * mgr)
+{
+ return sm3_ctx_mgr_flush_base(mgr);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm
new file mode 100644
index 000000000..0f2a0f39a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm
@@ -0,0 +1,65 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Threshold constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; if the number of lanes in use is <= the threshold, use the single-buffer function
+%define SM3_SB_THRESHOLD_SSE 1
+%define SM3_SB_THRESHOLD_AVX 1
+%define SM3_SB_THRESHOLD_AVX2 1
+%define SM3_SB_THRESHOLD_AVX512 1
+%define SM3_NI_SB_THRESHOLD_SSE 4 ; ni single-buffer is faster than sse multi-buffer at low lane counts
+%define SM3_NI_SB_THRESHOLD_AVX512 6
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SM3_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS	; SM3_JOB
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD	_len,		8,	8	; length in blocks
+FIELD _result_digest, 8*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+
+%assign _SM3_JOB_size _FIELD_OFFSET
+%assign _SM3_JOB_align _STRUCT_ALIGN
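+
+; For orientation, a C-style view of the field layout defined above
+; (a sketch of the assembler offsets only; the authoritative struct
+; lives in sm3_mb.h):
+;
+;	struct SM3_JOB {
+;		uint8_t	 *buffer;		// _buffer
+;		uint64_t  len;			// _len, in blocks
+;		uint32_t  result_digest[8];	// _result_digest, 64-byte aligned
+;		uint32_t  status;		// _status (STS_* above)
+;		void	 *user_data;		// _user_data
+;	};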
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c
new file mode 100644
index 000000000..fbbb2a1a7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c
@@ -0,0 +1,145 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS (SM3_MAX_LANES - 1)
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+uint8_t lens_print_and_check(SM3_HASH_CTX_MGR * mgr)
+{
+ static int32_t last_lens[SM3_MAX_LANES] = { 0 };
+ int32_t len;
+ uint8_t num_unchanged = 0;
+ int i;
+ for (i = 0; i < SM3_MAX_LANES; i++) {
+ len = (int32_t) mgr->mgr.lens[i];
+		// lens[i] in mgr packs block_count<<4 | lane_index; recover bytes as blocks*64
+		len = (len >= 16) ? (len >> 4 << 6) : 0;
+ printf("\t%d", len);
+ if (last_lens[i] > 0 && last_lens[i] == len)
+ num_unchanged += 1;
+ last_lens[i] = len;
+ }
+ printf("\n");
+ return num_unchanged;
+}
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ uint8_t num_ret, num_unchanged = 0;
+ int ret;
+
+ printf("sm3_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ lens[i] = TEST_LEN / SM3_MAX_LANES * (i + 1);
+ bufs[i] = (unsigned char *)malloc(lens[i]);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], lens[i]);
+ }
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Init ctx contexts
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sm3_ossl(bufs[i], lens[i], digest_ref[i]);
+
+ // Run sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ printf("Changes of lens inside mgr:\n");
+ lens_print_and_check(mgr);
+ while (sm3_ctx_mgr_flush(mgr)) {
+ num_ret = lens_print_and_check(mgr);
+ num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret;
+ }
+	printf("sm3_mb lens trace complete\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf("Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..a2319ba14
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm
@@ -0,0 +1,77 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SM3 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SM3_ARGS_X16
+;;; name size align
+FIELD _digest, 4*8*16, 4 ; transposed digest
+FIELD _data_ptr, 8*16, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SM3_ARGS_X4_size _FIELD_OFFSET
+%assign _SM3_ARGS_X4_align _STRUCT_ALIGN
+%assign _SM3_ARGS_X8_size _FIELD_OFFSET
+%assign _SM3_ARGS_X8_align _STRUCT_ALIGN
+%assign _SM3_ARGS_X16_size _FIELD_OFFSET
+%assign _SM3_ARGS_X16_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SM3_ARGS_X4_size, _SM3_ARGS_X4_align
+FIELD _lens, 4*16, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
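+
+; For orientation, a C-style view of the layout above (a sketch of the
+; assembler offsets only; the authoritative types live in sm3_mb.h):
+;
+;	struct SM3_ARGS_X16 {
+;		uint32_t  digest[8][16];	// _digest, transposed: [word][lane]
+;		uint8_t	 *data_ptr[16];		// _data_ptr, one pointer per lane
+;	};
+;
+;	struct SM3_MB_JOB_MGR {
+;		struct SM3_ARGS_X16 args;	// _args
+;		uint32_t  lens[16];		// _lens: block_count<<4 | lane_index
+;		uint64_t  unused_lanes;		// _unused_lanes: nibble stack of free lane ids
+;		struct { SM3_JOB *job_in_lane; } ldata[16];	// _ldata
+;		uint32_t  num_lanes_inuse;	// _num_lanes_inuse
+;	};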
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..b87bdcba8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm
@@ -0,0 +1,258 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_job.asm"
+%include "sm3_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sm3_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sm3_mb_x8_avx2
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SM3_JOB* sm3_mb_mgr_flush_avx2(SM3_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sm3_mb_mgr_flush_avx2, function
+sm3_mb_mgr_flush_avx2:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
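+
+	; each lens[] entry packs (block_count << 4) | lane_index, so the
+	; SIMD minimum above yields both values at once: the low nibble is
+	; the lane holding the shortest job (idx), the upper bits are its
+	; remaining length in blocks (len2)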
+
+mb_processing:
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sm3_mb_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..7feada49f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm
@@ -0,0 +1,276 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_job.asm"
+%include "sm3_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sm3_mb_x16_avx512
+;extern sm3_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg1 rdi ; rcx
+ %define arg2 rsi ; rdx
+ %define tmp4 rdx
+%else
+ %define arg1 rcx
+ %define arg2 rdx
+ %define tmp4 rsi
+%endif
+
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define idx rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+
+; SM3_JOB* sm3_mb_mgr_flush_avx512(SM3_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sm3_mb_mgr_flush_avx512, function
+sm3_mb_mgr_flush_avx512:
+ endbranch
+
+ ; Save the stack
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
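+	; (cmovne cannot take an immediate source, so the candidate lane
+	; indices are loaded from the lane_1..lane_15 table in .data)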
+
+
+	; copy lane idx's data pointer into the empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
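+
+	; every empty lane now points at valid data and carries length
+	; 0xFFFFFFFF, so the min-length search below can never select it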
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
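+
+	; each lens[] entry packs (blocks << 4) | lane; e.g. lane 3 with
+	; 5 blocks pending is stored as 0x53, so "and idx, 0xF" yields
+	; lane 3 and "shr len2, 4" yields 5 blocks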
+
+	; an optimized flush could check here for a single active lane and
+	; call a one-lane (x1) kernel instead (see the commented-out
+	; sm3_opt_x1 extern above)
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sm3_mb_x16_avx512
+ ; state and idx are intact
+
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
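+	; unused_lanes is a stack of free-lane indices, 4 bits per entry;
+	; the shl/or above pushes the freed lane back on top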
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+
+; restore registers and return
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sm3_mb_mgr_flush_avx512
+no_sm3_mb_mgr_flush_avx512:
+%endif
+
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..ae95faa89
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm
@@ -0,0 +1,247 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_job.asm"
+%include "memcpy.asm"
+%include "sm3_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sm3_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; SM3_JOB* sm3_mb_mgr_submit_avx2(SM3_MB_JOB_MGR *state, SM3_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sm3_mb_mgr_submit_avx2, function
+sm3_mb_mgr_submit_avx2:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
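+	; lens[lane] now packs (job->len << 4) | lane, the encoding the
+	; min-length search below relies on (e.g. len 5 in lane 2 -> 0x52)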
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*8], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*8], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*8], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*8], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*8], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*8], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*8], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*8], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xf
+ jne return_null
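+
+	; with 8 lanes, unused_lanes keeps the free-lane nibbles above a
+	; 0xF sentinel (presumably set by manager init); once only the
+	; sentinel remains, all 8 lanes are busy and processing can start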
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+	vpminud	xmm2, xmm0, xmm1	; xmm2 has {D1,C1,B1,A1}
+	vpalignr xmm3, xmm3, xmm2, 8	; xmm3 has {x,x,D1,C1}
+	vpminud	xmm2, xmm2, xmm3	; xmm2 has {x,x,D2,C2}
+	vpalignr xmm3, xmm3, xmm2, 4	; xmm3 has {x,x,x,D2}
+	vpminud	xmm2, xmm2, xmm3	; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sm3_mb_x8_avx2
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..7b7b21287
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm
@@ -0,0 +1,273 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_job.asm"
+%include "memcpy.asm"
+%include "sm3_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+;
+; SM3_JOB* sm3_mb_mgr_submit_avx512 (SM3_MB_JOB_MGR *state, SM3_JOB* job);
+;
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sm3_mb_x16_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define arg1 rdi ; state
+%define arg2 rsi ; job
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx ; state
+%define arg2 rdx ; job
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job	arg2
+%define len2	arg2
+%define p2	arg2
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+%define num_lanes_inuse r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+mk_global sm3_mb_mgr_submit_avx512, function
+sm3_mb_mgr_submit_avx512:
+ endbranch
+
+	; allocate stack space and save callee-saved registers
+	sub	rsp, STACK_SPACE
+	mov	[rsp + 8*0], rbx
+	mov	[rsp + 8*3], rbp	; slots 8*1 and 8*2 hold rsi/rdi on win64
+	mov	[rsp + 8*4], r12
+	mov	[rsp + 8*5], r13
+	mov	[rsp + 8*6], r14
+	mov	[rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+	; unused_lanes is a stack of free-lane indices packed 4 bits per
+	; entry; the lowest nibble is the index of the next free lane
+	and	lane, 0xF
+
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*16], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*16], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*16], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*16], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+	; start processing only once all 16 lanes are occupied
+ cmp num_lanes_inuse, 16
+ jne return_null
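+
+	; with 16 lanes every nibble of unused_lanes is a lane index, which
+	; leaves no room for a sentinel; the separate _num_lanes_inuse
+	; counter detects the all-lanes-busy case instead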
+
+start_loop:
+	; Find min length; ymm0 holds lens for lanes 0-7, ymm1 for lanes 8-15
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sm3_mb_x16_avx512
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+; restore registers and return
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sm3_mb_mgr_submit_avx512
+no_sm3_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c
new file mode 100644
index 000000000..b904ba0ca
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c
@@ -0,0 +1,160 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+// Generates pseudo-random data
+static void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_sm3 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
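+	// (each flush returns one completed job, or NULL once none remain,
+	// so the empty loop above drains the manager)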
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sm3_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ sm3_ossl(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3_ssl rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c
new file mode 100644
index 000000000..3671a3b79
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c
@@ -0,0 +1,206 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+// Generates pseudo-random data
+static void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_sm3 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sm3_ossl(bufs[i], TEST_LEN, digest_ref[i]);
+
+ // Run sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sm3_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sm3_ossl(bufs[i], lens[i], digest_ref[i]);
+
+ // Run sm3_mb test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sm3_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sm3_ossl(bufs[i], lens[i], digest_ref[i]);
+
+ // sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c
new file mode 100644
index 000000000..64e583ffc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c
@@ -0,0 +1,298 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*SM3_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SM3_BLOCK_SIZE))
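+/* With SM3's 64-byte block size, UPDATE_SIZE works out to 832 bytes and
+ * MAX_RAND_UPDATE_BLOCKS to 1024 update chunks per 1 MB test buffer. */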
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+// Generates pseudo-random data
+static void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_sm3_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sm3_ossl(bufs[i], TEST_LEN, digest_ref[i]);
+ }
+
+ // Run sb_sm3 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ // Add jobs while available or finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sm3_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sm3_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sm3_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d fixed size, digest%d fail %8X <=> %8X",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sm3_ossl(bufs[i], lens[i], digest_ref[i]);
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ // Run sm3_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SM3_BLOCK_SIZE +
+ SM3_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SM3_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem, HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sm3_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sm3_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SM3_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sm3_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c
new file mode 100644
index 000000000..c409530c7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sm3_mb.h"
+
+typedef struct {
+ const char *msg;
+ uint32_t resultDigest[SM3_DIGEST_NWORDS];
+} TestData;
+
+static TestData test_data[] = {
+ {
+ .msg = "abc",
+ .resultDigest = {0xf4f0c766, 0xd9edee62, 0x6bd4f2d1, 0xe2e410dc,
+ 0x87c46741, 0xa2f7f25c, 0x2ba07d29, 0xe0a84b8f}
+ },
+ {
+ .msg = "abcdabcdabcdabcdabcdabcdabcdabcd" "abcdabcdabcdabcdabcdabcdabcdabcd",
+ .resultDigest = {0xf99fbede, 0xa1b87522, 0x89486038, 0x4d5a8ec1,
+ 0xe570db6f, 0x65577e38, 0xa3cb3d29, 0x32570c9c}
+
+ },
+ {
+ .msg = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
+ .resultDigest = {0xc56c9b63, 0x379e4de6, 0x92b190a3, 0xeaa14fdf,
+ 0x74ab2007, 0xb992f67f, 0x664e8cf3, 0x058c7bad}
+ },
+
+ {.msg = "0123456789:;<=>?@ABCDEFGHIJKLMNO",
+ .resultDigest = {0x076833d0, 0xd089ec39, 0xad857685, 0x8089797a,
+ 0x9df9e8fd, 0x4126eb9a, 0xf38c22e8, 0x054bb846}},
+ {
+ .msg =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<",
+ .resultDigest = {0x6cb9d38e, 0x846ac99e, 0x6d05634b, 0x3fe1bb26,
+ 0x90368c4b, 0xee8c4299, 0x08c0e96a, 0x2233cdc7}
+ },
+ {
+ .msg =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR",
+ .resultDigest = {0x83758189, 0x050f14d1, 0x91d8a730, 0x4a2825e4,
+ 0x11723273, 0x2114ee3f, 0x18cac172, 0xa9c5b07a}
+ },
+ {
+ .msg =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?",
+ .resultDigest = {0xb80f8aba, 0x55e96119, 0x851ac77b, 0xae31b3a5,
+ 0x1333e764, 0xc86ac40d, 0x34878db1, 0x7da873f6},
+ },
+ {
+ .msg =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU",
+ .resultDigest = {0xbd5736a7, 0x55977d13, 0xa950c78a, 0x71eeb7cb,
+ 0xe9ef0ba5, 0x95a9302e, 0x155e5c33, 0xad96ce3c}
+ },
+ {
+ .msg = "",
+ .resultDigest = {0x831db21a, 0x7fa1cf55, 0x4819618e, 0x8f1ae831,
+ 0xc7c8be22, 0x74fbfe28, 0xeb35d07e, 0x2baa8250}
+
+ },
+
+};
+
+#define MSGS sizeof(test_data)/sizeof(TestData)
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) (((seed) * 5 + ((seed) * (seed)) / 64) % MSGS)
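+/* Maps a job index onto one of the MSGS vectors; e.g. with MSGS = 9,
+ * PSEUDO_RANDOM_NUM(10) = (10*5 + (10*10)/64) % 9 = 51 % 9 = 6. */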
+
+int main(void)
+{
+
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sm3_ctx_mgr_init(mgr);
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i], test_data[i].msg,
+ strlen((char *)test_data[i].msg), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = test_data[t].resultDigest;
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = sm3_ctx_mgr_flush(mgr);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = test_data[t].resultDigest;
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ test_data[j].msg, strlen((char *)test_data[j].msg),
+ HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = test_data[k].resultDigest;
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+ while (1) {
+ ctx = sm3_ctx_mgr_flush(mgr);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = test_data[k].resultDigest;
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sm3 test: Pass\n");
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..ed4d9a092
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c
@@ -0,0 +1,128 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sm3_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sm3_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sm3" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sm3 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c
new file mode 100644
index 000000000..025fd90ed
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c
@@ -0,0 +1,133 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS SM3_MAX_LANES
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ uint32_t nlanes;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sm3_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sm3_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb shortage tests
+ for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) {
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < nlanes; i++)
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN,
+ HASH_ENTIRE);
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sm3" TEST_TYPE_STR " with %d lanes: ", nlanes);
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < nlanes; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ }
+
+ printf("Multi-buffer sm3 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm
new file mode 100644
index 000000000..3b300fa80
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm
@@ -0,0 +1,1035 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, 8*64, 64
+FIELD _rsp, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr) ; rdi + 8*16
+%define DIGEST state ; rdi
+%define SIZE num_blks ; rsi
+
+%define IDX var1
+%define TBL var2
+
+%define APPEND(a,b) a %+ b
+
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+
+;
+; 4 ZMM for tmp data
+;
+%define TMP0 zmm8
+%define TMP1 zmm9
+%define TMP2 zmm10
+%define TMP3 zmm11
+
+;
+; Word array W[] expands to 64 entries; WB[] expands to 68 entries.
+; WB[j] :
+;	tmp = WB[j - 16] ^ WB[j - 9] ^ rol32(WB[j - 3], 15);
+;	WB[j] = P1(tmp) ^ (rol32(WB[j - 13], 7)) ^ WB[j - 6];
+; W[j]:
+;	W[j] = WB[j] ^ WB[j + 4]
+;
+; We therefore use 20 ZMM registers (zmm12~zmm31) as a circular
+; window over WB: when WB[j + 4] is computed it reuses the register
+; slot of WB[j - 16], so 20 live WB entries suffice.
+;
+; The input lanes are also kept in zmm12~zmm27; once the WB values
+; are computed, the lane registers are free for reuse.
+%define WB0 zmm12
+%define WB1 zmm13
+%define WB2 zmm14
+%define WB3 zmm15
+%define WB4 zmm16
+%define WB5 zmm17
+%define WB6 zmm18
+%define WB7 zmm19
+
+%define WB8 zmm20
+%define WB9 zmm21
+%define WB10 zmm22
+%define WB11 zmm23
+%define WB12 zmm24
+%define WB13 zmm25
+%define WB14 zmm26
+%define WB15 zmm27
+
+%define WB16 zmm28
+%define WB17 zmm29
+%define WB18 zmm30
+%define WB19 zmm31
+
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+;
+; same as sha256
+;
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+	vshufps	%%r11, %%r6, %%t1, 0xDD	; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+	vshufps	%%r9, %%r8, %%r10, 0x88	; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+	vshufps	%%r8, %%r8, %%r10, 0xDD	; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+	vshufps	%%r6, %%r6, %%t1, 0x88	; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+	; use r10 in place of t0
+	vshufps	%%r10, %%r12, %%r13, 0x44	; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
+	vshufps	%%r12, %%r12, %%r13, 0xEE	; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
+	vshufps	%%t1, %%r14, %%r15, 0x44	; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
+	vshufps	%%r14, %%r14, %%r15, 0xEE	; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
+
+	vshufps	%%r15, %%r10, %%t1, 0xDD	; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
+	vshufps	%%r13, %%r12, %%r14, 0x88	; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
+	vshufps	%%r12, %%r12, %%r14, 0xDD	; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
+	vshufps	%%r10, %%r10, %%t1, 0x88	; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
+
+ vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+ ;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
+
+
+%macro ROTATE_ARGS 0
+ %xdefine TMP_ D
+ %xdefine D C
+ %xdefine C B
+ %xdefine B A
+ %xdefine A TMP3
+ %xdefine TMP3 TMP_
+
+ %xdefine TMP2_ H
+ %xdefine H G
+ %xdefine G F
+ %xdefine F E
+ %xdefine E TMP0
+ %xdefine TMP0 TMP2_
+%endmacro
+
+;
+; P() Save in TMP0
+; used TMP1
+%macro P 1
+%define %%A %1
+ vprold TMP0,%%A,9
+ vprold TMP1,%%A,17
+
+ vpternlogd TMP0,TMP1,%%A,0x96
+
+%endmacro
+
+;
+; P1() Save in TMP0
+; used TMP1
+%macro P1 1
+%define %%A %1
+
+ vprold TMP0,%%A,15
+ vprold TMP1,%%A,23
+
+ vpternlogd TMP0,TMP1,%%A,0x96
+%endmacro
+
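+;
+; Both P (P0 in the SM3 spec) and P1 reduce to a three-way XOR of
+; rotated copies of the input; as a C sketch (rol32 being a
+; hypothetical rotate-left helper):
+;
+;	uint32_t P0(uint32_t x) { return x ^ rol32(x, 9) ^ rol32(x, 17); }
+;	uint32_t P1(uint32_t x) { return x ^ rol32(x, 15) ^ rol32(x, 23); }
+;
+; vpternlogd with immediate 0x96 computes exactly this a ^ b ^ c
+; truth table in a single instruction.
+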
+;
+; FF_16() Save in TMP0
+;
+%macro FF_16 3
+%define %%X %1
+%define %%Y %2
+%define %%Z %3
+ ; I < 16 return (X ^ Y ^ Z)
+ vmovups TMP0,%%X
+ vpternlogd TMP0,%%Y,%%Z,0x96
+%endmacro
+
+
+;
+; FF_64() Save in TMP0
+; used TMP1
+%macro FF_64 3
+
+%define %%X %1
+%define %%Y %2
+%define %%Z %3
+	; I >= 16 return (x & y) | (x & z) | (y & z)
+ ; Same as (x & y) | (z & (x | y))
+ vporq TMP0,%%X,%%Y
+ vpandq TMP0,%%Z
+ vpandq TMP1,%%X,%%Y
+ vporq TMP0,TMP1
+%endmacro
+
+
+;
+; GG() Save in TMP0
+; used TMP1
+%macro GG_16 3
+%define %%X %1
+%define %%Y %2
+%define %%Z %3
+ ; I < 16 return (x ^ y ^ z)
+ vmovups TMP0,%%X
+ vpternlogd TMP0,%%Y,%%Z,0x96
+%endmacro
+
+%macro GG_64 3
+
+%define %%X %1
+%define %%Y %2
+%define %%Z %3
+
+	; I >= 16 return (x & y) | ((~x) & z)
+ vpandq TMP0,%%X,%%Y
+ vpandnd TMP1,%%X,%%Z
+ vporq TMP0,TMP1
+%endmacro
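+
+; For reference, the SM3 boolean functions that the FF_16/FF_64 and
+; GG_16/GG_64 macros implement, as a C sketch:
+;
+;	uint32_t FF(uint32_t x, uint32_t y, uint32_t z, int j) {
+;		return j < 16 ? x ^ y ^ z
+;			      : (x & y) | (x & z) | (y & z);	/* majority */
+;	}
+;	uint32_t GG(uint32_t x, uint32_t y, uint32_t z, int j) {
+;		return j < 16 ? x ^ y ^ z
+;			      : (x & y) | (~x & z);		/* choice */
+;	}
+;
+; FF_64 uses the equivalent form (x & y) | (z & (x | y)), which saves
+; one AND over the literal three-term majority.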
+
+;; void sm3_mb_x16_avx512(SM3_MB_ARGS_X16, uint32_t size)
+; arg 1 : pointer to input data
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sm3_mb_x16_avx512)
+sm3_mb_x16_avx512:
+ endbranch
+
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _rsp], rax
+
+ lea TBL, [TABLE]
+
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*64] ; mov unsigned
+ vmovups B, [DIGEST + 1*64]
+ vmovups C, [DIGEST + 2*64]
+ vmovups D, [DIGEST + 3*64]
+ vmovups E, [DIGEST + 4*64]
+ vmovups F, [DIGEST + 5*64]
+ vmovups G, [DIGEST + 6*64]
+ vmovups H, [DIGEST + 7*64]
+
+ xor IDX, IDX
+
+%assign cur_loop 0
+lloop:
+ ;; start message expand
+ ;; Transpose input data
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+	;; store the message block words B(i) into WB(0)...WB(15)
+	;; (registers zmm12...zmm27)
+
+ vmovups WB0,[inp0+IDX]
+ vmovups WB1,[inp1+IDX]
+ vmovups WB2,[inp2+IDX]
+ vmovups WB3,[inp3+IDX]
+ vmovups WB4,[inp4+IDX]
+ vmovups WB5,[inp5+IDX]
+ vmovups WB6,[inp6+IDX]
+ vmovups WB7,[inp7+IDX]
+
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+
+ vmovups WB8, [inp0+IDX]
+ vmovups WB9, [inp1+IDX]
+ vmovups WB10,[inp2+IDX]
+ vmovups WB11,[inp3+IDX]
+ vmovups WB12,[inp4+IDX]
+ vmovups WB13,[inp5+IDX]
+ vmovups WB14,[inp6+IDX]
+ vmovups WB15,[inp7+IDX]
+
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 64
+
+	; flat shuffle: transpose so each ZMM holds the same word from all 16 lanes
+ TRANSPOSE16 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, TMP0, TMP1
+
+ ; little endian to big endian
+ vmovdqa32 TMP0, [SHUF_MASK]
+ vpshufb WB0,TMP0
+ vpshufb WB1,TMP0
+ vpshufb WB2,TMP0
+ vpshufb WB3,TMP0
+ vpshufb WB4,TMP0
+ vpshufb WB5,TMP0
+ vpshufb WB6,TMP0
+ vpshufb WB7,TMP0
+ vpshufb WB8,TMP0
+ vpshufb WB9,TMP0
+ vpshufb WB10,TMP0
+ vpshufb WB11,TMP0
+ vpshufb WB12,TMP0
+ vpshufb WB13,TMP0
+ vpshufb WB14,TMP0
+ vpshufb WB15,TMP0
+
+%assign I 0
+%rep 12
+ %assign J I+4
+
+ ; (A <<< 12)
+ ; store in TMP0
+ vprold TMP0,A,12
+
+ ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7
+ ; (T(j) <<< j) store in TBL
+ ; SS1 store in TMP2
+ vmovdqa32 TMP2, [TBL + (I*64)]
+ vpaddd TMP2,E
+
+ vpaddd TMP2,TMP0
+ vprold TMP2,7
+
+ ; SS2 = SS1 ^ (A <<< 12)
+ ; SS2 store in TMP3
+ vpxord TMP3,TMP2,TMP0
+
+ ; TT2 = GG(E,F,G) + H + SS1 + WB(I)
+ GG_16 E,F,G
+ vpaddd TMP2,TMP0
+ vpaddd TMP2,H
+
+ vpaddd TMP2,APPEND(WB,I)
+
+ ; TT1 = FF(A,B,C) + D + SS2 + W(I)
+ ; TT1 store in TMP3
+ FF_16 A,B,C
+ vpaddd TMP3,TMP0
+ vpaddd TMP3,D
+ ; W(I) = WB(I) ^ W(I+4)
+ vpxord TMP0,APPEND(WB,I),APPEND(WB,J)
+ vpaddd TMP3,TMP0
+
+
+ ; D = C
+ ; C = B <<< 9
+ ; B = A
+ ; A = TT1
+ ; H = G
+ ; G = F <<< 19
+ ; F = E
+ ; E = P(TT2)
+ vmovups D,C
+ vprold B,9
+ vmovups C,B
+ vmovups B,A
+ vmovups A,TMP3
+ vmovups H,G
+ vprold F,19
+ vmovups G,F
+ vmovups F,E
+ P TMP2
+ vmovups E,TMP0
+
+ ;vprold B,9
+ ;vprold F,19
+ ;P TMP2
+ ;ROTATE_ARGS
+
+ %assign I (I+1)
+%endrep
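+
+; For reference, one round of the compression function as a scalar C
+; sketch of what the vector code above does per lane (T[j] is the
+; pre-rotated round constant loaded from TABLE):
+;
+;	uint32_t SS1 = rol32(rol32(A, 12) + E + T[j], 7);
+;	uint32_t SS2 = SS1 ^ rol32(A, 12);
+;	uint32_t TT1 = FF(A, B, C, j) + D + SS2 + (WB[j] ^ WB[j + 4]);
+;	uint32_t TT2 = GG(E, F, G, j) + H + SS1 + WB[j];
+;	D = C; C = rol32(B, 9); B = A; A = TT1;
+;	H = G; G = rol32(F, 19); F = E; E = P0(TT2);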
+
+
+;tmp = WB[j - 16] ^ WB[j - 9] ^ rol32(WB[j - 3], 15);
+;WB[j] = P1(tmp) ^ (rol32(WB[j - 13], 7)) ^ WB[j - 6];
+
+; rounds 12-15 here (the expansion of WB(16..19) starts in this block)
+%rep 4
+ %assign J I+4
+
+ %assign J_3 J-3
+ %assign J_16 J-16
+ %assign J_9 J-9
+ %assign J_13 J-13
+ %assign J_6 J-6
+
+	; calc WB(I+4)
+ vprold APPEND(WB,J),APPEND(WB,J_3),15
+ vpxord APPEND(WB,J),APPEND(WB,J_16)
+ vpxord APPEND(WB,J),APPEND(WB,J_9)
+
+ P1 APPEND(WB,J)
+
+ vprold APPEND(WB,J),APPEND(WB,J_13),7
+ vpxord APPEND(WB,J),TMP0
+ vpxord APPEND(WB,J),APPEND(WB,J_6)
+
+ ; (A <<< 12)
+ ; store in TMP0
+ vprold TMP0,A,12
+
+ ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7
+ ; (T(j) <<< j) store in TBL
+ ; SS1 store in TMP2
+ vmovdqa32 TMP2, [TBL + (I*64)]
+ vpaddd TMP2,E
+
+ vpaddd TMP2,TMP0
+ vprold TMP2,7
+
+ ; SS2 = SS1 ^ (A <<< 12)
+ ; SS2 store in TMP3
+ vpxord TMP3,TMP2,TMP0
+
+ ; TT2 = GG(E,F,G) + H + SS1 + WB(I)
+ GG_16 E,F,G
+ vpaddd TMP2,TMP0
+ vpaddd TMP2,H
+
+ vpaddd TMP2,APPEND(WB,I)
+
+ ; TT1 = FF(A,B,C) + D + SS2 + W(I)
+ ; TT1 store in TMP3
+ FF_16 A,B,C
+ vpaddd TMP3,TMP0
+ vpaddd TMP3,D
+ ; W(I) = WB(I) ^ W(I+4)
+ vpxord TMP0,APPEND(WB,I),APPEND(WB,J)
+ vpaddd TMP3,TMP0
+
+ ; D = C
+ ; C = B <<< 9
+ ; B = A
+ ; A = TT1
+ ; H = G
+ ; G = F <<< 19
+ ; F = E
+ ; E = P(TT2)
+ vmovups D,C
+ vprold B,9
+ vmovups C,B
+ vmovups B,A
+ vmovups A,TMP3
+ vmovups H,G
+ vprold F,19
+ vmovups G,F
+ vmovups F,E
+ P TMP2
+ vmovups E,TMP0
+
+ %assign I (I+1)
+%endrep
+
+%rep 48
+ %assign J (((I+4) % 20) + 20)
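+	; J is biased by +20 so the J-16 offset below stays non-negative
+	; before the % 20 reduction; J is reassigned to the real window
+	; slot once the offsets are computed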
+
+ %assign J_3 ((J-3) % 20)
+ %assign J_16 ((J-16) % 20)
+ %assign J_9 ((J-9) % 20)
+ %assign J_13 ((J-13) % 20)
+ %assign J_6 ((J-6) % 20)
+
+ %assign I_20 (I % 20)
+ %assign J (((I+4) % 20))
+
+ vprold APPEND(WB,J),APPEND(WB,J_3),15
+ vpxord APPEND(WB,J),APPEND(WB,J_16)
+ vpxord APPEND(WB,J),APPEND(WB,J_9)
+
+ P1 APPEND(WB,J)
+
+ vprold APPEND(WB,J),APPEND(WB,J_13),7
+ vpxord APPEND(WB,J),TMP0
+ vpxord APPEND(WB,J),APPEND(WB,J_6)
+
+ ; (A <<< 12)
+ ; store in TMP0
+ vprold TMP0,A,12
+
+ ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7
+ ; (T(j) <<< j) store in TBL
+ ; SS1 store in TMP2
+ vmovdqa32 TMP2, [TBL + (I*64)]
+ vpaddd TMP2,E
+
+ vpaddd TMP2,TMP0
+ vprold TMP2,7
+
+ ; SS2 = SS1 ^ (A <<< 12)
+ ; SS2 store in TMP3
+ vpxord TMP3,TMP2,TMP0
+
+ ; TT2 = GG(E,F,G) + H + SS1 + WB(I)
+ GG_64 E,F,G
+ vpaddd TMP2,TMP0
+ vpaddd TMP2,H
+
+ vpaddd TMP2,APPEND(WB,I_20)
+
+ ; TT1 = FF(A,B,C) + D + SS2 + W(I)
+ ; TT1 store in TMP3
+ FF_64 A,B,C
+ vpaddd TMP3,TMP0
+ vpaddd TMP3,D
+ ; W(I) = WB(I) ^ W(I+4)
+ vpxord TMP0,APPEND(WB,I_20),APPEND(WB,J)
+ vpaddd TMP3,TMP0
+
+ ; D = C
+ ; C = B <<< 9
+ ; B = A
+ ; A = TT1
+ ; H = G
+ ; G = F <<< 19
+ ; F = E
+ ; E = P(TT2)
+ vmovups D,C
+ vprold B,9
+ vmovups C,B
+ vmovups B,A
+ vmovups A,TMP3
+ vmovups H,G
+ vprold F,19
+ vmovups G,F
+ vmovups F,E
+ P TMP2
+ vmovups E,TMP0
+
+ %assign I (I+1)
+%endrep
+ ; Xor old digest
+ vpxord A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpxord B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpxord C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpxord D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpxord E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpxord F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpxord G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpxord H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ %assign cur_loop cur_loop+1
+ sub SIZE, 1
+ je last_loop
+
+ jmp lloop
+
+
+last_loop:
+
+%assign I 0
+%rep 8
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+ ; Write out digest
+ vmovups [DIGEST + 0*64], A
+ vmovups [DIGEST + 1*64], B
+ vmovups [DIGEST + 2*64], C
+ vmovups [DIGEST + 3*64], D
+ vmovups [DIGEST + 4*64], E
+ vmovups [DIGEST + 5*64], F
+ vmovups [DIGEST + 6*64], G
+ vmovups [DIGEST + 7*64], H
+
+
+ mov rsp, [rsp + _rsp]
+ ret
+
+
+section .data
+align 64
+TABLE:
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+
+
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+SHUF_MASK: dq 0x0405060700010203,0x0c0d0e0f08090a0b
+ dq 0x0405060700010203,0x0c0d0e0f08090a0b
+ dq 0x0405060700010203,0x0c0d0e0f08090a0b
+ dq 0x0405060700010203,0x0c0d0e0f08090a0b
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sm3_mb_x16_avx512
+no_sm3_mb_x16_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm
new file mode 100644
index 000000000..0c2c9cdee
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm
@@ -0,0 +1,711 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute 8-lane SM3 using AVX2
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rsi, rdi, r9-r15; eax; ymm0-15
+;; Windows clobbers: rax rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+ %define reg3 rcx
+ %define reg4 rdx
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+ %define reg3 rsi
+ %define reg4 rdi
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+%define SIZE INP_SIZE ; rsi
+
+%define IDX rax
+%define TBL reg3
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 reg4
+
+%define APPEND(a,b) a %+ b
+
+%define WB0 ymm0
+%define WB1 ymm1
+%define WB2 ymm2
+%define WB3 ymm3
+%define WB4 ymm4
+%define WB5 ymm5
+%define WB6 ymm6
+%define WB7 ymm7
+%define WB8 ymm8
+%define WB9 ymm9
+%define WB10 ymm10
+%define WB11 ymm11
+%define WB12 ymm12
+%define WB13 ymm13
+%define WB14 ymm14
+%define WB15 ymm15
+
+%define WBTMP0 ymm8
+%define WBTMP1 ymm9
+
+%define WBTMP2 ymm0
+%define WBTMP3 ymm1
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+%define F ymm5
+%define G ymm6
+%define H ymm7
+
+%define TMP0 ymm8
+%define TMP1 ymm9
+%define TMP2 ymm10
+
+; W(j) = WB(j) ^ WB(j+4)
+; Keep WB(j)...WB(j+4) in registers to reduce memory reads
+%define Wj0 ymm11
+%define Wj1 ymm12
+%define Wj2 ymm13
+%define Wj3 ymm14
+%define Wj4 ymm15
+
+
+%define SZ8 8*SM3_DIGEST_WORD_SIZE ; Size of one vector register
+%define PTR_SZ 8
+%define SM3_DIGEST_WORD_SIZE 4
+%define MAX_SM3_LANES 8
+%define NUM_SM3_DIGEST_WORDS 8
+%define SM3_DIGEST_ROW_SIZE (MAX_SM3_LANES * SM3_DIGEST_WORD_SIZE)
+
+; Define stack usage
+
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESZ mod 32 must be 32-8 = 24
+struc stack_frame
+ .data resb 16*SZ8
+ .digest resb 8*SZ8
+ .wbtmp resb 69*SZ8
+ .rsp resb 8
+endstruc
+%define FRAMESZ stack_frame_size
+%define _DIGEST stack_frame.digest
+%define _WBTMP stack_frame.wbtmp
+%define _RSP_SAVE stack_frame.rsp
+
+%define YTMP0 rsp + _WBTMP + 0*SZ8
+%define YTMP1 rsp + _WBTMP + 1*SZ8
+%define YTMP2 rsp + _WBTMP + 2*SZ8
+%define YTMP3 rsp + _WBTMP + 3*SZ8
+%define YTMP4 rsp + _WBTMP + 4*SZ8
+
+%define YTMPI rsp + _WBTMP + I*SZ8
+%define YTMPI_1 rsp + _WBTMP + (I - 1)*SZ8
+%define YTMPI_2 rsp + _WBTMP + (I - 2)*SZ8
+%define YTMPI_4 rsp + _WBTMP + (I - 4)*SZ8
+%define YTMPI5 rsp + _WBTMP + (I + 5)*SZ8
+
+
+%define VMOVPS vmovups
+
+;;;;;;;;
+; same as sha256
+;;;;;;;;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
+
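+; ROTATE_W renames the five Wj registers so the W window slides by
+; one: the retiring Wj0 becomes the new Wj4, which each round then
+; reloads from the stack (see the vmovdqa from YTMPI5 below).
+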
+%macro ROTATE_W 0
+
+ %xdefine TMP_ Wj0
+ %xdefine Wj0 Wj1
+ %xdefine Wj1 Wj2
+ %xdefine Wj2 Wj3
+ %xdefine Wj3 Wj4
+
+ %xdefine Wj4 TMP_
+
+%endmacro
+
+; ROTATE A,B,C,D
+%macro ROTATE_ARGS_AD 0
+
+ %xdefine TMP_ D
+ %xdefine D C
+ %xdefine C B
+ %xdefine B A
+ %xdefine A TMP2
+ %xdefine TMP2 TMP_
+
+%endmacro
+
+%macro ROTATE_ARGS_EH 0
+
+ %xdefine TMP_ H
+ %xdefine H G
+ %xdefine G F
+ %xdefine F E
+ %xdefine E TMP0
+ %xdefine TMP0 TMP_
+
+%endmacro
+
+%macro ROLD 3
+
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, %%imm
+ vpsrld %%reg, %%reg, (32-(%%imm))
+ vpor %%reg, %%reg, %%tmp
+
+%endmacro
+
+%macro ROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, %%imm
+ vpsrld %%reg, %%src, (32-(%%imm))
+ vpor %%reg, %%reg, %%tmp
+%endmacro
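+
+; AVX2 has no vector rotate instruction (vprold is AVX-512), so both
+; macros synthesize a 32-bit rotate-left from two shifts and an OR;
+; in C:
+;
+;	uint32_t rol32(uint32_t x, int n) {
+;		return (x << n) | (x >> (32 - n));
+;	}
+;
+; ROLD rotates a register in place; ROLD_nd (non-destructive) writes
+; the rotated source into a separate destination register.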
+
+;; void sm3_x8_avx2(SM3_ARGS *args, uint64_t bytes);
+;; arg 1 : STATE : pointer to input data
+;; arg 2 : INP_SIZE : size of input in blocks
+mk_global sm3_mb_x8_avx2,function,internal
+align 16
+sm3_mb_x8_avx2:
+ endbranch
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the YMM registers
+
+ ; save rsp, allocate 32-byte aligned for local variables
+ mov IDX, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~31
+ mov [rsp + _RSP_SAVE], IDX
+
+ lea TBL,[TABLE]
+
+ ;; load the address of each of the 8 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
+ mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ]
+ mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ]
+ mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ]
+ mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ]
+ mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ]
+ mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ]
+ mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ]
+
+ xor IDX, IDX
+
+%assign cur_loop 0
+lloop:
+
+ ;
+	; Precompute all of WB[0..67] and W[0..63] up front; this is
+	; better than computing WB/W round by round (the SHA256 AVX2
+	; code uses the round-by-round method).
+	;
+	; Memory accesses, precompute method:
+	;	read  : 68 + 3 * 52 (WB reads)
+	;	write : 52 (store WB[16..67])
+	; Memory accesses, round-by-round method:
+	;	read  : 48 * 6 (6 WB values read per round)
+	;	write : 52 + 64
+ ;
+ VMOVPS WB0,[inp0+IDX]
+ VMOVPS WB1,[inp1+IDX]
+ VMOVPS WB2,[inp2+IDX]
+ VMOVPS WB3,[inp3+IDX]
+ VMOVPS WB4,[inp4+IDX]
+ VMOVPS WB5,[inp5+IDX]
+ VMOVPS WB6,[inp6+IDX]
+ VMOVPS WB7,[inp7+IDX]
+
+ TRANSPOSE8 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WBTMP0, WBTMP1
+ vmovdqa WBTMP0, [SHUF_MASK]
+ vpshufb WB0,WBTMP0
+ vpshufb WB1,WBTMP0
+ vpshufb WB2,WBTMP0
+ vpshufb WB3,WBTMP0
+ vpshufb WB4,WBTMP0
+ vpshufb WB5,WBTMP0
+ vpshufb WB6,WBTMP0
+ vpshufb WB7,WBTMP0
+
+ vmovdqa [YTMP0], WB0
+ vmovdqa [YTMP1], WB1
+
+ VMOVPS WB8,[inp0+IDX + 32]
+ VMOVPS WB9,[inp1+IDX + 32]
+ VMOVPS WB10,[inp2+IDX + 32]
+ VMOVPS WB11,[inp3+IDX + 32]
+ VMOVPS WB12,[inp4+IDX + 32]
+ VMOVPS WB13,[inp5+IDX + 32]
+ VMOVPS WB14,[inp6+IDX + 32]
+ VMOVPS WB15,[inp7+IDX + 32]
+
+ TRANSPOSE8 WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, WBTMP2, WBTMP3
+ vmovdqa WBTMP2, [SHUF_MASK]
+ vpshufb WB8,WBTMP2
+ vpshufb WB9,WBTMP2
+ vpshufb WB10,WBTMP2
+ vpshufb WB11,WBTMP2
+ vpshufb WB12,WBTMP2
+ vpshufb WB13,WBTMP2
+ vpshufb WB14,WBTMP2
+ vpshufb WB15,WBTMP2
+
+; WB0 WB1 already saved
+%assign I 2
+%rep 14
+ vmovdqa [YTMPI], APPEND(WB,I)
+%assign I (I+1)
+%endrep
+
+ vmovdqa WB0 , [YTMP0]
+ vmovdqa WB1 , [YTMP1]
+
+; Calculate WB 16...67
+%rep 52
+ %assign J (I % 16)
+ %assign J_1 ((I-1) % 16) ;tmp to use
+ %assign J_2 ((I-2) % 16) ;tmp to use
+ %assign J_3 ((I-3) % 16)
+ %assign J_4 ((I-4) % 16) ;tmp to use
+ %assign J_9 ((I-9) % 16)
+ %assign J_13 ((I-13) % 16)
+ %assign J_6 ((I-6) % 16)
+
+ ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J_3)
+ vpxor APPEND(WB,J),APPEND(WB,J_2)
+ vpxor APPEND(WB,J),APPEND(WB,J_9)
+
+ ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J)
+ ROLD_nd APPEND(WB,J_1),23,APPEND(WB,J_4),APPEND(WB,J)
+ vpxor APPEND(WB,J),APPEND(WB,J_2)
+ vpxor APPEND(WB,J),APPEND(WB,J_1)
+
+ ROLD_nd APPEND(WB,J_2),7,APPEND(WB,J_1),APPEND(WB,J_13)
+ vpxor APPEND(WB,J),APPEND(WB,J_2)
+ vpxor APPEND(WB,J),APPEND(WB,J_6)
+
+ vmovdqa [YTMPI], APPEND(WB,J)
+
+ vmovdqa APPEND(WB,J_1), [YTMPI_1]
+ vmovdqa APPEND(WB,J_2), [YTMPI_2]
+ vmovdqa APPEND(WB,J_4), [YTMPI_4]
+
+ %assign I (I+1)
+%endrep
+
+ add IDX, 4*4*4
+
+	; A-H must be reloaded for every block: the WB precomputation
+	; above reused these registers
+ vmovdqu A,[STATE + 0*SM3_DIGEST_ROW_SIZE]
+ vmovdqu B,[STATE + 1*SM3_DIGEST_ROW_SIZE]
+ vmovdqu C,[STATE + 2*SM3_DIGEST_ROW_SIZE]
+ vmovdqu D,[STATE + 3*SM3_DIGEST_ROW_SIZE]
+ vmovdqu E,[STATE + 4*SM3_DIGEST_ROW_SIZE]
+ vmovdqu F,[STATE + 5*SM3_DIGEST_ROW_SIZE]
+ vmovdqu G,[STATE + 6*SM3_DIGEST_ROW_SIZE]
+ vmovdqu H,[STATE + 7*SM3_DIGEST_ROW_SIZE]
+
+ vmovdqa Wj0, [YTMP0]
+ vmovdqa Wj1, [YTMP1]
+ vmovdqa Wj2, [YTMP2]
+ vmovdqa Wj3, [YTMP3]
+ vmovdqa Wj4, [YTMP4]
+
+
+%assign I 0
+%rep 16
+
+ ; SS1 - TMP1
+ ROLD_nd TMP0,12,TMP1,A
+ vmovdqa TMP1, [TBL + (I*32)]
+ vpaddd TMP1,E
+ vpaddd TMP1,TMP0
+ ROLD TMP1,7,TMP2
+
+ ; SS2 - TMP2
+ vpxor TMP2,TMP1,TMP0
+
+ ; TT1
+ vpxor TMP0,A,B
+ vpxor TMP0,C
+ vpaddd TMP2,TMP0
+ vpaddd TMP2,D
+ vpxor TMP0,Wj0,Wj4
+ vpaddd TMP2,TMP0
+
+ ROLD B,9,TMP0
+
+ ; Rotate a,b,c,d first
+	; after P0(TT2), Wj0 will be released
+ ROTATE_ARGS_AD
+
+ ; P0(TT2)
+ vpxor TMP0,E,F
+ vpxor TMP0,G
+ vpaddd TMP0,H
+ vpaddd TMP0,TMP1
+ vpaddd TMP0,Wj0
+
+ ROLD_nd TMP1,9,TMP2,TMP0
+ ROLD_nd Wj0,17,TMP2,TMP0
+
+ vpxor TMP0,TMP1
+ vpxor TMP0,Wj0
+
+ ROLD F,19,TMP2
+
+ ROTATE_ARGS_EH
+
+ ROTATE_W
+
+ vmovdqa Wj4, [YTMPI5]
+ %assign I (I+1)
+%endrep
+
+%rep 48
+ ; SS1 - TMP1
+ ROLD_nd TMP0,12,TMP1,A
+ vmovdqa TMP1, [TBL + (I*32)]
+ vpaddd TMP1,E
+ vpaddd TMP1,TMP0
+ ROLD TMP1,7,TMP2
+
+ ; SS2 - TMP2
+ vpxor TMP2,TMP1,TMP0
+
+ ; SS2 + D first
+	; D will be released
+	; FF16/GG16 differ from FF64/GG64, so the register holding D
+	; must be released before TT1 is calculated
+ vpaddd TMP2,D
+
+ ; TT1
+ vpor TMP0,A,B
+ vpand TMP0,C
+ vpand D,A,B
+ vpor TMP0,D
+
+ vpaddd TMP2,TMP0
+ vpxor TMP0,Wj0,Wj4
+ vpaddd TMP2,TMP0
+
+ ROLD B,9,TMP0
+
+ ROTATE_ARGS_AD
+
+ ; P0(TT2)
+ vpaddd TMP1,H
+ vpaddd TMP1,Wj0
+
+ vpand TMP0,E,F
+ vpandn Wj0,E,G
+ vpor TMP0,Wj0
+
+ vpaddd TMP0,TMP1
+
+ ROLD_nd TMP1,9,TMP2,TMP0
+ ROLD_nd Wj0,17,TMP2,TMP0
+
+ vpxor TMP0,TMP1
+ vpxor TMP0,Wj0
+
+ ROLD F,19,TMP2
+
+ ROTATE_ARGS_EH
+
+ ROTATE_W
+ vmovdqa Wj4, [YTMPI5]
+ %assign I (I+1)
+%endrep
+
+ vpxor A, A, [STATE + 0*SM3_DIGEST_ROW_SIZE]
+ vpxor B, B, [STATE + 1*SM3_DIGEST_ROW_SIZE]
+ vpxor C, C, [STATE + 2*SM3_DIGEST_ROW_SIZE]
+ vpxor D, D, [STATE + 3*SM3_DIGEST_ROW_SIZE]
+ vpxor E, E, [STATE + 4*SM3_DIGEST_ROW_SIZE]
+ vpxor F, F, [STATE + 5*SM3_DIGEST_ROW_SIZE]
+ vpxor G, G, [STATE + 6*SM3_DIGEST_ROW_SIZE]
+ vpxor H, H, [STATE + 7*SM3_DIGEST_ROW_SIZE]
+
+ ; Write back to memory (state object) the transposed digest
+ vmovdqu [STATE + 0*SM3_DIGEST_ROW_SIZE],A
+ vmovdqu [STATE + 1*SM3_DIGEST_ROW_SIZE],B
+ vmovdqu [STATE + 2*SM3_DIGEST_ROW_SIZE],C
+ vmovdqu [STATE + 3*SM3_DIGEST_ROW_SIZE],D
+ vmovdqu [STATE + 4*SM3_DIGEST_ROW_SIZE],E
+ vmovdqu [STATE + 5*SM3_DIGEST_ROW_SIZE],F
+ vmovdqu [STATE + 6*SM3_DIGEST_ROW_SIZE],G
+ vmovdqu [STATE + 7*SM3_DIGEST_ROW_SIZE],H
+
+ sub SIZE, 1
+ je last_loop
+ jmp lloop
+
+last_loop:
+
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _args_data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [STATE + _args_data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [STATE + _args_data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [STATE + _args_data_ptr + 3*8], inp3
+ add inp4, IDX
+ mov [STATE + _args_data_ptr + 4*8], inp4
+ add inp5, IDX
+ mov [STATE + _args_data_ptr + 5*8], inp5
+ add inp6, IDX
+ mov [STATE + _args_data_ptr + 6*8], inp6
+ add inp7, IDX
+ mov [STATE + _args_data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ mov rsp, [rsp + _RSP_SAVE]
+ ret
+
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+align 64
+global TABLE
+TABLE:
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+
+SHUF_MASK: dq 0x0405060700010203,0x0c0d0e0f08090a0b
+ dq 0x0405060700010203,0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm
new file mode 100644
index 000000000..482876539
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm
@@ -0,0 +1,81 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+extern sm3_ctx_mgr_init_base
+extern sm3_ctx_mgr_submit_base
+extern sm3_ctx_mgr_flush_base
+
+extern sm3_ctx_mgr_init_avx2
+extern sm3_ctx_mgr_submit_avx2
+extern sm3_ctx_mgr_flush_avx2
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sm3_ctx_mgr_init_avx512
+ extern sm3_ctx_mgr_submit_avx512
+ extern sm3_ctx_mgr_flush_avx512
+%endif
+
+;;; *_mbinit is the initial value of *_dispatched, which is updated on the
+;;; first call; therefore *_dispatch_init executes only on the first call.
+
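+;;; As an illustrative C sketch of that lazy-dispatch pattern (names
+;;; below are hypothetical, not the generated symbols):
+;;;
+;;;	static void init_dispatch(SM3_HASH_CTX_MGR *mgr);
+;;;	static void (*init_ptr)(SM3_HASH_CTX_MGR *) = init_dispatch;
+;;;	static void init_dispatch(SM3_HASH_CTX_MGR *mgr)
+;;;	{
+;;;		init_ptr = cpu_has_avx2() ? sm3_ctx_mgr_init_avx2
+;;;					  : sm3_ctx_mgr_init_base;
+;;;		init_ptr(mgr);	/* resolve once, then forward */
+;;;	}
+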
+; Initialise symbols
+mbin_interface sm3_ctx_mgr_init
+mbin_interface sm3_ctx_mgr_submit
+mbin_interface sm3_ctx_mgr_flush
+
+;; SSE and AVX versions are not implemented yet
+%ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 sm3_ctx_mgr_init, sm3_ctx_mgr_init_base, \
+ sm3_ctx_mgr_init_base, sm3_ctx_mgr_init_base, sm3_ctx_mgr_init_avx2, \
+ sm3_ctx_mgr_init_avx512
+ mbin_dispatch_init6 sm3_ctx_mgr_submit, sm3_ctx_mgr_submit_base, \
+ sm3_ctx_mgr_submit_base, sm3_ctx_mgr_submit_base, sm3_ctx_mgr_submit_avx2, \
+ sm3_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 sm3_ctx_mgr_flush, sm3_ctx_mgr_flush_base, \
+ sm3_ctx_mgr_flush_base, sm3_ctx_mgr_flush_base, sm3_ctx_mgr_flush_avx2, \
+ sm3_ctx_mgr_flush_avx512
+%else
+ mbin_dispatch_init sm3_ctx_mgr_init, sm3_ctx_mgr_init_base, \
+ sm3_ctx_mgr_init_base,sm3_ctx_mgr_init_avx2
+ mbin_dispatch_init sm3_ctx_mgr_submit, sm3_ctx_mgr_submit_base, \
+ sm3_ctx_mgr_submit_base,sm3_ctx_mgr_submit_avx2
+ mbin_dispatch_init sm3_ctx_mgr_flush, sm3_ctx_mgr_flush_base, \
+ sm3_ctx_mgr_flush_base,sm3_ctx_mgr_flush_avx2
+%endif
+
+;;; func core, ver, snum
+slversion sm3_ctx_mgr_init, 00, 00, 2300
+slversion sm3_ctx_mgr_submit, 00, 00, 2301
+slversion sm3_ctx_mgr_flush, 00, 00, 2302
+
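
For readers unfamiliar with the multibinary mechanism used above: mbin_interface declares a public symbol backed by a function pointer, and mbin_dispatch_init points that pointer at a resolver that probes CPU features once and then rewrites the pointer to the best implementation, so later calls cost a single indirect jump. The C sketch below shows the equivalent logic for sm3_ctx_mgr_init only; detect_avx2() and detect_avx512() are hypothetical stand-ins for the CPUID checks done inside multibinary.asm, not ISA-L APIs.

    #include <stdbool.h>

    typedef struct SM3_HASH_CTX_MGR SM3_HASH_CTX_MGR;

    extern void sm3_ctx_mgr_init_base(SM3_HASH_CTX_MGR *mgr);
    extern void sm3_ctx_mgr_init_avx2(SM3_HASH_CTX_MGR *mgr);
    extern void sm3_ctx_mgr_init_avx512(SM3_HASH_CTX_MGR *mgr);

    extern bool detect_avx2(void);    /* hypothetical CPUID probe */
    extern bool detect_avx512(void);  /* hypothetical CPUID probe */

    static void sm3_ctx_mgr_init_dispatch_init(SM3_HASH_CTX_MGR *mgr);

    /* Starts out pointing at the resolver; rewritten on the first call. */
    static void (*sm3_ctx_mgr_init_dispatched)(SM3_HASH_CTX_MGR *) =
        sm3_ctx_mgr_init_dispatch_init;

    static void sm3_ctx_mgr_init_dispatch_init(SM3_HASH_CTX_MGR *mgr)
    {
            /* Pick the widest implementation the CPU supports, once. */
            if (detect_avx512())
                    sm3_ctx_mgr_init_dispatched = sm3_ctx_mgr_init_avx512;
            else if (detect_avx2())
                    sm3_ctx_mgr_init_dispatched = sm3_ctx_mgr_init_avx2;
            else
                    sm3_ctx_mgr_init_dispatched = sm3_ctx_mgr_init_base;
            sm3_ctx_mgr_init_dispatched(mgr);
    }

    void sm3_ctx_mgr_init(SM3_HASH_CTX_MGR *mgr)
    {
            sm3_ctx_mgr_init_dispatched(mgr);
    }
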
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c
new file mode 100644
index 000000000..be56350b3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c
@@ -0,0 +1,207 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+typedef uint32_t digest_sm3[SM3_DIGEST_NWORDS];
+
+#define MSGS 2
+#define NUM_JOBS 1000
+
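+/* Deterministic pseudo-random mapping of a job index onto one of the MSGS messages */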
+#define PSEUDO_RANDOM_NUM(seed) (((seed) * 5 + ((seed) * (seed)) / 64) % MSGS)
+
+static uint8_t msg1[] = "abc";
+static uint8_t msg2[] = "abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd";
+
+/* little endian */
+static digest_sm3 exp_result_digest1 = { 0x66c7f0f4, 0x62eeedd9, 0xd1f2d46b, 0xdc10e4e2,
+ 0x4167c487, 0x5cf2f7a2, 0x297da02b, 0x8f4ba8e0
+};
+
+/* little endian */
+static digest_sm3 exp_result_digest2 = { 0xdebe9ff9, 0x2275b8a1, 0x38604889, 0xc18e5a4d,
+ 0x6fdb70e5, 0x387e5765, 0x293dcba3, 0x9c0c5732
+};
+
+static uint8_t *msgs[MSGS] = { msg1, msg2 };
+
+static uint32_t *exp_result_digest[MSGS] = {
+ exp_result_digest1, exp_result_digest2
+};
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = exp_result_digest[t];
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j],
+ byteswap32(good[j]));
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = sm3_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = exp_result_digest[t];
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j],
+ byteswap32(good[j]));
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = exp_result_digest[k];
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j],
+ byteswap32(good[j]));
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ }
+ }
+ while (1) {
+ ctx = sm3_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = exp_result_digest[k];
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j],
+ byteswap32(good[j]));
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sm3 test: Pass\n");
+
+ return 0;
+}
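
As a usage note, the pattern this test exercises boils down to submit-then-flush. Below is a minimal sketch for hashing a single buffer, assuming only the sm3_mb.h API the test already includes; sm3_digest_one is a hypothetical helper name, not part of ISA-L.

    #include <stdlib.h>
    #include "sm3_mb.h"

    /* Hash one buffer as a single HASH_ENTIRE job; returns 0 on success. */
    int sm3_digest_one(const uint8_t *buf, uint32_t len,
                       uint32_t digest[SM3_DIGEST_NWORDS])
    {
            SM3_HASH_CTX_MGR *mgr = NULL;
            SM3_HASH_CTX ctx;
            SM3_HASH_CTX *done;
            int i, rc;

            if (posix_memalign((void *)&mgr, 16, sizeof(*mgr)) != 0 || mgr == NULL)
                    return -1;

            sm3_ctx_mgr_init(mgr);
            hash_ctx_init(&ctx);

            /* submit may return NULL while the job waits for more lanes ... */
            done = sm3_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);

            /* ... so flush until the manager hands the completed job back */
            while (done == NULL)
                    done = sm3_ctx_mgr_flush(mgr);

            for (i = 0; i < SM3_DIGEST_NWORDS; i++)
                    digest[i] = done->job.result_digest[i];

            rc = done->error ? -1 : 0;
            free(mgr);
            return rc;
    }

Note that the digest words come back in the same byte-swapped form the test compares against, so a caller that needs the canonical big-endian byte string must byteswap each word.
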
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c
new file mode 100644
index 000000000..4c0c54436
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c
@@ -0,0 +1,45 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <openssl/evp.h>
+
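+/* Compute the SM3 digest of buf with OpenSSL's EVP interface, used as a reference */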
+void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest)
+{
+ EVP_MD_CTX *md_ctx;
+ const EVP_MD *md;
+ unsigned int md_len;
+
+ md = EVP_sm3();
+ md_ctx = EVP_MD_CTX_new();
+ EVP_DigestInit_ex(md_ctx, md, NULL);
+ EVP_DigestUpdate(md_ctx, buf, length);
+ EVP_DigestFinal_ex(md_ctx, digest, &md_len);
+ EVP_MD_CTX_free(md_ctx);
+}
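
The tests that link this helper compare its output against the multi-buffer result. A sketch of that cross-check, mirroring the word-wise comparison sm3_ref_test.c uses above; sm3_matches_ossl is a hypothetical name introduced here for illustration.

    #include <stdint.h>
    #include <stddef.h>
    #include "sm3_mb.h"
    #include "endian_helper.h"

    extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);

    /* Returns 1 if the multi-buffer result in ctx matches OpenSSL's digest of buf. */
    static int sm3_matches_ossl(const unsigned char *buf, size_t len,
                                const SM3_HASH_CTX *ctx)
    {
            unsigned char ref[SM3_DIGEST_NWORDS * 4];
            uint32_t word;
            int i;

            sm3_ossl(buf, len, ref);

            for (i = 0; i < SM3_DIGEST_NWORDS; i++) {
                    /* Reassemble the canonical big-endian digest word from the byte stream */
                    word = ((uint32_t)ref[4 * i] << 24) | ((uint32_t)ref[4 * i + 1] << 16) |
                           ((uint32_t)ref[4 * i + 2] << 8) | (uint32_t)ref[4 * i + 3];

                    /* result_digest holds byteswapped words, as in sm3_ref_test.c */
                    if (ctx->job.result_digest[i] != byteswap32(word))
                            return 0;
            }
            return 1;
    }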