diff options
Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/sm3_mb')
35 files changed, 9038 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am new file mode 100644 index 000000000..8f8a3f4a6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am @@ -0,0 +1,121 @@ +######################################################################## +# Copyright(c) 2011-2020 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+######################################################################## + +lsrc_x86_64 += sm3_mb/sm3_ctx_base.c \ + sm3_mb/sm3_multibinary.asm + +lsrc_base_aliases += sm3_mb/sm3_ctx_base.c \ + sm3_mb/sm3_ctx_base_aliases.c + +lsrc_aarch64 += sm3_mb/sm3_ctx_base.c \ + sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c \ + sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S \ + sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c \ + sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c \ + sm3_mb/aarch64/sm3_mb_sm_x1.S \ + sm3_mb/aarch64/sm3_mb_sm_x2.S \ + sm3_mb/aarch64/sm3_mb_sm_x3.S \ + sm3_mb/aarch64/sm3_mb_sm_x4.S \ + sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c \ + sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c \ + sm3_mb/aarch64/sm3_mb_asimd_x1.S \ + sm3_mb/aarch64/sm3_mb_asimd_x4.S + + +src_include += -I $(srcdir)/sm3_mb + +extern_hdrs += include/sm3_mb.h \ + include/multi_buffer.h + +lsrc_x86_64 += sm3_mb/sm3_ctx_avx512.c \ + sm3_mb/sm3_mb_mgr_submit_avx512.asm \ + sm3_mb/sm3_mb_mgr_flush_avx512.asm \ + sm3_mb/sm3_mb_x16_avx512.asm + +lsrc_x86_64 += sm3_mb/sm3_ctx_avx2.c \ + sm3_mb/sm3_mb_mgr_submit_avx2.asm \ + sm3_mb/sm3_mb_mgr_flush_avx2.asm \ + sm3_mb/sm3_mb_x8_avx2.asm + +other_src += include/datastruct.asm \ + include/multibinary.asm \ + include/reg_sizes.asm \ + include/memcpy_inline.h \ + include/memcpy.asm \ + include/intrinreg.h \ + sm3_mb/sm3_job.asm \ + sm3_mb/sm3_mb_mgr_datastruct.asm \ + sm3_mb/sm3_test_helper.c + +check_tests += sm3_mb/sm3_ref_test + +unit_tests += sm3_mb/sm3_mb_rand_ssl_test \ + sm3_mb/sm3_mb_rand_test \ + sm3_mb/sm3_mb_rand_update_test \ + sm3_mb/sm3_mb_flush_test \ + sm3_mb/sm3_mb_test + +perf_tests += sm3_mb/sm3_mb_vs_ossl_perf \ + sm3_mb/sm3_mb_vs_ossl_shortage_perf + +sm3_mb_rand_ssl_test: LDLIBS += -lcrypto +sm3_mb_sm3_mb_rand_ssl_test_LDFLAGS = -lcrypto + +sm3_mb_rand_ssl_test: sm3_test_helper.o +sm3_mb_sm3_mb_rand_ssl_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la + +sm3_mb_rand_update_test: LDLIBS += -lcrypto +sm3_mb_sm3_mb_rand_update_test_LDFLAGS = 
-lcrypto + +sm3_mb_rand_update_test: sm3_test_helper.o +sm3_mb_sm3_mb_rand_update_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la + +sm3_mb_flush_test: LDLIBS += -lcrypto +sm3_mb_sm3_mb_flush_test_LDFLAGS = -lcrypto + +sm3_mb_flush_test: sm3_test_helper.o +sm3_mb_sm3_mb_flush_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la + +sm3_mb_rand_test: LDLIBS += -lcrypto +sm3_mb_sm3_mb_rand_test_LDFLAGS = -lcrypto + +sm3_mb_rand_test: sm3_test_helper.o +sm3_mb_sm3_mb_rand_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la + +sm3_mb_vs_ossl_perf: LDLIBS += -lcrypto +sm3_mb_sm3_mb_vs_ossl_perf_LDFLAGS = -lcrypto + +sm3_mb_vs_ossl_perf: sm3_test_helper.o +sm3_mb_sm3_mb_vs_ossl_perf_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la + +sm3_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto +sm3_mb_sm3_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto + +sm3_mb_vs_ossl_shortage_perf: sm3_test_helper.o +sm3_mb_sm3_mb_vs_ossl_shortage_perf_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c new file mode 100644 index 000000000..208a7414e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c @@ -0,0 +1,65 @@ +/********************************************************************** + Copyright(c) 2019-2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <aarch64_multibinary.h> + +DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_submit) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SM3) + return PROVIDER_INFO(sm3_ctx_mgr_submit_sm); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(sm3_ctx_mgr_submit_asimd); + + return PROVIDER_BASIC(sm3_ctx_mgr_submit); + +} + +DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_init) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SM3) + return PROVIDER_INFO(sm3_ctx_mgr_init_sm); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(sm3_ctx_mgr_init_asimd); + + return PROVIDER_BASIC(sm3_ctx_mgr_init); + +} + +DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_flush) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SM3) + return PROVIDER_INFO(sm3_ctx_mgr_flush_sm); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(sm3_ctx_mgr_flush_asimd); + + return PROVIDER_BASIC(sm3_ctx_mgr_flush); + +} diff --git 
a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S new file mode 100644 index 000000000..c7362de90 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S @@ -0,0 +1,387 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTmsgARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED msgARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED msgARRANTIES OF MERCHANTABILITY AND FITNESS FOR + dig_A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OmsgNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOmsgEVER CAUSED AND ON ANY + THEORY OF LIABILITY, msgHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERmsgISE) ARISING IN ANY msgAY OUT OF THE USE + OF THIS SOFTmsgARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8.2-a + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name\() .req q\reg + v\name\() .req v\reg + s\name\() .req s\reg +.endm + + job .req x0 + len .req x1 + data .req x2 + digest .req x0 + + msg0 .req w3 + msg1 .req w4 + msg2 .req w5 + msg3 .req w6 + msg4 .req w7 + + msg .req w9 + msgP .req w10 + SS1 .req w11 + SS2 .req w12 + TT1 .req w13 + TT2 .req w14 + Tj .req w15 + tmp0 .req w19 + tmp1 .req w20 + dig_A .req w21 + dig_B .req w22 + dig_C .req w23 + dig_D .req w24 + dig_E .req w25 + dig_F .req w26 + dig_G .req w27 + dig_H .req w28 + + declare_var_vector_reg dig0,0 + declare_var_vector_reg dig1,1 + declare_var_vector_reg dig0_bak,2 + declare_var_vector_reg dig1_bak,3 + declare_var_vector_reg vect_msg0,4 + declare_var_vector_reg vect_msg1,5 + declare_var_vector_reg vect_msg2,6 + declare_var_vector_reg vect_msg3,7 + + declare_var_vector_reg vect_msgP0,16 + declare_var_vector_reg vect_msgP1,17 + declare_var_vector_reg vect_msgP2,18 + + + + + + +// round 0-11 +.macro sm3_round_0 round:req + ldr msg, [sp,msg_off+4*\round\()] + ldr msgP,[sp,wp_off +4*\round\()] + add SS1,dig_E,Tj + ror TT1,dig_A,32-12 + add SS1,SS1,TT1 + ror SS1,SS1,32-7 //SS1 done + eor SS2,SS1,TT1 //SS2 done + eor TT1,dig_A,dig_B + eor TT2,dig_E,dig_F + add SS2,SS2,msgP + eor TT2,TT2,dig_G + add SS1,SS1,msg + eor TT1,TT1,dig_C + add SS2,SS2,dig_D + add SS1,SS1,dig_H + add TT1,TT1,SS2 + add TT2,TT2,SS1 + mov dig_D,dig_C + ror dig_C,dig_B,32-9 + mov dig_B,dig_A + mov dig_A,TT1 + eor TT1,TT2,TT2,ror (32-17) + mov dig_H,dig_G + ror dig_G,dig_F,32-19 + mov dig_F,dig_E + eor dig_E,TT1,TT2,ror(32-9) + ror Tj,Tj,(32-1) +.endm + +//round 12-15 +.macro sm3_round_12 round:req + ldr msg, [sp,msg_off+4*((\round\())%17)] + ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)] + ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)] + add SS1,dig_E,Tj + ror TT1,dig_A,32-12 + add SS1,SS1,TT1 + ror 
SS1,SS1,32-7 //SS1 done + eor SS2,SS1,TT1 //SS2 done + + eor msg0,msg0,msg1 + ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)] + eor TT1,dig_A,dig_B + eor TT2,dig_E,dig_F + add SS2,SS2,dig_D + eor TT2,TT2,dig_G + add SS1,SS1,msg + eor msg0,msg0,msg2,ror (32-15) + ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)] + ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)] + eor msg1,msg0,msg0,ror (32 -15) + eor TT1,TT1,dig_C + add TT1,TT1,SS2 + eor msg4,msg4,msg3, ror (32-7) + eor msg0,msg1,msg0, ror (32-23) + add SS1,SS1,dig_H + eor msg0,msg0,msg4 + add TT2,TT2,SS1 + mov dig_D,dig_C + str msg0,[sp,msg_off+4*((\round\()+4)%17)] + eor msgP,msg,msg0 + add TT1,TT1,msgP + ror dig_C,dig_B,32-9 + mov dig_B,dig_A + mov dig_A,TT1 + eor TT1,TT2,TT2,ror (32-17) + mov dig_H,dig_G + ror dig_G,dig_F,32-19 + mov dig_F,dig_E + eor dig_E,TT1,TT2,ror(32-9) + ror Tj,Tj,32-1 +.endm + +// round 16-62 +.macro sm3_round_16 round:req + ldr msg, [sp,msg_off+4*((\round\())%17)] + ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)] + ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)] + add SS1,dig_E,Tj + ror TT1,dig_A,32-12 + add SS1,SS1,TT1 + ror SS1,SS1,32-7 //SS1 done + eor SS2,SS1,TT1 //SS2 done + + eor msg0,msg0,msg1 + ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)] + orr TT1,dig_B,dig_C + and tmp0,dig_B,dig_C + + eor TT2,dig_F,dig_G + and TT1,TT1,dig_A + add SS2,SS2,dig_D + orr TT1,TT1,tmp0 + and TT2,TT2,dig_E + add SS1,SS1,msg + eor TT2,TT2,dig_G + + eor msg0,msg0,msg2,ror (32-15) + ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)] + ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)] + eor msg1,msg0,msg0,ror (32 -15) + add TT1,TT1,SS2 + eor msg4,msg4,msg3, ror (32-7) + eor msg0,msg1,msg0, ror (32-23) + add SS1,SS1,dig_H + eor msg0,msg0,msg4 + add TT2,TT2,SS1 + mov dig_D,dig_C + str msg0,[sp,msg_off+4*((\round\()+4)%17)] + eor msgP,msg,msg0 + add TT1,TT1,msgP + ror dig_C,dig_B,32-9 + mov dig_B,dig_A + mov dig_A,TT1 + eor TT1,TT2,TT2,ror (32-17) + mov dig_H,dig_G + ror dig_G,dig_F,32-19 + mov dig_F,dig_E + eor 
dig_E,TT1,TT2,ror(32-9) + ror Tj,Tj,32-1 +.endm + +//round 63 +.macro sm3_round_63 round:req + ldr msg, [sp,msg_off+4*((\round\())%17)] + ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)] + ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)] + add SS1,dig_E,Tj + ror TT1,dig_A,32-12 + add SS1,SS1,TT1 + ror SS1,SS1,32-7 //SS1 done + eor SS2,SS1,TT1 //SS2 done + eor msg0,msg0,msg1 + ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)] + orr TT1,dig_B,dig_C + and tmp0,dig_B,dig_C + eor TT2,dig_F,dig_G + and TT1,TT1,dig_A + add SS2,SS2,dig_D + orr TT1,TT1,tmp0 + and TT2,TT2,dig_E + add SS1,SS1,msg + eor TT2,TT2,dig_G + eor msg0,msg0,msg2,ror (32-15) + ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)] + ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)] + eor msg1,msg0,msg0,ror (32 -15) + add TT1,TT1,SS2 + eor msg4,msg4,msg3, ror (32-7) + eor msg0,msg1,msg0, ror (32-23) + add SS1,SS1,dig_H + eor msg0,msg0,msg4 + add TT2,TT2,SS1 + str msg0,[sp,msg_off+4*((\round\()+4)%17)] + eor msgP,msg,msg0 + add TT1,TT1,msgP + ins vdig0_bak.s[3],dig_C + ror dig_C,dig_B,32-9 + ins vdig0_bak.s[1],dig_A + ins vdig0_bak.s[0],TT1 + ins vdig0_bak.s[2],dig_C + eor TT1,TT2,TT2,ror (32-17) + ins vdig1_bak.s[3],dig_G + ror dig_G,dig_F,32-19 + ins vdig1_bak.s[1],dig_E + ins vdig1_bak.s[2],dig_G + eor dig_E,TT1,TT2,ror(32-9) + ins vdig1_bak.s[0],dig_E +.endm + + .set wp_off , 96 + .set msg_off, 96 + 12*4 +#define STACK_SIZE 224 + .global sm3_mb_asimd_x1 + .type sm3_mb_asimd_x1, %function +sm3_mb_asimd_x1: + stp x29,x30, [sp,-STACK_SIZE]! 
+ cmp len,0 + ldr data,[job],64 + ldp qdig0,qdig1,[digest] + stp x19, x20, [sp, 16] + stp x21, x22, [sp, 32] + rev32 vdig0.16b,vdig0.16b + stp x23, x24, [sp, 48] + rev32 vdig1.16b,vdig1.16b + stp x25, x26, [sp, 64] + stp x27, x28, [sp, 80] + ble .exit_func + +.start_loop: + + /** prepare first 12 round data **/ + ld1 {vvect_msg0.16b-vvect_msg3.16b},[data],64 + mov Tj, 17689 + umov dig_A,vdig0.s[0] + movk Tj, 0x79cc, lsl 16 + rev32 vvect_msg0.16b,vvect_msg0.16b + umov dig_B,vdig0.s[1] + rev32 vvect_msg1.16b,vvect_msg1.16b + umov dig_C,vdig0.s[2] + rev32 vvect_msg2.16b,vvect_msg2.16b + umov dig_D,vdig0.s[3] + rev32 vvect_msg3.16b,vvect_msg3.16b + umov dig_E,vdig1.s[0] + stp qvect_msg0,qvect_msg1,[sp,msg_off] + umov dig_F,vdig1.s[1] + stp qvect_msg2,qvect_msg3,[sp,msg_off+32] + umov dig_G,vdig1.s[2] + eor vvect_msgP0.16b,vvect_msg0.16b,vvect_msg1.16b + eor vvect_msgP1.16b,vvect_msg1.16b,vvect_msg2.16b + umov dig_H,vdig1.s[3] + stp qvect_msgP0,qvect_msgP1,[sp,wp_off] + eor vvect_msgP2.16b,vvect_msg2.16b,vvect_msg3.16b + str qvect_msgP2,[sp,wp_off+32] + + sm3_round_0 0 + sm3_round_0 1 + sm3_round_0 2 + sm3_round_0 3 + sm3_round_0 4 + sm3_round_0 5 + sm3_round_0 6 + sm3_round_0 7 + sm3_round_0 8 + sm3_round_0 9 + sm3_round_0 10 + sm3_round_0 11 + + sm3_round_12 12 + sm3_round_12 13 + sm3_round_12 14 + sm3_round_12 15 + mov Tj, 0x7a87 + movk Tj, 0x9d8a, lsl 16 + sm3_round_16 16 + sm3_round_16 17 + sm3_round_16 18 + sm3_round_16 19 + sm3_round_16 20 + sm3_round_16 21 + sm3_round_16 22 + sm3_round_16 23 + sm3_round_16 24 + sm3_round_16 25 + sm3_round_16 26 + sm3_round_16 27 + sm3_round_16 28 + sm3_round_16 29 + sm3_round_16 30 + sm3_round_16 31 + sm3_round_16 32 + sm3_round_16 33 + sm3_round_16 34 + sm3_round_16 35 + sm3_round_16 36 + sm3_round_16 37 + sm3_round_16 38 + sm3_round_16 39 + sm3_round_16 40 + sm3_round_16 41 + sm3_round_16 42 + sm3_round_16 43 + sm3_round_16 44 + sm3_round_16 45 + sm3_round_16 46 + sm3_round_16 47 + sm3_round_16 48 + sm3_round_16 49 + 
sm3_round_16 50 + sm3_round_16 51 + sm3_round_16 52 + sm3_round_16 53 + sm3_round_16 54 + sm3_round_16 55 + sm3_round_16 56 + sm3_round_16 57 + sm3_round_16 58 + sm3_round_16 59 + sm3_round_16 60 + sm3_round_16 61 + sm3_round_16 62 + sm3_round_63 63 + subs len,len,1 + eor vdig0.16b,vdig0.16b,vdig0_bak.16b + eor vdig1.16b,vdig1.16b,vdig1_bak.16b + bne .start_loop +.exit_func: + ldp x19, x20, [sp, 16] + rev32 vdig0.16b,vdig0.16b + ldp x21, x22, [sp, 32] + rev32 vdig1.16b,vdig1.16b + ldp x23, x24, [sp, 48] + stp qdig0,qdig1,[digest] + ldp x25, x26, [sp, 64] + ldp x27, x28, [sp, 80] + ldp x29, x30, [sp], STACK_SIZE + ret + .size sm3_mb_asimd_x1, .-sm3_mb_asimd_x1 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S new file mode 100644 index 000000000..975a07c7a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S @@ -0,0 +1,576 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTmsgARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED msgARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED msgARRANTIES OF MERCHANTABILITY AND FITNESS FOR + dig_A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OmsgNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOmsgEVER CAUSED AND ON ANY + THEORY OF LIABILITY, msgHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERmsgISE) ARISING IN ANY msgAY OUT OF THE USE + OF THIS SOFTmsgARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8.2-a + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name\() .req q\reg + v\name\() .req v\reg + s\name\() .req s\reg +.endm + + job0 .req x0 + job1 .req x1 + job2 .req x2 + job3 .req x3 + len .req x4 + + job0_data .req x5 + job1_data .req x6 + job2_data .req x7 + job3_data .req x9 + + job0_digest .req x0 + job1_digest .req x1 + job2_digest .req x2 + job3_digest .req x3 + job0_tmp .req x10 + job1_tmp .req x11 + job2_tmp .req x12 + job3_tmp .req x13 + const_adr .req x14 + + + declare_var_vector_reg msg0,0 + declare_var_vector_reg msg1,1 + declare_var_vector_reg msg2,2 + declare_var_vector_reg msg3,3 + declare_var_vector_reg msg4,4 + declare_var_vector_reg msg5,5 + declare_var_vector_reg msg6,6 + declare_var_vector_reg msg7,7 + declare_var_vector_reg msg8,8 + declare_var_vector_reg msg9,9 + declare_var_vector_reg msg10,10 + declare_var_vector_reg msg11,11 + declare_var_vector_reg msg12,12 + declare_var_vector_reg msg13,13 + declare_var_vector_reg msg14,14 + declare_var_vector_reg msg15,15 + declare_var_vector_reg msg16,16 + + + declare_var_vector_reg 
dig_A,24 + declare_var_vector_reg dig_B,25 + declare_var_vector_reg dig_C,26 + declare_var_vector_reg dig_D,27 + declare_var_vector_reg dig_E,28 + declare_var_vector_reg dig_F,29 + declare_var_vector_reg dig_G,30 + declare_var_vector_reg dig_H,31 + + declare_var_vector_reg TT1,17 + declare_var_vector_reg TT2,18 + declare_var_vector_reg SS1,19 + declare_var_vector_reg SS2,20 + declare_var_vector_reg tmp0,21 + declare_var_vector_reg word_pair,23 + declare_var_vector_reg Tj,22 + + +.macro rol32 target:req,reg:req,bit:req + ushr v\target\().4s,v\reg\().4s,32 - \bit + sli v\target\().4s,v\reg\().4s,\bit +.endm + +// round 0-11 +.macro sm3_round_0 round:req,wp:req + + ushr vtmp0.4s,vdig_A.4s,32 - 12 + + add vSS1.4s,vdig_E.4s,vTj.4s + sli vtmp0.4s,vdig_A.4s,12 + rev32 vmsg\round\().16b,vmsg\round\().16b + rev32 vmsg\wp\().16b,vmsg\wp\().16b + add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,TT1,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b + + eor vTT1.16b,vdig_A.16b,vdig_B.16b + eor vTT2.16b,vdig_E.16b,vdig_F.16b + eor vTT1.16b,vTT1.16b,vdig_C.16b + eor vTT2.16b,vTT2.16b,vdig_G.16b + + add vSS1.4s,vSS1.4s,vmsg\round\().4s + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + ushr vtmp0.4s,vTj.4s,32-1 + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + sli vtmp0.4s,vTj.4s,1 + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + mov vTj.16b,vtmp0.16b + //D=C + mov vdig_D.16b,vdig_C.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + //B=A + mov vdig_B.16b,vdig_A.16b + //A=TT1 + mov vdig_A.16b,vTT1.16b + // H=G + mov vdig_H.16b,vdig_G.16b + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + //F = E + mov vdig_F.16b,vdig_E.16b + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, 
vdig_E.16b, vSS2.16b + +.endm + + +.macro sm3_round_4 round:req,wp:req + + ushr vtmp0.4s,vdig_A.4s,32 - 12 + add vSS1.4s,vdig_E.4s,vTj.4s + sli vtmp0.4s,vdig_A.4s,12 + rev32 vmsg\wp\().16b,vmsg\wp\().16b + add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,TT1,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b + eor vTT1.16b,vdig_A.16b,vdig_B.16b + eor vTT2.16b,vdig_E.16b,vdig_F.16b + eor vTT1.16b,vTT1.16b,vdig_C.16b + eor vTT2.16b,vTT2.16b,vdig_G.16b + add vSS1.4s,vSS1.4s,vmsg\round\().4s + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + ushr vtmp0.4s,vTj.4s,32-1 + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + sli vtmp0.4s,vTj.4s,1 + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + mov vTj.16b,vtmp0.16b + //D=C + mov vdig_D.16b,vdig_C.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + //B=A + mov vdig_B.16b,vdig_A.16b + //A=TT1 + mov vdig_A.16b,vTT1.16b + // H=G + mov vdig_H.16b,vdig_G.16b + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + //F = E + mov vdig_F.16b,vdig_E.16b + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b + +.endm + +//round 12-15 +.macro sm3_round_12 round:req,plus_4:req,m0,m1,m2,m3,m4 + rol32 msg\plus_4,msg\m2,15 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b + rol32 tmp0,msg\plus_4,15 + rol32 word_pair,msg\plus_4,23 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b + rol32 tmp0,msg\m3,7 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b + ushr vtmp0.4s,vdig_A.4s,32 - 12 + sli vtmp0.4s,vdig_A.4s,12 + add 
vSS1.4s,vdig_E.4s,vTj.4s + add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,SS2,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b + eor vTT1.16b,vdig_A.16b,vdig_B.16b + eor vTT1.16b,vTT1.16b,vdig_C.16b + eor vTT2.16b,vdig_E.16b,vdig_F.16b + eor vTT2.16b,vTT2.16b,vdig_G.16b + add vSS1.4s,vSS1.4s,vmsg\round\().4s + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + ushr vtmp0.4s,vTj.4s,32-1 + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + sli vtmp0.4s,vTj.4s,1 + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + mov vTj.16b,vtmp0.16b + //D=C + mov vdig_D.16b,vdig_C.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + //B=A + mov vdig_B.16b,vdig_A.16b + //A=TT1 + mov vdig_A.16b,vTT1.16b + // H=G + mov vdig_H.16b,vdig_G.16b + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + //F = E + mov vdig_F.16b,vdig_E.16b + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b +.endm + +// round 16-62 +.macro sm3_round_16 round:req,plus_4:req,m0,m1,m2,m3,m4 + rol32 msg\plus_4,msg\m2,15 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b + rol32 tmp0,msg\plus_4,15 + rol32 word_pair,msg\plus_4,23 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b + rol32 tmp0,msg\m3,7 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b + ushr vtmp0.4s,vdig_A.4s,32 - 12 + sli vtmp0.4s,vdig_A.4s,12 + add vSS1.4s,vdig_E.4s,vTj.4s + add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,SS2,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b + mov 
vTT2.16b,vdig_E.16b + orr vTT1.16b,vdig_B.16b,vdig_C.16b + and vtmp0.16b,vdig_B.16b,vdig_C.16b + bsl vTT2.16b,vdig_F.16b,vdig_G.16b + and vTT1.16b,vTT1.16b,vdig_A.16b + add vSS1.4s,vSS1.4s,vmsg\round\().4s + orr vTT1.16b,vTT1.16b,vtmp0.16b + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + ushr vtmp0.4s,vTj.4s,32-1 + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + sli vtmp0.4s,vTj.4s,1 + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + mov vTj.16b,vtmp0.16b + //D=C + mov vdig_D.16b,vdig_C.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + //B=A + mov vdig_B.16b,vdig_A.16b + //A=TT1 + mov vdig_A.16b,vTT1.16b + // H=G + mov vdig_H.16b,vdig_G.16b + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + //F = E + mov vdig_F.16b,vdig_E.16b + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b +.endm + +//round 63 +.macro sm3_round_63 round:req,plus_4:req,m0,m1,m2,m3,m4 + rol32 msg\plus_4,msg\m2,15 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b + rol32 tmp0,msg\plus_4,15 + rol32 word_pair,msg\plus_4,23 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b + rol32 tmp0,msg\m3,7 + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b + eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b + ushr vtmp0.4s,vdig_A.4s,32 - 12 + sli vtmp0.4s,vdig_A.4s,12 + add vSS1.4s,vdig_E.4s,vTj.4s + add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,SS2,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b + + ldp qmsg0,qmsg1,[sp,dig_off+ 0] + mov vTT2.16b,vdig_E.16b + ldp qmsg2,qmsg3,[sp,dig_off+ 32] + orr vTT1.16b,vdig_B.16b,vdig_C.16b + ldp 
qmsg4,qmsg5,[sp,dig_off+ 64] + and vtmp0.16b,vdig_B.16b,vdig_C.16b + bsl vTT2.16b,vdig_F.16b,vdig_G.16b + ldp qmsg6,qmsg7,[sp,dig_off+ 96] + and vTT1.16b,vTT1.16b,vdig_A.16b + add vSS1.4s,vSS1.4s,vmsg\round\().4s + orr vTT1.16b,vTT1.16b,vtmp0.16b + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + //D=C + eor vdig_D.16b,vdig_C.16b,vmsg3.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + eor vdig_C.16b,vdig_C.16b,vmsg2.16b + //B=A + eor vdig_B.16b,vdig_A.16b,vmsg1.16b + stp qdig_C,qdig_D,[sp,dig_off+ 32] + //A=TT1 + eor vdig_A.16b,vTT1.16b,vmsg0.16b + // H=G + eor vdig_H.16b,vdig_G.16b,vmsg7.16b + stp qdig_A,qdig_B,[sp,dig_off+ 0] + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + eor vdig_G.16b,vdig_G.16b,vmsg6.16b + //F = E + eor vdig_F.16b,vdig_E.16b,vmsg5.16b + stp qdig_G,qdig_H,[sp,dig_off+ 96] + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b + eor vdig_E.16b, vdig_E.16b, vmsg4.16b + stp qdig_E,qdig_F,[sp,dig_off+ 64] +.endm + + .set dig_off , 80 + +#define STACK_SIZE 224 + .global sm3_mb_asimd_x4 + .type sm3_mb_asimd_x4, %function +sm3_mb_asimd_x4: + stp x29,x30, [sp,-STACK_SIZE]! 
+ cmp len,0 + //push d8~d15 + ldr job0_data, [job0],64 + stp d8,d9, [sp,16] + ldr job1_data, [job1],64 + stp d10,d11,[sp,32] + ldr job2_data, [job2],64 + stp d12,d13,[sp,48] + ldr job3_data, [job3],64 + stp d14,d15,[sp,64] + ble .exit_func + + mov job0_tmp,job0_digest + mov job1_tmp,job1_digest + mov job2_tmp,job2_digest + mov job3_tmp,job3_digest + //load digests + ld4 {vdig_A.s-vdig_D.s}[0],[job0_tmp],16 + ld4 {vdig_A.s-vdig_D.s}[1],[job1_tmp],16 + ld4 {vdig_A.s-vdig_D.s}[2],[job2_tmp],16 + adrp const_adr, .consts + ld4 {vdig_A.s-vdig_D.s}[3],[job3_tmp],16 + add const_adr, const_adr, #:lo12:.consts + ld4 {vdig_E.s-vdig_H.s}[0],[job0_tmp] + rev32 vdig_A.16b,vdig_A.16b + ld4 {vdig_E.s-vdig_H.s}[1],[job1_tmp] + rev32 vdig_B.16b,vdig_B.16b + ld4 {vdig_E.s-vdig_H.s}[2],[job2_tmp] + rev32 vdig_C.16b,vdig_C.16b + ld4 {vdig_E.s-vdig_H.s}[3],[job3_tmp] + rev32 vdig_D.16b,vdig_D.16b + stp qdig_A,qdig_B,[sp,dig_off+ 0] + rev32 vdig_E.16b,vdig_E.16b + rev32 vdig_F.16b,vdig_F.16b + stp qdig_C,qdig_D,[sp,dig_off+ 32] + rev32 vdig_G.16b,vdig_G.16b + rev32 vdig_H.16b,vdig_H.16b + stp qdig_E,qdig_F,[sp,dig_off+ 64] + stp qdig_G,qdig_H,[sp,dig_off+ 96] + +.start_loop: + ld4 {vmsg0.s-vmsg3.s}[0],[job0_data],16 + ld4 {vmsg0.s-vmsg3.s}[1],[job1_data],16 + ld4 {vmsg0.s-vmsg3.s}[2],[job2_data],16 + ld4 {vmsg0.s-vmsg3.s}[3],[job3_data],16 + ld4 {vmsg4.s-vmsg7.s}[0],[job0_data],16 + ld4 {vmsg4.s-vmsg7.s}[1],[job1_data],16 + ld4 {vmsg4.s-vmsg7.s}[2],[job2_data],16 + ld4 {vmsg4.s-vmsg7.s}[3],[job3_data],16 + ld4 {vmsg8.s-vmsg11.16b}[0],[job0_data],16 + ldr qTj,[const_adr] + + sm3_round_0 0, 4 + + ld4 {vmsg8.s-vmsg11.s}[1],[job1_data],16 + sm3_round_0 1, 5 + + ld4 {vmsg8.s-vmsg11.s}[2],[job2_data],16 + sm3_round_0 2, 6 + ld4 {vmsg8.s-vmsg11.s}[3],[job3_data],16 + sm3_round_0 3, 7 + + ld4 {vmsg12.s-vmsg15.s}[0],[job0_data],16 + + sm3_round_4 4, 8 + ld4 {vmsg12.s-vmsg15.s}[1],[job1_data],16 + sm3_round_4 5, 9 + ld4 {vmsg12.s-vmsg15.s}[2],[job2_data],16 + sm3_round_4 6,10 + ld4 
{vmsg12.s-vmsg15.s}[3],[job3_data],16 + sm3_round_4 7,11 + sm3_round_4 8,12 + sm3_round_4 9,13 + sm3_round_4 10,14 + sm3_round_4 11,15 + + sm3_round_12 12,16, 0, 7,13, 3,10 //12 + sm3_round_12 13, 0, 1, 8,14, 4,11 //13 + sm3_round_12 14, 1, 2, 9,15, 5,12 //14 + sm3_round_12 15, 2, 3,10,16, 6,13 //15 + + ldr qTj,[const_adr,16] + sm3_round_16 16, 3, 4,11, 0, 7,14 //16 +#if 0 + stp sdig_A,sdig_B,[job0_digest] + stp sdig_C,sdig_D,[job0_digest,8] + stp sdig_E,sdig_F,[job0_digest,16] + stp sdig_G,sdig_H,[job0_digest,24] + b .exit_func +#endif + sm3_round_16 0, 4, 5,12, 1, 8,15 //17 + + sm3_round_16 1, 5, 6,13, 2, 9,16 //18 + sm3_round_16 2, 6, 7,14, 3,10, 0 //19 + sm3_round_16 3, 7, 8,15, 4,11, 1 //20 + sm3_round_16 4, 8, 9,16, 5,12, 2 //21 + sm3_round_16 5, 9,10, 0, 6,13, 3 //22 + sm3_round_16 6,10,11, 1, 7,14, 4 //23 + sm3_round_16 7,11,12, 2, 8,15, 5 //24 + sm3_round_16 8,12,13, 3, 9,16, 6 //25 + sm3_round_16 9,13,14, 4,10, 0, 7 //26 + sm3_round_16 10,14,15, 5,11, 1, 8 //27 + sm3_round_16 11,15,16, 6,12, 2, 9 //28 + sm3_round_16 12,16, 0, 7,13, 3,10 //29 + sm3_round_16 13, 0, 1, 8,14, 4,11 //30 + sm3_round_16 14, 1, 2, 9,15, 5,12 //31 + sm3_round_16 15, 2, 3,10,16, 6,13 //32 + sm3_round_16 16, 3, 4,11, 0, 7,14 //33 + sm3_round_16 0, 4, 5,12, 1, 8,15 //34 + sm3_round_16 1, 5, 6,13, 2, 9,16 //35 + sm3_round_16 2, 6, 7,14, 3,10, 0 //36 + sm3_round_16 3, 7, 8,15, 4,11, 1 //37 + sm3_round_16 4, 8, 9,16, 5,12, 2 //38 + sm3_round_16 5, 9,10, 0, 6,13, 3 //39 + sm3_round_16 6,10,11, 1, 7,14, 4 //40 + sm3_round_16 7,11,12, 2, 8,15, 5 //41 + sm3_round_16 8,12,13, 3, 9,16, 6 //42 + sm3_round_16 9,13,14, 4,10, 0, 7 //43 + sm3_round_16 10,14,15, 5,11, 1, 8 //44 + sm3_round_16 11,15,16, 6,12, 2, 9 //45 + sm3_round_16 12,16, 0, 7,13, 3,10 //46 + sm3_round_16 13, 0, 1, 8,14, 4,11 //47 + sm3_round_16 14, 1, 2, 9,15, 5,12 //48 + sm3_round_16 15, 2, 3,10,16, 6,13 //49 + sm3_round_16 16, 3, 4,11, 0, 7,14 //50 + sm3_round_16 0, 4, 5,12, 1, 8,15 //51 + sm3_round_16 1, 5, 6,13, 2, 9,16 //52 
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //53 + sm3_round_16 3, 7, 8,15, 4,11, 1 //54 + sm3_round_16 4, 8, 9,16, 5,12, 2 //55 + sm3_round_16 5, 9,10, 0, 6,13, 3 //56 + sm3_round_16 6,10,11, 1, 7,14, 4 //57 + sm3_round_16 7,11,12, 2, 8,15, 5 //58 + sm3_round_16 8,12,13, 3, 9,16, 6 //59 + sm3_round_16 9,13,14, 4,10, 0, 7 //60 + sm3_round_16 10,14,15, 5,11, 1, 8 //61 + sm3_round_16 11,15,16, 6,12, 2, 9 //62 + sm3_round_63 12,16, 0, 7,13, 3,10 //63 + + subs len,len,1 + bne .start_loop + + //save digests with big endian + rev32 vdig_A.16b,vdig_A.16b + rev32 vdig_B.16b,vdig_B.16b + rev32 vdig_C.16b,vdig_C.16b + rev32 vdig_D.16b,vdig_D.16b + st4 {vdig_A.s-vdig_D.s}[0],[job0_digest],16 + rev32 vdig_E.16b,vdig_E.16b + rev32 vdig_F.16b,vdig_F.16b + st4 {vdig_A.s-vdig_D.s}[1],[job1_digest],16 + rev32 vdig_G.16b,vdig_G.16b + rev32 vdig_H.16b,vdig_H.16b + st4 {vdig_A.s-vdig_D.s}[2],[job2_digest],16 + st4 {vdig_A.s-vdig_D.s}[3],[job3_digest],16 + st4 {vdig_E.s-vdig_H.s}[0],[job0_digest] + st4 {vdig_E.s-vdig_H.s}[1],[job1_digest] + st4 {vdig_E.s-vdig_H.s}[2],[job2_digest] + st4 {vdig_E.s-vdig_H.s}[3],[job3_digest] + +.exit_func: + ldp d8, d9, [sp,16] + ldp d10,d11,[sp,32] + ldp d12,d13,[sp,48] + ldp d14,d15,[sp,64] + ldp x29, x30, [sp], STACK_SIZE + ret +.consts: + .word 0x79cc4519 + .word 0x79cc4519 + .word 0x79cc4519 + .word 0x79cc4519 + .word 0x9d8a7a87 + .word 0x9d8a7a87 + .word 0x9d8a7a87 + .word 0x9d8a7a87 + .size sm3_mb_asimd_x4, .-sm3_mb_asimd_x4 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c new file mode 100644 index 000000000..6e1dff45e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c @@ -0,0 +1,246 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdint.h> +#include <string.h> +#include "sm3_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" +#define SM3_LOG2_BLOCK_SIZE 6 +void sm3_mb_mgr_init_asimd(SM3_MB_JOB_MGR * state); +SM3_JOB *sm3_mb_mgr_submit_asimd(SM3_MB_JOB_MGR * state, SM3_JOB * job); +SM3_JOB *sm3_mb_mgr_flush_asimd(SM3_MB_JOB_MGR * state); +static inline void hash_init_digest(SM3_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len); +static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx); + +void sm3_ctx_mgr_init_asimd(SM3_HASH_CTX_MGR * mgr) +{ + sm3_mb_mgr_init_asimd(&mgr->mgr); +} + +SM3_HASH_CTX *sm3_ctx_mgr_submit_asimd(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? 
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job); + } + } + + return sm3_ctx_mgr_resubmit(mgr, ctx); +} + +SM3_HASH_CTX *sm3_ctx_mgr_flush_asimd(SM3_HASH_CTX_MGR * mgr) +{ + SM3_HASH_CTX *ctx; + + while (1) { + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_asimd(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sm3_ctx_mgr_resubmit(mgr, ctx); + + // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned. 
+ if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop. + } +} + +static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SM3_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SM3_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define cpu_to_be32(v) (((v&0xff000000)>>24) | ((v&0xff0000)>>8) | ((v&0xff00)<<8) | ((v&0xff)<<24)) +#else +#define cpu_to_be32(v) +#endif +static inline void hash_init_digest(SM3_WORD_T * digest) +{ + static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] = + { cpu_to_be32(0x7380166f), cpu_to_be32(0x4914b2b9), + cpu_to_be32(0x172442d7), cpu_to_be32(0xda8a0600), + cpu_to_be32(0xa96f30bc), cpu_to_be32(0x163138aa), + cpu_to_be32(0xe38dee4d), cpu_to_be32(0xb0fb0e4e) + }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) + 1 + + SM3_PADLENGTHFIELD_SIZE; + +#if SM3_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c new file mode 100644 index 000000000..5af9ead38 --- /dev/null +++ 
b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c @@ -0,0 +1,241 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdint.h> +#include <string.h> +#include "sm3_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" +#define SM3_LOG2_BLOCK_SIZE 6 +void sm3_mb_mgr_init_sm(SM3_MB_JOB_MGR * state); +SM3_JOB *sm3_mb_mgr_submit_sm(SM3_MB_JOB_MGR * state, SM3_JOB * job); +SM3_JOB *sm3_mb_mgr_flush_sm(SM3_MB_JOB_MGR * state); +static inline void hash_init_digest(SM3_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len); +static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx); + +void sm3_ctx_mgr_init_sm(SM3_HASH_CTX_MGR * mgr) +{ + sm3_mb_mgr_init_sm(&mgr->mgr); +} + +SM3_HASH_CTX *sm3_ctx_mgr_submit_sm(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? 
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr, &ctx->job); + } + } + + return sm3_ctx_mgr_resubmit(mgr, ctx); +} + +SM3_HASH_CTX *sm3_ctx_mgr_flush_sm(SM3_HASH_CTX_MGR * mgr) +{ + SM3_HASH_CTX *ctx; + + while (1) { + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_sm(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sm3_ctx_mgr_resubmit(mgr, ctx); + + // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned. 
+ if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop. + } +} + +static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SM3_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SM3_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SM3_WORD_T * digest) +{ + static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] = + { to_be32(0x7380166f), to_be32(0x4914b2b9), + to_be32(0x172442d7), to_be32(0xda8a0600), + to_be32(0xa96f30bc), to_be32(0x163138aa), + to_be32(0xe38dee4d), to_be32(0xb0fb0e4e) + }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) + 1 + + SM3_PADLENGTHFIELD_SIZE; + +#if SM3_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c new file mode 100644 index 000000000..48a0d4d0e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c @@ -0,0 +1,188 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <stddef.h> +#include <sm3_mb.h> +#include <assert.h> + +#ifndef max +#define max(a,b) (((a) > (b)) ? (a) : (b)) +#endif + +#ifndef min +#define min(a,b) (((a) < (b)) ? 
(a) : (b)) +#endif + +#define SM3_MB_CE_MAX_LANES 4 +void sm3_mb_asimd_x4(SM3_JOB *, SM3_JOB *, SM3_JOB *, SM3_JOB *, int); +void sm3_mb_asimd_x1(SM3_JOB *, int); + +#define LANE_IS_NOT_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FREE(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL) +#define LANE_IS_INVALID(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL) +void sm3_mb_mgr_init_asimd(SM3_MB_JOB_MGR * state) +{ + unsigned int i; + + state->unused_lanes = 0xf; + state->num_lanes_inuse = 0; + for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) { + state->unused_lanes <<= 4; + state->unused_lanes |= SM3_MB_CE_MAX_LANES - 1 - i; + state->lens[i] = i; + state->ldata[i].job_in_lane = 0; + } + + //lanes > SM3_MB_CE_MAX_LANES is invalid lane + for (; i < SM3_MAX_LANES; i++) { + state->lens[i] = 0xf; + state->ldata[i].job_in_lane = 0; + } +} + +static int sm3_mb_mgr_do_jobs(SM3_MB_JOB_MGR * state) +{ + int lane_idx, len, i; + + if (state->num_lanes_inuse == 0) { + return -1; + } + if (state->num_lanes_inuse == 4) { + len = min(min(state->lens[0], state->lens[1]), + min(state->lens[2], state->lens[3])); + lane_idx = len & 0xf; + len &= ~0xf; + sm3_mb_asimd_x4(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, + state->ldata[2].job_in_lane, + state->ldata[3].job_in_lane, len >> 4); + //only return the min length job + for (i = 0; i < SM3_MAX_LANES; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + state->lens[i] -= len; + state->ldata[i].job_in_lane->len -= len; + state->ldata[i].job_in_lane->buffer += len << 2; + } + } + + return lane_idx; + } else { + for (i = 0; i < SM3_MAX_LANES; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + len = state->lens[i] & (~0xf); + sm3_mb_asimd_x1(state->ldata[i].job_in_lane, len >> 4); + state->lens[i] -= len; + 
state->ldata[i].job_in_lane->len -= len; + state->ldata[i].job_in_lane->buffer += len << 2; + return i; + } + } + } + return -1; + +} + +static SM3_JOB *sm3_mb_mgr_free_lane(SM3_MB_JOB_MGR * state) +{ + int i; + SM3_JOB *ret = NULL; + + for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) { + if (LANE_IS_FINISHED(state, i)) { + + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->num_lanes_inuse--; + ret = state->ldata[i].job_in_lane; + ret->status = STS_COMPLETED; + state->ldata[i].job_in_lane = NULL; + break; + } + } + return ret; +} + +static void sm3_mb_mgr_insert_job(SM3_MB_JOB_MGR * state, SM3_JOB * job) +{ + int lane_idx; + //add job into lanes + lane_idx = state->unused_lanes & 0xf; + //fatal error + assert(lane_idx < SM3_MB_CE_MAX_LANES); + state->lens[lane_idx] = (job->len << 4) | lane_idx; + state->ldata[lane_idx].job_in_lane = job; + state->unused_lanes >>= 4; + state->num_lanes_inuse++; +} + +SM3_JOB *sm3_mb_mgr_submit_asimd(SM3_MB_JOB_MGR * state, SM3_JOB * job) +{ +#ifndef NDEBUG + int lane_idx; +#endif + SM3_JOB *ret; + + //add job into lanes + sm3_mb_mgr_insert_job(state, job); + + ret = sm3_mb_mgr_free_lane(state); + if (ret != NULL) { + return ret; + } + //submit will wait all lane has data + if (state->num_lanes_inuse < SM3_MB_CE_MAX_LANES) + return NULL; +#ifndef NDEBUG + lane_idx = sm3_mb_mgr_do_jobs(state); + assert(lane_idx != -1); +#else + sm3_mb_mgr_do_jobs(state); +#endif + + //~ i = lane_idx; + ret = sm3_mb_mgr_free_lane(state); + return ret; +} + +SM3_JOB *sm3_mb_mgr_flush_asimd(SM3_MB_JOB_MGR * state) +{ + SM3_JOB *ret; + ret = sm3_mb_mgr_free_lane(state); + if (ret) { + return ret; + } + + sm3_mb_mgr_do_jobs(state); + return sm3_mb_mgr_free_lane(state); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c new file mode 100644 index 000000000..a7178e0be --- /dev/null +++ 
b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c @@ -0,0 +1,250 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <stddef.h> +#include <sm3_mb.h> +#include <assert.h> + +#ifndef max +#define max(a,b) (((a) > (b)) ? (a) : (b)) +#endif + +#ifndef min +#define min(a,b) (((a) < (b)) ? 
(a) : (b))
+#endif
+
+#define SM3_MB_CE_MAX_LANES 4
+#if SM3_MB_CE_MAX_LANES >=4
+void sm3_mb_sm_x4(SM3_JOB *, SM3_JOB *, SM3_JOB *, SM3_JOB *, int);
+#endif
+#if SM3_MB_CE_MAX_LANES >=3
+void sm3_mb_sm_x3(SM3_JOB *, SM3_JOB *, SM3_JOB *, int);
+#endif
+#if SM3_MB_CE_MAX_LANES >=2
+void sm3_mb_sm_x2(SM3_JOB *, SM3_JOB *, int);
+#endif
+void sm3_mb_sm_x1(SM3_JOB *, int);
+
+// Lane-state predicates.  Each lens[i] packs (remaining_blocks << 4) | lane_index
+// (see sm3_mb_mgr_insert_job), so masking with ~0xf tests "has data left" while
+// the low nibble carries the lane id.  An unused high lane is marked lens[i]==0xf
+// with no job attached (see sm3_mb_mgr_init_sm).
+#define LANE_IS_NOT_FINISHED(state,i) \
+	(((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+	(((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+	(((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+	(((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+// Reset the job manager: no jobs in any lane, and unused_lanes holds a
+// nibble-stack of the free CE lane ids (0..SM3_MB_CE_MAX_LANES-1) with a
+// 0xf sentinel shifted to the top; insert_job pops from the low nibble.
+void sm3_mb_mgr_init_sm(SM3_MB_JOB_MGR * state)
+{
+	unsigned int i;
+
+	state->unused_lanes = 0xf;
+	state->num_lanes_inuse = 0;
+	for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+		state->unused_lanes <<= 4;
+		state->unused_lanes |= SM3_MB_CE_MAX_LANES - 1 - i;
+		state->lens[i] = i;
+		state->ldata[i].job_in_lane = 0;
+	}
+
+	//lanes > SM3_MB_CE_MAX_LANES is invalid lane
+	for (; i < SM3_MAX_LANES; i++) {
+		state->lens[i] = 0xf;
+		state->ldata[i].job_in_lane = 0;
+	}
+}
+
+// Run the SM3 CE kernel over all in-use lanes for the minimum remaining
+// length among them, so the shortest job completes.  Returns the lane index
+// of that shortest job (taken from the low nibble of the packed min length),
+// or -1 when no lane holds work.  Dispatches to the widest x4/x3/x2/x1
+// kernel matching the number of active lanes.
+static int sm3_mb_mgr_do_jobs(SM3_MB_JOB_MGR * state)
+{
+	int lane_idx, len, i, lanes;
+
+	int lane_idx_array[SM3_MAX_LANES];
+
+	if (state->num_lanes_inuse == 0) {
+		return -1;
+	}
+#if SM3_MB_CE_MAX_LANES == 4
+	if (state->num_lanes_inuse == 4) {
+		// Fast path: all four lanes busy, lanes 0..3 are exactly the
+		// active set.  min() over packed lens yields both the smallest
+		// block count (high bits) and its lane id (low nibble).
+		len = min(min(state->lens[0], state->lens[1]),
+			  min(state->lens[2], state->lens[3]));
+		lane_idx = len & 0xf;
+		len &= ~0xf;
+
+		sm3_mb_sm_x4(state->ldata[0].job_in_lane,
+			     state->ldata[1].job_in_lane,
+			     state->ldata[2].job_in_lane,
+			     state->ldata[3].job_in_lane, len >> 4);
+
+	} else
+#elif SM3_MB_CE_MAX_LANES == 3
+	if (state->num_lanes_inuse == 3) {
+		len = min(min(state->lens[0], state->lens[1]), state->lens[2]);
+		lane_idx = len & 0xf;
+		len &= ~0xf;
+
+
sm3_mb_sm_x3(state->ldata[0].job_in_lane,
+			     state->ldata[1].job_in_lane,
+			     state->ldata[2].job_in_lane, len >> 4);
+
+	} else
+#elif SM3_MB_CE_MAX_LANES == 2
+	if (state->num_lanes_inuse == 2) {
+		len = min(state->lens[0], state->lens[1]);
+		lane_idx = len & 0xf;
+		len &= ~0xf;
+		sm3_mb_sm_x2(state->ldata[0].job_in_lane,
+			     state->ldata[1].job_in_lane, len >> 4);
+
+	} else
+#endif
+	{
+		// General path (lanes partially occupied): gather the indices
+		// of unfinished lanes and the minimum packed length among them.
+		lanes = 0, len = 0;
+		for (i = 0; i < SM3_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+			if (LANE_IS_NOT_FINISHED(state, i)) {
+				if (lanes)
+					len = min(len, state->lens[i]);
+				else
+					len = state->lens[i];
+				lane_idx_array[lanes] = i;
+				lanes++;
+			}
+		}
+		if (lanes == 0)
+			return -1;
+		lane_idx = len & 0xf;
+		len = len & (~0xf);
+#if SM3_MB_CE_MAX_LANES >=4
+		if (lanes == 4) {
+			sm3_mb_sm_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+				     state->ldata[lane_idx_array[1]].job_in_lane,
+				     state->ldata[lane_idx_array[2]].job_in_lane,
+				     state->ldata[lane_idx_array[3]].job_in_lane, len >> 4);
+		} else
+#endif
+#if SM3_MB_CE_MAX_LANES >=3
+		if (lanes == 3) {
+			sm3_mb_sm_x3(state->ldata[lane_idx_array[0]].job_in_lane,
+				     state->ldata[lane_idx_array[1]].job_in_lane,
+				     state->ldata[lane_idx_array[2]].job_in_lane, len >> 4);
+		} else
+#endif
+#if SM3_MB_CE_MAX_LANES >=2
+		if (lanes == 2) {
+			sm3_mb_sm_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+				     state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+		} else
+#endif
+		{
+			sm3_mb_sm_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+		}
+	}
+	//only return the min length job
+	// Account the processed blocks in every active lane: len is
+	// (blocks << 4), so len >> 4 blocks were hashed and the data pointer
+	// advances len << 2 == blocks * 64 bytes (one SM3 block is 64 bytes).
+	for (i = 0; i < SM3_MAX_LANES; i++) {
+		if (LANE_IS_NOT_FINISHED(state, i)) {
+			state->lens[i] -= len;
+			state->ldata[i].job_in_lane->len -= len;
+			state->ldata[i].job_in_lane->buffer += len << 2;
+		}
+	}
+
+	return lane_idx;
+
+}
+
+// Release the first finished lane, if any: push its lane id back on the
+// unused_lanes nibble-stack, mark the job STS_COMPLETED and return it.
+// Returns NULL when no lane has a finished job.
+static SM3_JOB *sm3_mb_mgr_free_lane(SM3_MB_JOB_MGR * state)
+{
+	int i;
+	SM3_JOB *ret = NULL;
+
+	for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+		if (LANE_IS_FINISHED(state, i)) {
+
+			state->unused_lanes <<= 4;
+
state->unused_lanes |= i;
+			state->num_lanes_inuse--;
+			ret = state->ldata[i].job_in_lane;
+			ret->status = STS_COMPLETED;
+			state->ldata[i].job_in_lane = NULL;
+			break;
+		}
+	}
+	return ret;
+}
+
+// Claim the next free lane (low nibble of unused_lanes) for 'job'.
+// Caller must guarantee a lane is free; hitting the 0xf sentinel here means
+// the manager was over-filled, hence the assert.  job->len is in 64-byte
+// blocks and is packed together with the lane id into lens[].
+static void sm3_mb_mgr_insert_job(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+	int lane_idx;
+	//add job into lanes
+	lane_idx = state->unused_lanes & 0xf;
+	//fatal error
+	assert(lane_idx < SM3_MB_CE_MAX_LANES);
+	state->lens[lane_idx] = (job->len << 4) | lane_idx;
+	state->ldata[lane_idx].job_in_lane = job;
+	state->unused_lanes >>= 4;
+	state->num_lanes_inuse++;
+}
+
+// Submit a job to the multi-buffer manager.  The job is queued into a lane;
+// hashing is deferred until all CE lanes are occupied, at which point one
+// round of do_jobs runs and the completed job (if any) is returned.
+// Returns NULL when no job has completed yet.
+SM3_JOB *sm3_mb_mgr_submit_sm(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+#ifndef NDEBUG
+	int lane_idx;
+#endif
+	SM3_JOB *ret;
+
+	//add job into lanes
+	sm3_mb_mgr_insert_job(state, job);
+
+	ret = sm3_mb_mgr_free_lane(state);
+	if (ret != NULL) {
+		return ret;
+	}
+	//submit will wait all lane has data
+	if (state->num_lanes_inuse < SM3_MB_CE_MAX_LANES)
+		return NULL;
+#ifndef NDEBUG
+	lane_idx = sm3_mb_mgr_do_jobs(state);
+	assert(lane_idx != -1);
+#else
+	sm3_mb_mgr_do_jobs(state);
+#endif
+
+	ret = sm3_mb_mgr_free_lane(state);
+	return ret;
+}
+
+// Drain the manager: return an already-finished job if one exists, otherwise
+// run one round over the remaining lanes and return whichever job finished.
+// Returns NULL once all lanes are empty.
+SM3_JOB *sm3_mb_mgr_flush_sm(SM3_MB_JOB_MGR * state)
+{
+	SM3_JOB *ret;
+	ret = sm3_mb_mgr_free_lane(state);
+	if (ret) {
+		return ret;
+	}
+
+	sm3_mb_mgr_do_jobs(state);
+	return sm3_mb_mgr_free_lane(state);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S
new file mode 100644
index 000000000..836bd9ccc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S
@@ -0,0 +1,36 @@
+/**********************************************************************
 Copyright(c) 2020 Arm Corporation All rights reserved.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + + +#include "aarch64_multibinary.h" + + +mbin_interface sm3_ctx_mgr_submit +mbin_interface sm3_ctx_mgr_init +mbin_interface sm3_ctx_mgr_flush diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S new file mode 100644 index 000000000..f92ac5e9f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S @@ -0,0 +1,237 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8.2-a+sm4 + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name\() .req q\reg + v\name\() .req v\reg + s\name\() .req s\reg +.endm + +.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + ext v\msg4\().16b, v\msg1\().16b, v\msg2\().16b, #12 + ext v\tmp0\().16b, v\msg0\().16b, v\msg1\().16b, #12 + ext v\tmp1\().16b, v\msg2\().16b, v\msg3\().16b, #8 + sm3partw1 v\msg4\().4s, v\msg0\().4s, v\msg3\().4s + sm3partw2 v\msg4\().4s, v\tmp1\().4s, v\tmp0\().4s + +.endm + +.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req + eor v\tmp0\().16b, v\msg0\().16b, v\msg1\().16b + + + sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[0] + sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[0] + + sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[1] + sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[1] + + sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[2] + sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[2] + + 
sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[3] + sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[3] + +.endm + +.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1 + quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1 +.endm + job .req x0 + len .req x1 + data .req x2 + digest .req x0 + end_ptr .req x1 + + + declare_var_vector_reg msg0,0 + declare_var_vector_reg msg1,1 + declare_var_vector_reg msg2,2 + declare_var_vector_reg msg3,3 + declare_var_vector_reg msg4,4 + declare_var_vector_reg dig0,5 + declare_var_vector_reg dig1,6 + declare_var_vector_reg backup_dig0, 7 + + declare_var_vector_reg tmp0,16 + declare_var_vector_reg tmp1,17 + declare_var_vector_reg backup_dig1, 18 + + declare_var_vector_reg const0,19 + declare_var_vector_reg const1,20 + declare_var_vector_reg const2,21 + declare_var_vector_reg const3,22 + declare_var_vector_reg const4,23 + declare_var_vector_reg const5,24 + declare_var_vector_reg const6,25 + declare_var_vector_reg const7,26 + declare_var_vector_reg const8,27 + declare_var_vector_reg const9,28 + declare_var_vector_reg const10,29 + declare_var_vector_reg const11,30 + + + + + .global sm3_mb_sm_x1 + .type sm3_mb_sm_x1, %function +sm3_mb_sm_x1: + adrp x3,.consts + ldr data, [job],64 + add x3,x3,:lo12:.consts + ldp qdig0,qdig1,[digest] + ld1 {vconst0.16b-vconst3.16b},[x3],64 + add end_ptr,data,len,lsl 6 + ld1 {vconst4.16b-vconst7.16b},[x3],64 + //rev128 + ext vdig0.16b,vdig0.16b,vdig0.16b,#8 + ext vdig1.16b,vdig1.16b,vdig1.16b,#8 + ld1 {vconst8.16b-vconst11.16b},[x3],64 + rev64 vdig0.16b,vdig0.16b + rev64 vdig1.16b,vdig1.16b + + +start_loop: + mov vbackup_dig0.16b,vdig0.16b + mov vbackup_dig1.16b,vdig1.16b + ldp qmsg0,qmsg1,[data],32 + ldp qmsg2,qmsg3,[data],32 + + // big-endian to 
little-endian + rev32 vmsg0.16b,vmsg0.16b + rev32 vmsg1.16b,vmsg1.16b + rev32 vmsg2.16b,vmsg2.16b + rev32 vmsg3.16b,vmsg3.16b + + quad_round_expand a, const0, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand a, const1, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + quad_round_expand a, const2, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + quad_round_expand a, const3, dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + quad_round_expand b, const4, dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + quad_round_expand b, const5, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand b, const6, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + quad_round_expand b, const7, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + quad_round_expand b, const8, dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + quad_round_expand b, const9, dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + quad_round_expand b, const4, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + + + quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1 + cmp data,end_ptr + quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1 + quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1 + + eor vdig0.16b,vdig0.16b,vbackup_dig0.16b + eor vdig1.16b,vdig1.16b,vbackup_dig1.16b + + + bcc start_loop + + //rev128 + ext vdig0.16b,vdig0.16b,vdig0.16b,#8 + ext vdig1.16b,vdig1.16b,vdig1.16b,#8 + rev64 vdig0.16b,vdig0.16b + rev64 vdig1.16b,vdig1.16b + str qdig0,[digest] + str qdig1,[digest,16] + ret + dsb ish + isb + .align 2 +.consts: + .word 0xce6228cb // 3 + .word 0xe7311465 // 2 + .word 0xf3988a32 // 1 + .word 0x79cc4519 // 0 + .word 0xe6228cbc // 7 + .word 0x7311465e // 6 + .word 0x3988a32f // 5 + .word 0x9cc45197 // 4 + .word 0x6228cbce //11 + .word 
0x311465e7 //10 + .word 0x988a32f3 // 9 + .word 0xcc451979 // 8 + .word 0x228cbce6 //15 + .word 0x11465e73 //14 + .word 0x88a32f39 //13 + .word 0xc451979c //12 + .word 0xec53d43c //19 + .word 0x7629ea1e //18 + .word 0x3b14f50f //17 + .word 0x9d8a7a87 //16 + .word 0xc53d43ce //23 + .word 0x629ea1e7 //22 + .word 0xb14f50f3 //21 + .word 0xd8a7a879 //20 + .word 0x53d43cec //27 + .word 0x29ea1e76 //26 + .word 0x14f50f3b //25 + .word 0x8a7a879d //24 + .word 0x3d43cec5 //31 + .word 0x9ea1e762 //30 + .word 0x4f50f3b1 //29 + .word 0xa7a879d8 //28 + .word 0xd43cec53 //35 + .word 0xea1e7629 //34 + .word 0xf50f3b14 //33 + .word 0x7a879d8a //32 + .word 0x43cec53d //39 + .word 0xa1e7629e //38 + .word 0x50f3b14f //37 + .word 0xa879d8a7 //36 + .word 0x3cec53d4 //43 + .word 0x1e7629ea //42 + .word 0x0f3b14f5 //41 + .word 0x879d8a7a //40 + .word 0xcec53d43 //47 + .word 0xe7629ea1 //46 + .word 0xf3b14f50 //45 + .word 0x79d8a7a8 //44 + .word 0xec53d43c //51 + .word 0x7629ea1e //50 + .word 0x3b14f50f //49 + + + .size sm3_mb_sm_x1, .-sm3_mb_sm_x1 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S new file mode 100644 index 000000000..4e4a6e738 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S @@ -0,0 +1,344 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8.2-a+sm4 + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name\() .req q\reg + v\name\() .req v\reg + s\name\() .req s\reg +.endm + +.macro do_ext job,arg0,arg1,arg2,arg3 + ext v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b,\arg3 +.endm +.macro do_sm3partw1 job,msg4,msg0,msg3 + sm3partw1 v\job\()_\msg4\().4s, v\job\()_\msg0\().4s, v\job\()_\msg3\().4s +.endm +.macro do_sm3partw2 job,msg4,tmp1,tmp0 + sm3partw2 v\job\()_\msg4\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s +.endm + +.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + .irp j,0,1 + do_ext job\j,\msg4,\msg1,\msg2,#12 + .endr + .irp j,0,1 + do_ext job\j,\tmp0,\msg0,\msg1,#12 + .endr + .irp j,0,1 + do_ext job\j,\tmp1,\msg2,\msg3,#8 + .endr + + .irp j,0,1 + do_sm3partw1 job\j,\msg4, \msg0, \msg3 + .endr + .irp j,0,1 + do_sm3partw2 job\j,\msg4, \tmp1, \tmp0 + .endr + +.endm + 
+.macro do_eor job,arg0,arg1,arg2 + eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b +.endm +.macro do_sm3ss1 job,tmp1,dig0,dig1,const + sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s +.endm + +.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane + sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane] + +.endm +.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane + sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane] +.endm + +.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req + .irp j,0,1 + do_eor job\j,\tmp0,\msg0,\msg1 + .endr + .irp lane,0,1,2,3 + .irp j,0,1 + do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const + .endr + + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + .irp j,0,1 + do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane + .endr + .irp j,0,1 + do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane + .endr + .endr +.endm + +.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1 + quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1 +.endm + +/* + Variables +*/ + job0 .req x0 + job1 .req x1 + len .req x2 + + job0_data .req x3 + job1_data .req x4 + job0_digest .req x0 + job1_digest .req x1 + + const_adr .req x5 + end_ptr .req x2 + + declare_var_vector_reg job0_msg0, 0 + declare_var_vector_reg job0_msg1, 1 + declare_var_vector_reg job0_msg2, 2 + declare_var_vector_reg job0_msg3, 3 + declare_var_vector_reg job0_msg4, 4 + declare_var_vector_reg job0_dig0, 5 + declare_var_vector_reg job0_dig1, 6 + declare_var_vector_reg job0_tmp0, 7 + declare_var_vector_reg job0_tmp1, 8 + declare_var_vector_reg job0_backup_dig0, 9 + declare_var_vector_reg job0_backup_dig1, 10 + + declare_var_vector_reg job1_msg0, 11 + declare_var_vector_reg job1_msg1, 12 + declare_var_vector_reg job1_msg2, 13 + declare_var_vector_reg job1_msg3, 14 
+ declare_var_vector_reg job1_msg4, 15 + declare_var_vector_reg job1_dig0, 16 + declare_var_vector_reg job1_dig1, 17 + declare_var_vector_reg job1_tmp0, 18 + declare_var_vector_reg job1_tmp1, 19 + declare_var_vector_reg job1_backup_dig0, 20 + declare_var_vector_reg job1_backup_dig1, 21 + + declare_var_vector_reg const0, 22 + declare_var_vector_reg const1, 23 + declare_var_vector_reg const2, 24 + declare_var_vector_reg const3, 25 + declare_var_vector_reg const4, 26 + declare_var_vector_reg const5, 27 + declare_var_vector_reg const6, 28 + declare_var_vector_reg const7, 29 + declare_var_vector_reg const8, 30 + declare_var_vector_reg const9, 31 + declare_var_vector_reg const10, 22 + declare_var_vector_reg const11, 23 + +.macro do_rev32_msg job:req,msg:req + rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b +.endm +.macro do_rev32_job job:req + .irp m,0,1,2,3 + do_rev32_msg \job,msg\m + .endr +.endm +.macro rev32_msgs + .irp j,0,1 + do_rev32_job job\j + .endr +.endm + + + .global sm3_mb_sm_x2 + .type sm3_mb_sm_x2, %function +sm3_mb_sm_x2: + //push d8~d15 + stp d8,d9,[sp,-192]! 
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + + + adrp const_adr,.consts + ldr job0_data, [job0],64 + add const_adr,const_adr,:lo12:.consts + ldr job1_data, [job1],64 + ldp qjob0_dig0,qjob0_dig1,[job0_digest] + ldp qjob1_dig0,qjob1_dig1,[job1_digest] + + ldp qconst2,qconst3,[const_adr,32] + ldp qconst4,qconst5,[const_adr,64] + ldp qconst6,qconst7,[const_adr,96] + ldp qconst8,qconst9,[const_adr,128] + add end_ptr,job0_data,len,lsl 6 + + //rev128 + ext vjob0_dig0.16b,vjob0_dig0.16b,vjob0_dig0.16b,#8 + ext vjob0_dig1.16b,vjob0_dig1.16b,vjob0_dig1.16b,#8 + rev64 vjob0_dig0.16b,vjob0_dig0.16b + rev64 vjob0_dig1.16b,vjob0_dig1.16b + ext vjob1_dig0.16b,vjob1_dig0.16b,vjob1_dig0.16b,#8 + ext vjob1_dig1.16b,vjob1_dig1.16b,vjob1_dig1.16b,#8 + rev64 vjob1_dig0.16b,vjob1_dig0.16b + rev64 vjob1_dig1.16b,vjob1_dig1.16b + + + + + +start_loop: + + ld1 {vjob0_msg0.16b-vjob0_msg3.16b},[job0_data],64 + ld1 {vjob1_msg0.16b-vjob1_msg3.16b},[job1_data],64 + + mov vjob0_backup_dig0.16b,vjob0_dig0.16b + mov vjob0_backup_dig1.16b,vjob0_dig1.16b + mov vjob1_backup_dig0.16b,vjob1_dig0.16b + mov vjob1_backup_dig1.16b,vjob1_dig1.16b + + // const10,const11,const0,const1 share registers + ldp qconst0,qconst1,[const_adr] + + // big-endian to little-endian + rev32_msgs + + cmp job0_data,end_ptr + quad_round_expand a, const0 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + + + quad_round_expand a, const1 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + // const10,const11,const0,const1 share registers + ldp qconst10,qconst11,[const_adr,160] + quad_round_expand a, const2 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + quad_round_expand a, const3 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + quad_round_expand b, const4 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + quad_round_expand b, const5 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand b, const6 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + 
quad_round_expand b, const7 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + quad_round_expand b, const8 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + quad_round_expand b, const9 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + quad_round_expand b, const4 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + + + quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1 + + quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1 + quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1 + + eor vjob0_dig0.16b,vjob0_dig0.16b,vjob0_backup_dig0.16b + eor vjob0_dig1.16b,vjob0_dig1.16b,vjob0_backup_dig1.16b + eor vjob1_dig0.16b,vjob1_dig0.16b,vjob1_backup_dig0.16b + eor vjob1_dig1.16b,vjob1_dig1.16b,vjob1_backup_dig1.16b + + + bcc start_loop + + //rev128 + ext vjob0_dig0.16b,vjob0_dig0.16b,vjob0_dig0.16b,#8 + ext vjob0_dig1.16b,vjob0_dig1.16b,vjob0_dig1.16b,#8 + rev64 vjob0_dig0.16b,vjob0_dig0.16b + rev64 vjob0_dig1.16b,vjob0_dig1.16b + stp qjob0_dig0,qjob0_dig1,[job0_digest] + + ext vjob1_dig0.16b,vjob1_dig0.16b,vjob1_dig0.16b,#8 + ext vjob1_dig1.16b,vjob1_dig1.16b,vjob1_dig1.16b,#8 + rev64 vjob1_dig0.16b,vjob1_dig0.16b + rev64 vjob1_dig1.16b,vjob1_dig1.16b + stp qjob1_dig0,qjob1_dig1,[job1_digest] + +#if 1 + mov v0.16b,vjob1_dig0.16b + mov v1.16b,vjob1_dig1.16b + b exit_ret +#endif + +exit_ret: + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], 192 + ret + + .align 2 +.consts: + .word 0xce6228cb // 3 + .word 0xe7311465 // 2 + .word 0xf3988a32 // 1 + .word 0x79cc4519 // 0 + .word 0xe6228cbc // 7 + .word 0x7311465e // 6 + .word 0x3988a32f // 5 + .word 0x9cc45197 // 4 + .word 0x6228cbce //11 + .word 0x311465e7 //10 + .word 0x988a32f3 // 9 + .word 0xcc451979 // 8 + .word 0x228cbce6 //15 + .word 0x11465e73 //14 + .word 0x88a32f39 //13 + .word 
0xc451979c //12 + .word 0xec53d43c //19 + .word 0x7629ea1e //18 + .word 0x3b14f50f //17 + .word 0x9d8a7a87 //16 + .word 0xc53d43ce //23 + .word 0x629ea1e7 //22 + .word 0xb14f50f3 //21 + .word 0xd8a7a879 //20 + .word 0x53d43cec //27 + .word 0x29ea1e76 //26 + .word 0x14f50f3b //25 + .word 0x8a7a879d //24 + .word 0x3d43cec5 //31 + .word 0x9ea1e762 //30 + .word 0x4f50f3b1 //29 + .word 0xa7a879d8 //28 + .word 0xd43cec53 //35 + .word 0xea1e7629 //34 + .word 0xf50f3b14 //33 + .word 0x7a879d8a //32 + .word 0x43cec53d //39 + .word 0xa1e7629e //38 + .word 0x50f3b14f //37 + .word 0xa879d8a7 //36 + .word 0x3cec53d4 //43 + .word 0x1e7629ea //42 + .word 0x0f3b14f5 //41 + .word 0x879d8a7a //40 + .word 0xcec53d43 //47 + .word 0xe7629ea1 //46 + .word 0xf3b14f50 //45 + .word 0x79d8a7a8 //44 + .word 0xec53d43c //51 + .word 0x7629ea1e //50 + .word 0x3b14f50f //49 + + + .size sm3_mb_sm_x2, .-sm3_mb_sm_x2 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S new file mode 100644 index 000000000..58758f98d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S @@ -0,0 +1,368 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + //dsdf + .arch armv8.2-a+sm4 + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name\() .req q\reg + v\name\() .req v\reg + s\name\() .req s\reg +.endm + +.macro do_ext job,arg0,arg1,arg2,arg3 + ext vjob\job\()_\arg0\().16b,vjob\job\()_\arg1\().16b,vjob\job\()_\arg2\().16b,\arg3 +.endm +.macro do_sm3partw1 job,msg4,msg0,msg3 + sm3partw1 vjob\job\()_\msg4\().4s, vjob\job\()_\msg0\().4s, vjob\job\()_\msg3\().4s +.endm +.macro do_sm3partw2 job,msg4,tmp1,tmp0 + sm3partw2 vjob\job\()_\msg4\().4s, vjob\job\()_\tmp1\().4s, vjob\job\()_\tmp0\().4s +.endm + +.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + .irp j,0,1,2 + do_ext \j,\msg4,\msg1,\msg2,#12 + .endr + .irp j,0,1,2 + do_ext \j,\tmp0,\msg0,\msg1,#12 + .endr + .irp j,0,1,2 + do_ext \j,\tmp1,\msg2,\msg3,#8 + .endr + + .irp j,0,1,2 + do_sm3partw1 \j,\msg4, \msg0, \msg3 + .endr + .irp j,0,1,2 + do_sm3partw2 \j,\msg4, \tmp1, \tmp0 + .endr + +.endm + +.macro do_eor job,arg0,arg1,arg2 + eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b +.endm +.macro do_sm3ss1 job,tmp1,dig0,dig1,const + 
sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s +.endm + +.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane + sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane] + +.endm +.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane + sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane] +.endm +.macro do_ld_backup_digest job + ldp qjob\job\()_backup_dig0,qjob\job\()_backup_dig1,[sp,job\job\()_dig_off] +.endm + +.macro do_st_digest job + stp qjob\job\()_dig0,qjob\job\()_dig1,[job\job\()_digest] +.endm +.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req,load_digest + .irp j,0,1,2 + do_eor job\j,\tmp0,\msg0,\msg1 + .ifnb \load_digest + do_ld_backup_digest \j + .endif + .endr + .irp lane,0,1,2,3 + .irp j,0,1,2 + do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const + .endr + + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + .irp j,0,1,2 + do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane + .endr + .irp j,0,1,2 + do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane + .endr + + .endr +.endm + +.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1 + quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1 +.endm + +/* + Variables +*/ + job0 .req x0 + job1 .req x1 + job2 .req x2 + len .req x3 + + job0_data .req x4 + job1_data .req x5 + job2_data .req x6 + job0_digest .req x0 + job1_digest .req x1 + job2_digest .req x2 + + const_adr .req x7 + end_ptr .req x3 + + declare_var_vector_reg job0_msg0, 0 + declare_var_vector_reg job0_msg1, 1 + declare_var_vector_reg job0_msg2, 2 + declare_var_vector_reg job0_msg3, 3 + declare_var_vector_reg job0_msg4, 4 + declare_var_vector_reg job0_dig0, 5 + declare_var_vector_reg job0_dig1, 6 + declare_var_vector_reg job0_tmp0, 7 + declare_var_vector_reg job0_tmp1, 8 + .set job0_dig_off, 64 + declare_var_vector_reg 
job0_backup_dig0, 2 + declare_var_vector_reg job0_backup_dig1, 3 + + declare_var_vector_reg job1_msg0, 9 + declare_var_vector_reg job1_msg1, 10 + declare_var_vector_reg job1_msg2, 11 + declare_var_vector_reg job1_msg3, 12 + declare_var_vector_reg job1_msg4, 13 + declare_var_vector_reg job1_dig0, 14 + declare_var_vector_reg job1_dig1, 15 + declare_var_vector_reg job1_tmp0, 16 + declare_var_vector_reg job1_tmp1, 17 + .set job1_dig_off, 96 + declare_var_vector_reg job1_backup_dig0, 11 + declare_var_vector_reg job1_backup_dig1, 12 + + declare_var_vector_reg job2_msg0, 18 + declare_var_vector_reg job2_msg1, 19 + declare_var_vector_reg job2_msg2, 20 + declare_var_vector_reg job2_msg3, 21 + declare_var_vector_reg job2_msg4, 22 + declare_var_vector_reg job2_dig0, 23 + declare_var_vector_reg job2_dig1, 24 + declare_var_vector_reg job2_tmp0, 25 + declare_var_vector_reg job2_tmp1, 26 + .set job2_dig_off, 128 + declare_var_vector_reg job2_backup_dig0, 20 + declare_var_vector_reg job2_backup_dig1, 21 + + + declare_var_vector_reg const0, 27 + declare_var_vector_reg const1, 28 + declare_var_vector_reg const2, 29 + declare_var_vector_reg const3, 30 + declare_var_vector_reg const4, 27 + declare_var_vector_reg const5, 28 + declare_var_vector_reg const6, 29 + declare_var_vector_reg const7, 30 + declare_var_vector_reg const8, 27 + declare_var_vector_reg const9, 28 + declare_var_vector_reg const10, 29 + declare_var_vector_reg const11, 30 + +.macro do_rev32_msg job:req,msg:req + rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b +.endm +.macro do_rev32_job job:req + .irp m,0,1,2,3 + do_rev32_msg \job,msg\m + .endr +.endm +.macro rev32_msgs + .irp j,0,1,2 + do_rev32_job job\j + .endr +.endm + +.macro do_rev64 job,regd,regn + rev64 vjob\job\()_\regd\().16b,vjob\job\()_\regd\().16b +.endm + + .global sm3_mb_sm_x3 + .type sm3_mb_sm_x3, %function +sm3_mb_sm_x3: + //push d8~d15 + stp d8,d9,[sp,-192]! 
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + + + adrp const_adr,.consts + ldr job0_data, [job0],64 + add const_adr,const_adr,:lo12:.consts + ldr job1_data, [job1],64 + ldr job2_data, [job2],64 + + ldp qjob0_dig0,qjob0_dig1,[job0_digest] + ldp qjob1_dig0,qjob1_dig1,[job1_digest] + ldp qjob2_dig0,qjob2_dig1,[job2_digest] + ld1 {vconst0.16b-vconst3.16b},[const_adr] + add end_ptr,job0_data,len,lsl 6 + + //rev128 + .irp j,0,1,2 + do_ext \j,dig0,dig0,dig0,#8 + do_ext \j,dig1,dig1,dig1,#8 + do_rev64 \j,dig0,dig0 + do_rev64 \j,dig1,dig1 + .endr + + + + + +start_loop: + + ld1 {vjob0_msg0.16b-vjob0_msg3.16b},[job0_data],64 + stp qjob0_dig0,qjob0_dig1,[sp,job0_dig_off] + ld1 {vjob1_msg0.16b-vjob1_msg3.16b},[job1_data],64 + stp qjob1_dig0,qjob1_dig1,[sp,job1_dig_off] + ld1 {vjob2_msg0.16b-vjob2_msg3.16b},[job2_data],64 + stp qjob2_dig0,qjob2_dig1,[sp,job2_dig_off] + + cmp job0_data,end_ptr + + // big-endian to little-endian + rev32_msgs + + quad_round_expand a, const0 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand a, const1 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + + ldp qconst4,qconst5,[const_adr,4*16] + quad_round_expand a, const2 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + quad_round_expand a, const3 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + + ldp qconst6,qconst7,[const_adr,6*16] + quad_round_expand b, const4 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + quad_round_expand b, const5 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + ldp qconst8,qconst9,[const_adr,8*16] + quad_round_expand b, const6 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + quad_round_expand b, const7 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + ldp qconst10,qconst11,[const_adr,10*16] + quad_round_expand b, const8 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + quad_round_expand b, const9 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + + ldp 
qconst4,qconst5,[const_adr,4*16] + quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + ldp qconst6,qconst7,[const_adr,6*16] + quad_round_expand b, const4 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + + quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1 + ldp qconst0,qconst1,[const_adr] + quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1 + + quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1,1 + ldp qconst2,qconst3,[const_adr,2*16] + + .irp j,0,1,2 + do_eor job\j,dig0,dig0,backup_dig0 + do_eor job\j,dig1,dig1,backup_dig1 + .endr + + bcc start_loop + + //rev128 + .irp j,0,1,2 + do_ext \j,dig0,dig0,dig0,#8 + do_ext \j,dig1,dig1,dig1,#8 + do_rev64 \j,dig0,dig0 + do_rev64 \j,dig1,dig1 + do_st_digest \j + .endr + + + +exit_ret: + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], 192 + ret + + .align 2 +.consts: + .word 0xce6228cb // 3 + .word 0xe7311465 // 2 + .word 0xf3988a32 // 1 + .word 0x79cc4519 // 0 + .word 0xe6228cbc // 7 + .word 0x7311465e // 6 + .word 0x3988a32f // 5 + .word 0x9cc45197 // 4 + .word 0x6228cbce //11 + .word 0x311465e7 //10 + .word 0x988a32f3 // 9 + .word 0xcc451979 // 8 + .word 0x228cbce6 //15 + .word 0x11465e73 //14 + .word 0x88a32f39 //13 + .word 0xc451979c //12 + .word 0xec53d43c //19 + .word 0x7629ea1e //18 + .word 0x3b14f50f //17 + .word 0x9d8a7a87 //16 + .word 0xc53d43ce //23 + .word 0x629ea1e7 //22 + .word 0xb14f50f3 //21 + .word 0xd8a7a879 //20 + .word 0x53d43cec //27 + .word 0x29ea1e76 //26 + .word 0x14f50f3b //25 + .word 0x8a7a879d //24 + .word 0x3d43cec5 //31 + .word 0x9ea1e762 //30 + .word 0x4f50f3b1 //29 + .word 0xa7a879d8 //28 + .word 0xd43cec53 //35 + .word 0xea1e7629 //34 + .word 0xf50f3b14 //33 + .word 0x7a879d8a //32 + .word 0x43cec53d //39 + .word 0xa1e7629e //38 + .word 0x50f3b14f //37 + .word 0xa879d8a7 //36 + .word 0x3cec53d4 //43 + .word 
0x1e7629ea //42 + .word 0x0f3b14f5 //41 + .word 0x879d8a7a //40 + .word 0xcec53d43 //47 + .word 0xe7629ea1 //46 + .word 0xf3b14f50 //45 + .word 0x79d8a7a8 //44 + .word 0xec53d43c //51 + .word 0x7629ea1e //50 + .word 0x3b14f50f //49 + + + .size sm3_mb_sm_x3, .-sm3_mb_sm_x3 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S new file mode 100644 index 000000000..7f3f1db66 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S @@ -0,0 +1,440 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8.2-a+sm4 + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name\() .req q\reg + v\name\() .req v\reg + s\name\() .req s\reg +.endm + +.macro do_ext job,arg0,arg1,arg2,arg3 + ext vjob\job\()_\arg0\().16b,vjob\job\()_\arg1\().16b,vjob\job\()_\arg2\().16b,\arg3 +.endm +.macro do_sm3partw1 job,msg4,msg0,msg3 + sm3partw1 vjob\job\()_\msg4\().4s, vjob\job\()_\msg0\().4s, vjob\job\()_\msg3\().4s +.endm +.macro do_sm3partw2 job,msg4,tmp1,tmp0 + sm3partw2 vjob\job\()_\msg4\().4s, vjob\job\()_\tmp1\().4s, vjob\job\()_\tmp0\().4s +.endm + +.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req + .irp j,0,1,2,3 + do_ext \j,\msg4,\msg1,\msg2,#12 + .endr + .irp j,0,1,2,3 + do_ext \j,\tmp0,\msg0,\msg1,#12 + .endr + .irp j,0,1,2,3 + do_ext \j,\tmp1,\msg2,\msg3,#8 + .endr + + .irp j,0,1,2,3 + do_sm3partw1 \j,\msg4, \msg0, \msg3 + .endr + .irp j,0,1,2,3 + do_sm3partw2 \j,\msg4, \tmp1, \tmp0 + .endr + st1 {vjob0_\msg4\().16b-vjob3_\msg4\().16b},[data_buf],64 +.endm + +.macro do_eor job,arg0,arg1,arg2 + eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b +.endm +.macro do_sm3ss1 job,tmp1,dig0,dig1,const + sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s +.endm + +.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane + sm3tt1\ab v\job\()_\dig0\().4s, 
v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane] + +.endm +.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane + sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane] +.endm +.macro do_ld_backup_digest job + ldp qjob\job\()_backup_dig0,qjob\job\()_backup_dig1,[sp,job\job\()_dig_off] +.endm + +.macro do_st_digest job + stp qjob\job\()_dig0,qjob\job\()_dig1,[job\job\()_digest] +.endm + +.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req,is_last + .ifnb \is_last + ld1 {vjob0_backup_dig0.16b-vjob3_backup_dig0.16b},[dig_buf],64 + .endif + + .irp j,0,1,2,3 + do_eor job\j,\tmp0,\msg0,\msg1 + + .endr + + .irp lane,0,1,2 + .irp j,0,1,2,3 + do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const + .endr + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + .irp j,0,1,2,3 + do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane + .endr + .irp j,0,1,2,3 + do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane + .endr + + + .endr + .irp j,0,1,2,3 + do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const + .endr + .ifnb \is_last + + ld1 {vjob0_backup_dig1.16b-vjob3_backup_dig1.16b},[dig_buf] + .else + ext v\const\().16b,v\const\().16b,v\const\().16b,12 + .endif + .irp j,0,1,2,3 + do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,3 + .endr + + .irp j,0,1,2,3 + do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,3 + .ifnb \is_last + do_eor job\j,dig1,dig1,backup_dig1 + do_eor job\j,dig0,dig0,backup_dig0 + .endif + .endr + + .ifb \is_last + ld1 {vjob0_\msg0\().16b-vjob3_\msg0\().16b},[data_buf],64 + .endif + +.endm + + + +/* + Variables +*/ + .set temp_buf_size,(68*4+32)*4 + .set dig_buf_off,64 + .set data_buf_off,64+32*4 + job0 .req x0 + job1 .req x1 + job2 .req x2 + job3 .req x3 + len .req x4 + + job0_data .req x5 + job1_data .req x6 + job2_data .req x7 + job3_data .req x9 + + job0_digest .req x0 + job1_digest .req x1 + job2_digest .req x2 + job3_digest .req x3 + + const_adr .req x10 + end_ptr .req x4 + data_buf .req x11 + dig_buf .req x12 + + declare_var_vector_reg job0_msg0, 0 + 
declare_var_vector_reg job1_msg0, 1 + declare_var_vector_reg job2_msg0, 2 + declare_var_vector_reg job3_msg0, 3 + declare_var_vector_reg job0_msg1, 4 + declare_var_vector_reg job1_msg1, 5 + declare_var_vector_reg job2_msg1, 6 + declare_var_vector_reg job3_msg1, 7 + declare_var_vector_reg job0_msg2, 8 + declare_var_vector_reg job1_msg2, 9 + declare_var_vector_reg job2_msg2, 10 + declare_var_vector_reg job3_msg2, 11 + declare_var_vector_reg job0_msg3, 12 + declare_var_vector_reg job1_msg3, 13 + declare_var_vector_reg job2_msg3, 14 + declare_var_vector_reg job3_msg3, 15 + declare_var_vector_reg job0_tmp0, 16 + declare_var_vector_reg job1_tmp0, 17 + declare_var_vector_reg job2_tmp0, 18 + declare_var_vector_reg job3_tmp0, 19 + declare_var_vector_reg job0_tmp1, 20 + declare_var_vector_reg job1_tmp1, 21 + declare_var_vector_reg job2_tmp1, 22 + declare_var_vector_reg job3_tmp1, 23 + declare_var_vector_reg job0_msg4, 24 + declare_var_vector_reg job1_msg4, 25 + declare_var_vector_reg job2_msg4, 26 + declare_var_vector_reg job3_msg4, 27 + declare_var_vector_reg job0_dig0, 8 + declare_var_vector_reg job1_dig0, 9 + declare_var_vector_reg job2_dig0, 10 + declare_var_vector_reg job3_dig0, 11 + declare_var_vector_reg job0_dig1, 12 + declare_var_vector_reg job1_dig1, 13 + declare_var_vector_reg job2_dig1, 14 + declare_var_vector_reg job3_dig1, 15 + + declare_var_vector_reg job0_backup_dig0, 24 + declare_var_vector_reg job1_backup_dig0, 25 + declare_var_vector_reg job2_backup_dig0, 26 + declare_var_vector_reg job3_backup_dig0, 27 + declare_var_vector_reg job0_backup_dig1, 28 + declare_var_vector_reg job1_backup_dig1, 29 + declare_var_vector_reg job2_backup_dig1, 30 + declare_var_vector_reg job3_backup_dig1, 31 + + declare_var_vector_reg const0, 24 + declare_var_vector_reg const1, 25 + declare_var_vector_reg const2, 26 + declare_var_vector_reg const3, 27 + declare_var_vector_reg const4, 28 + declare_var_vector_reg const5, 29 + declare_var_vector_reg const6, 30 + 
declare_var_vector_reg const7, 31 + declare_var_vector_reg const8, 24 + declare_var_vector_reg const9, 25 + declare_var_vector_reg const10, 26 + declare_var_vector_reg const11, 27 + +.macro do_rev32_msg job:req,msg:req + rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b +.endm + +.macro do_rev32_job job:req + .irp m,0,1,2,3 + do_rev32_msg \job,msg\m + .endr +.endm + +.macro rev32_msgs + .irp j,0,1,2,3 + do_rev32_job job\j + .endr +.endm + +.macro do_rev64 job,regd,regn + rev64 vjob\job\()_\regd\().16b,vjob\job\()_\regd\().16b +.endm + +.macro do_ldp_msg23 job + ldp qjob\job\()_msg2,qjob\job\()_msg3,[job\job\()_data],32 +.endm + + .global sm3_mb_sm_x4 + .type sm3_mb_sm_x4, %function +sm3_mb_sm_x4: + //push d8~d15 + sub sp,sp,temp_buf_size + stp d8,d9,[sp,-64]! + stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + + + + ldr job0_data, [job0],64 + ldr job1_data, [job1],64 + ldr job2_data, [job2],64 + ldr job3_data, [job3],64 + + ldp qjob0_dig0,qjob0_dig1,[job0_digest] + ldp qjob1_dig0,qjob1_dig1,[job1_digest] + ldp qjob2_dig0,qjob2_dig1,[job2_digest] + ldp qjob3_dig0,qjob3_dig1,[job3_digest] + add end_ptr,job0_data,len,lsl 6 + //rev128,change digest endian + .irp j,0,1,2,3 + do_ext \j,dig0,dig0,dig0,#8 + do_ext \j,dig1,dig1,dig1,#8 + do_rev64 \j,dig0,dig0 + do_rev64 \j,dig1,dig1 + .endr + + + + +start_loop: + add dig_buf,sp,dig_buf_off + ldp qjob0_msg0,qjob0_msg1,[job0_data],32 + add data_buf,sp,data_buf_off + ldp qjob1_msg0,qjob1_msg1,[job1_data],32 + st1 {vjob0_dig0.16b-vjob3_dig0.16b},[dig_buf],64 + ldp qjob2_msg0,qjob2_msg1,[job2_data],32 + st1 {vjob0_dig1.16b-vjob3_dig1.16b},[dig_buf] + ldp qjob3_msg0,qjob3_msg1,[job3_data],32 + + .irp j,0,1,2,3 + do_ldp_msg23 \j + do_rev32_msg job\j,msg0 + do_rev32_msg job\j,msg1 + .endr + st1 {vjob0_msg0.16b-vjob3_msg0.16b},[data_buf],64 + st1 {vjob0_msg1.16b-vjob3_msg1.16b},[data_buf],64 + .irp j,0,1,2,3 + do_rev32_msg job\j,msg2 + do_rev32_msg job\j,msg3 + .endr + st1 {vjob0_msg2.16b-vjob3_msg2.16b},[data_buf],64 
+ st1 {vjob0_msg3.16b-vjob3_msg3.16b},[data_buf],64 + + cmp job0_data,end_ptr + + /** message expand **/ + message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + message_expand msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + message_expand msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + message_expand msg3, msg4, msg0, msg1, msg2, tmp0, tmp1 + message_expand msg4, msg0, msg1, msg2, msg3, tmp0, tmp1 + message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1 + message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1 + message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1 + + /** re-init variables for sm3 rounds **/ + add dig_buf,sp,dig_buf_off + ld1 {vjob0_dig0.16b-vjob3_dig0.16b},[dig_buf],64 + add data_buf,sp,data_buf_off + ld1 {vjob0_dig1.16b-vjob3_dig1.16b},[dig_buf] + add dig_buf,sp,dig_buf_off + adrp const_adr,.consts + ld1 {vjob0_msg0.16b-vjob3_msg0.16b},[data_buf],64 + add const_adr,const_adr,:lo12:.consts + ld1 {vjob0_msg1.16b-vjob3_msg1.16b},[data_buf],64 + ld1 {vconst0.16b-vconst3.16b},[const_adr],64 + ld1 {vconst4.16b-vconst7.16b},[const_adr],64 + /** digests rounds **/ + quad_round a, const0 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round a, const1 , dig0, dig1, msg1, msg0, tmp0, tmp1 + quad_round a, const2 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round a, const3 , dig0, dig1, msg1, msg0, tmp0, tmp1 + + /** share registers with vconst0-vconst3 **/ + ld1 {vconst8.16b-vconst11.16b},[const_adr] + + quad_round b, const4 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round b, const5 , dig0, dig1, msg1, msg0, tmp0, tmp1 + quad_round b, const6 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round b, const7 , dig0, dig1, msg1, msg0, tmp0, tmp1 + quad_round b, const8 , dig0, dig1, msg0, 
msg1, tmp0, tmp1 + quad_round b, const9 , dig0, dig1, msg1, msg0, tmp0, tmp1 + quad_round b, const10, dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round b, const11, dig0, dig1, msg1, msg0, tmp0, tmp1 + quad_round b, const4 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round b, const5 , dig0, dig1, msg1, msg0, tmp0, tmp1 + quad_round b, const6 , dig0, dig1, msg0, msg1, tmp0, tmp1 + quad_round b, const7 , dig0, dig1, msg1, msg0, tmp0, tmp1,1 + + bcc start_loop + + //rev128 + .irp j,0,1,2,3 + do_ext \j,dig0,dig0,dig0,#8 + do_ext \j,dig1,dig1,dig1,#8 + do_rev64 \j,dig0,dig0 + do_rev64 \j,dig1,dig1 + do_st_digest \j + .endr + + + +exit_ret: + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], 64 + add sp,sp,temp_buf_size + ret + + .align 2 +.consts: + .word 0xce6228cb // 3 + .word 0xe7311465 // 2 + .word 0xf3988a32 // 1 + .word 0x79cc4519 // 0 + .word 0xe6228cbc // 7 + .word 0x7311465e // 6 + .word 0x3988a32f // 5 + .word 0x9cc45197 // 4 + .word 0x6228cbce //11 + .word 0x311465e7 //10 + .word 0x988a32f3 // 9 + .word 0xcc451979 // 8 + .word 0x228cbce6 //15 + .word 0x11465e73 //14 + .word 0x88a32f39 //13 + .word 0xc451979c //12 + .word 0xec53d43c //19 + .word 0x7629ea1e //18 + .word 0x3b14f50f //17 + .word 0x9d8a7a87 //16 + .word 0xc53d43ce //23 + .word 0x629ea1e7 //22 + .word 0xb14f50f3 //21 + .word 0xd8a7a879 //20 + .word 0x53d43cec //27 + .word 0x29ea1e76 //26 + .word 0x14f50f3b //25 + .word 0x8a7a879d //24 + .word 0x3d43cec5 //31 + .word 0x9ea1e762 //30 + .word 0x4f50f3b1 //29 + .word 0xa7a879d8 //28 + .word 0xd43cec53 //35 + .word 0xea1e7629 //34 + .word 0xf50f3b14 //33 + .word 0x7a879d8a //32 + .word 0x43cec53d //39 + .word 0xa1e7629e //38 + .word 0x50f3b14f //37 + .word 0xa879d8a7 //36 + .word 0x3cec53d4 //43 + .word 0x1e7629ea //42 + .word 0x0f3b14f5 //41 + .word 0x879d8a7a //40 + .word 0xcec53d43 //47 + .word 0xe7629ea1 //46 + .word 0xf3b14f50 //45 + .word 0x79d8a7a8 //44 + .word 0xec53d43c //51 + .word 0x7629ea1e //50 + .word 
0x3b14f50f //49 + + + .size sm3_mb_sm_x4, .-sm3_mb_sm_x4 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c new file mode 100644 index 000000000..b1c6ee26b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c @@ -0,0 +1,284 @@ +/********************************************************************** + Copyright(c) 2011-2020 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+/*
+ * AVX2 multi-buffer SM3 hash-context manager.
+ *
+ * This file is the generic "ctx" layer: it buffers partial blocks,
+ * applies SM3 padding, and feeds whole blocks to the lane-based job
+ * manager (sm3_mb_mgr_*_avx2, implemented in assembly elsewhere).
+ */
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_mb_mgr_init_avx2(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_avx2(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_avx2(SM3_MB_JOB_MGR * state);
+
+// Reset the lane-based job manager: mark all SM3_X8_LANES lanes free and idle.
+void sm3_mb_mgr_init_avx2(SM3_MB_JOB_MGR * state)
+{
+	unsigned int j;
+	// Nibble-packed free-lane list (8 lanes for AVX2, 0xF terminator)
+	state->unused_lanes = 0xF76543210;
+	state->num_lanes_inuse = 0;
+	for (j = 0; j < SM3_X8_LANES; j++) {
+		state->lens[j] = 0;
+		state->ldata[j].job_in_lane = 0;
+	}
+}
+
+// Initialize the context manager; just resets the embedded job manager.
+void sm3_ctx_mgr_init_avx2(SM3_HASH_CTX_MGR * mgr)
+{
+	sm3_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+// Submit (part of) a message for hashing. May return a *different*
+// completed context (another lane that finished), or NULL if nothing
+// is ready yet; on flag/state misuse returns ctx with ctx->error set.
+SM3_HASH_CTX *sm3_ctx_mgr_submit_avx2(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+				      const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		// User should not pass anything other than FIRST, UPDATE, or LAST
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		// Cannot submit to a currently processing job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// Cannot update a finished job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Init digest
+		hash_init_digest(ctx->job.result_digest);
+
+		// Reset byte counter
+		ctx->total_length = 0;
+
+		// Clear extra blocks
+		ctx->partial_block_buffer_length = 0;
+	}
+	// If we made it here, there were no errors during this call to submit
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Store buffer ptr info from user
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	// Store the user's request flags and mark this ctx as currently being processed.
+	ctx->status = (flags & HASH_LAST) ?
+	    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+	    HASH_CTX_STS_PROCESSING;
+
+	// Advance byte counter
+	ctx->total_length += len;
+
+	// If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+	// Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+	if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+		// Compute how many bytes to copy from user buffer into extra block
+		uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			// Copy and update relevant pointers and counters
+			memcpy_varlen(&ctx->partial_block_buffer
+				      [ctx->partial_block_buffer_length], buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+		// The extra block should never contain more than 1 block here
+		assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+		// If the extra block buffer contains exactly 1 block, it can be hashed.
+		if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;	// job length is counted in blocks, not bytes
+			ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+		}
+	}
+
+	return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+// Drain all in-flight jobs. Returns one finished context per call,
+// NULL when no jobs remain.
+SM3_HASH_CTX *sm3_ctx_mgr_flush_avx2(SM3_HASH_CTX_MGR * mgr)
+{
+	SM3_HASH_CTX *ctx;
+
+	while (1) {
+		ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_avx2(&mgr->mgr);
+
+		// If flush returned 0, there are no more jobs in flight.
+		if (!ctx)
+			return NULL;
+
+		// If flush returned a job, verify that it is safe to return to the user.
+		// If it is not ready, resubmit the job to finish processing.
+		ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+		// If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+		if (ctx)
+			return ctx;
+
+		// Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+	}
+}
+
+// Drive a context through its state machine: hash remaining whole
+// blocks, apply padding on HASH_LAST, byte-swap the digest on
+// completion. Returns the ctx when user-visible progress is done,
+// NULL when the ctx was handed back to the job manager.
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			unsigned int j;
+			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
+			// Digest words are kept in native order internally; present big-endian
+			for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+				ctx->job.result_digest[j] =
+				    byteswap32(ctx->job.result_digest[j]);
+			}
+			return ctx;
+		}
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+
+			// Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+			uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy_varlen(ctx->partial_block_buffer,
					      ((const char *)buffer + len), copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			// len should be a multiple of the block size now
+			assert((len % SM3_BLOCK_SIZE) == 0);
+
+			// Set len to the number of blocks to be hashed in the user's buffer
+			len >>= SM3_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr,
+									      &ctx->job);
+				continue;
+			}
+		}
+		// If the extra blocks are not empty, then we are either on the last block(s)
+		// or we need more user input before continuing.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+			ctx->status =
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+			continue;
+		}
+
+		if (ctx)
+			ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+// Load the SM3 initial digest value (IV) into a job's result_digest.
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+	static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+	    { SM3_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+// Write SM3 padding (0x80, zeros, 64-bit big-endian bit length) into
+// padblock starting at the partial-block tail. Returns the number of
+// extra blocks (1 or 2) that must still be hashed.
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+	memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	// Move i to the end of either 1st or 2nd extra block depending on length
+	i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1)))
+	    + 1 + SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+	*((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+	return i >> SM3_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
+}
+
+// ISA-L version stamps ("slver") for the exported entry points.
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+
+struct slver sm3_ctx_mgr_init_avx2_slver_0000;
+struct slver sm3_ctx_mgr_init_avx2_slver = { 0x2309, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_submit_avx2_slver_0000;
+struct slver sm3_ctx_mgr_submit_avx2_slver = { 0x230a, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_flush_avx2_slver_0000;
+struct slver sm3_ctx_mgr_flush_avx2_slver = { 0x230b, 0x00, 0x00 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c
new file mode 100644
index 000000000..8169aa170
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c
@@ -0,0 +1,292 @@
+/**********************************************************************
+  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * AVX512 multi-buffer SM3 hash-context manager (compiled only when the
+ * assembler understands AVX512 — see HAVE_AS_KNOWS_AVX512 below).
+ * Same ctx-layer state machine as sm3_ctx_avx2.c, feeding the
+ * 16-lane sm3_mb_mgr_*_avx512 job manager.
+ */
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_mb_mgr_init_avx512(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_avx512(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_avx512(SM3_MB_JOB_MGR * state);
+
+// Reset the lane-based job manager: mark all SM3_MAX_LANES lanes free and idle.
+void sm3_mb_mgr_init_avx512(SM3_MB_JOB_MGR * state)
+{
+	unsigned int j;
+	// Nibble-packed free-lane list for 16 lanes
+	state->unused_lanes = 0xfedcba9876543210;
+	state->num_lanes_inuse = 0;
+	for (j = 0; j < SM3_MAX_LANES; j++) {
+		state->lens[j] = 0;
+		state->ldata[j].job_in_lane = 0;
+	}
+}
+
+// Initialize the context manager; just resets the embedded job manager.
+void sm3_ctx_mgr_init_avx512(SM3_HASH_CTX_MGR * mgr)
+{
+	sm3_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+// Submit (part of) a message for hashing. May return a *different*
+// completed context (another lane that finished), or NULL if nothing
+// is ready yet; on flag/state misuse returns ctx with ctx->error set.
+SM3_HASH_CTX *sm3_ctx_mgr_submit_avx512(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+					const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		// User should not pass anything other than FIRST, UPDATE, or LAST
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		// Cannot submit to a currently processing job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// Cannot update a finished job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Init digest
+		hash_init_digest(ctx->job.result_digest);
+
+		// Reset byte counter
+		ctx->total_length = 0;
+
+		// Clear extra blocks
+		ctx->partial_block_buffer_length = 0;
+	}
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Store buffer ptr info from user
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	ctx->status = (flags & HASH_LAST) ?
+	    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+	    HASH_CTX_STS_PROCESSING;
+
+	// Advance byte counter
+	ctx->total_length += len;
+
+	// if partial_block_buffer_length != 0 means ctx get extra data
+	// len < SM3_BLOCK_SIZE means data len < SM3_BLOCK_SIZE
+	if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+		// Compute how many bytes to copy from user buffer into extra block
+		uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			// Copy and update relevant pointers and counters
+			memcpy_varlen(&ctx->partial_block_buffer
+				      [ctx->partial_block_buffer_length], buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+		// The extra block should never contain more than 1 block here
+		assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+		// If the extra block buffer contains exactly 1 block, it can be hashed.
+		if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+
+			ctx->partial_block_buffer_length = 0;
+			ctx->job.buffer = ctx->partial_block_buffer;
+
+			ctx->job.len = 1;	// job length is counted in blocks, not bytes
+			ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+		}
+
+	}
+
+	return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+// Drive a context through its state machine: hash remaining whole
+// blocks, apply padding on HASH_LAST, byte-swap the digest on
+// completion. Returns the ctx when user-visible progress is done,
+// NULL when the ctx was handed back to the job manager.
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			unsigned int j;
+			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
+			// Digest words are kept in native order internally; present big-endian
+			for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+				ctx->job.result_digest[j] =
+				    byteswap32(ctx->job.result_digest[j]);
+			}
+			return ctx;
+		}
+		// partial_block_buffer_length must be 0 that means incoming_buffer_length have not be init.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+
+			// copy_len will check len % SM3_BLOCK_SIZE ?= 0
+			uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+			// if mod SM3_BLOCK_SIZE != 0
+			if (copy_len) {
+				len -= copy_len;
+				memcpy_varlen(ctx->partial_block_buffer,
+					      ((const char *)buffer + len), copy_len);
+				// store the extra data
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+			// after len -= copy_len or copy_len == 0
+			assert((len % SM3_BLOCK_SIZE) == 0);
+			// get the block size , eq len = len / 64
+			len >>= SM3_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx =
+				    (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr,
+									      &ctx->job);
+				continue;
+			}
+		}
+		// If the extra blocks are not empty, then we are either on the last block(s)
+		// or we need more user input before continuing.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+			ctx->status =
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+			// loop: the COMPLETE branch above byte-swaps and returns once this job finishes
+			continue;
+		}
+
+		if (ctx)
+			ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+// Write SM3 padding (0x80, zeros, 64-bit big-endian bit length) into
+// padblock starting at the partial-block tail. Returns the number of
+// extra blocks (1 or 2) that must still be hashed.
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+	memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	// Move i to the end of either 1st or 2nd extra block depending on length
+	i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1)))
+	    + 1 + SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+	*((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+	return i >> SM3_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
+}
+
+// Drain all in-flight jobs. Returns one finished context per call,
+// NULL when no jobs remain.
+SM3_HASH_CTX *sm3_ctx_mgr_flush_avx512(SM3_HASH_CTX_MGR * mgr)
+{
+
+	SM3_HASH_CTX *ctx;
+
+	while (1) {
+		ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_avx512(&mgr->mgr);
+
+		// If flush returned 0, there are no more jobs in flight.
+		if (!ctx)
+			return NULL;
+
+		// If flush returned a job, verify that it is safe to return to the user.
+		// If it is not ready, resubmit the job to finish processing.
+		ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+		// If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+		if (ctx)
+			return ctx;
+
+		// Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+	}
+
+}
+
+// Load the SM3 initial digest value (IV) into a job's result_digest.
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+	static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+	    { SM3_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+// ISA-L version stamps ("slver") for the exported entry points.
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+
+struct slver sm3_ctx_mgr_init_avx512_slver_0000;
+struct slver sm3_ctx_mgr_init_avx512_slver = { 0x2306, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_submit_avx512_slver_0000;
+struct slver sm3_ctx_mgr_submit_avx512_slver = { 0x2307, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_flush_avx512_slver_0000;
+struct slver sm3_ctx_mgr_flush_avx512_slver = { 0x2308, 0x00, 0x00 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c
new file mode 100644
index 000000000..e8fcfe08a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c
@@ -0,0 +1,314 @@
+/**********************************************************************
+  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <string.h> +#include "sm3_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +#include <intrin.h> +#define inline __inline +#endif + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +#define rol32(x, r) (((x)<<(r)) | ((x)>>(32-(r)))) + +static void sm3_init(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len); +static uint32_t OPT_FIX sm3_update(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len); +static void OPT_FIX sm3_final(SM3_HASH_CTX * ctx, uint32_t remain_len); +static void OPT_FIX sm3_single(const volatile void *data, uint32_t digest[]); +static inline void hash_init_digest(SM3_WORD_T * digest); + +static inline uint32_t P0(uint32_t X) +{ + return (X ^ (rol32(X, 9)) ^ (rol32(X, 17))); +} + +static inline uint32_t P1(uint32_t X) +{ + return (X ^ (rol32(X, 15)) ^ (rol32(X, 23))); +} + +static inline uint32_t sm3_ff(int j, uint32_t x, uint32_t y, uint32_t z) +{ + return j < 16 ? (x ^ y ^ z) : ((x & y) | (x & z) | (y & z)); +} + +static inline uint32_t sm3_gg(int j, uint32_t x, uint32_t y, uint32_t z) +{ + return j < 16 ? 
(x ^ y ^ z) : ((x & y) | ((~x) & z)); +} + +static inline void sm3_message_schedule(uint32_t bi[], volatile uint32_t W[], + volatile uint32_t W_B[]) +{ + int j; + volatile uint32_t tmp; + + for (j = 0; j <= 15; j++) { + W[j] = to_be32(bi[j]); + } + + for (; j <= 67; j++) { + tmp = W[j - 16] ^ W[j - 9] ^ rol32(W[j - 3], 15); + W[j] = P1(tmp) ^ (rol32(W[j - 13], 7)) ^ W[j - 6]; + } + + for (j = 0; j < 64; j++) { + W_B[j] = W[j] ^ W[j + 4]; + } + + tmp = 0; +} + +static inline void sm3_compress_step_func(int j, volatile uint32_t * a_p, + volatile uint32_t * b_p, volatile uint32_t * c_p, + volatile uint32_t * d_p, volatile uint32_t * e_p, + volatile uint32_t * f_p, volatile uint32_t * g_p, + volatile uint32_t * h_p, volatile uint32_t W[], + volatile uint32_t W_B[]) +{ + volatile uint32_t SS1, SS2, TT1, TT2; + uint32_t T = j < 16 ? 0x79cc4519 : 0x7a879d8a; + + SS1 = rol32(rol32(*a_p, 12) + *e_p + rol32(T, (j % 32)), 7); + SS2 = SS1 ^ rol32(*a_p, 12); + TT1 = sm3_ff(j, *a_p, *b_p, *c_p) + *d_p + SS2 + W_B[j]; + TT2 = sm3_gg(j, *e_p, *f_p, *g_p) + *h_p + SS1 + W[j]; + *d_p = *c_p; + *c_p = rol32(*b_p, 9); + *b_p = *a_p; + *a_p = TT1; + *h_p = *g_p; + *g_p = rol32(*f_p, 19); + *f_p = *e_p; + *e_p = P0(TT2); + + SS1 = 0; + SS2 = 0; + TT1 = 0; + TT2 = 0; +} + +void sm3_ctx_mgr_init_base(SM3_HASH_CTX_MGR * mgr) +{ +} + +SM3_HASH_CTX *sm3_ctx_mgr_submit_base(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + uint32_t remain_len; + + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) { + // Cannot submit a new entire job to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags == HASH_FIRST) { + if (len % SM3_BLOCK_SIZE != 0) { + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + sm3_init(ctx, buffer, len); + sm3_update(ctx, buffer, len); + } + + if (flags == HASH_UPDATE) { + if (len % SM3_BLOCK_SIZE != 0) { + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + sm3_update(ctx, buffer, len); + } + + if (flags == HASH_LAST) { + remain_len = sm3_update(ctx, buffer, len); + sm3_final(ctx, remain_len); + } + + if (flags == HASH_ENTIRE) { + sm3_init(ctx, buffer, len); + remain_len = sm3_update(ctx, buffer, len); + sm3_final(ctx, remain_len); + } + + return ctx; +} + +SM3_HASH_CTX *sm3_ctx_mgr_flush_base(SM3_HASH_CTX_MGR * mgr) +{ + return NULL; +} + +static void sm3_init(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Mark it as processing + ctx->status = HASH_CTX_STS_PROCESSING; +} + +static uint32_t sm3_update(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + uint32_t remain_len = len; + uint32_t *digest = ctx->job.result_digest; + + while (remain_len >= SM3_BLOCK_SIZE) { + sm3_single(buffer, digest); + buffer = (void *)((uint8_t *) buffer + SM3_BLOCK_SIZE); + remain_len -= SM3_BLOCK_SIZE; + ctx->total_length += SM3_BLOCK_SIZE; + } + + ctx->incoming_buffer = buffer; + return remain_len; +} + +static void sm3_final(SM3_HASH_CTX * ctx, uint32_t remain_len) +{ + const void *buffer = ctx->incoming_buffer; + uint32_t i = remain_len; + uint32_t j; + volatile uint8_t buf[2 * SM3_BLOCK_SIZE] = { 0 }; + uint32_t *digest = ctx->job.result_digest; + + ctx->total_length += i; + memcpy((void *)buf, buffer, i); + buf[i++] = 0x80; + + i = (i > 
SM3_BLOCK_SIZE - SM3_PADLENGTHFIELD_SIZE ? + 2 * SM3_BLOCK_SIZE : SM3_BLOCK_SIZE); + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8); + + sm3_single(buf, digest); + if (i == 2 * SM3_BLOCK_SIZE) { + sm3_single(buf + SM3_BLOCK_SIZE, digest); + } + + /* convert to small-endian for words */ + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + digest[j] = byteswap32(digest[j]); + } + + ctx->status = HASH_CTX_STS_COMPLETE; + memset((void *)buf, 0, sizeof(buf)); +} + +static void sm3_single(const volatile void *data, uint32_t digest[]) +{ + volatile uint32_t a, b, c, d, e, f, g, h; + volatile uint32_t W[68], W_bar[64]; + int j; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + f = digest[5]; + g = digest[6]; + h = digest[7]; + + sm3_message_schedule((uint32_t *) data, W, W_bar); + for (j = 0; j < 64; j++) { + sm3_compress_step_func(j, &a, &b, &c, &d, &e, &f, &g, &h, W, W_bar); + } + + digest[0] ^= a; + digest[1] ^= b; + digest[2] ^= c; + digest[3] ^= d; + digest[4] ^= e; + digest[5] ^= f; + digest[6] ^= g; + digest[7] ^= h; + + memset((void *)W, 0, sizeof(W)); + memset((void *)W_bar, 0, sizeof(W_bar)); + + a = 0; + b = 0; + c = 0; + d = 0; + e = 0; + f = 0; + g = 0; + h = 0; +} + +static inline void hash_init_digest(SM3_WORD_T * digest) +{ + static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] = + { SM3_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sm3_ctx_mgr_init_base_slver_0000; +struct slver sm3_ctx_mgr_init_base_slver = { 0x2303, 0x00, 0x00 }; + +struct slver sm3_ctx_mgr_submit_base_slver_0000; +struct slver sm3_ctx_mgr_submit_base_slver = { 0x2304, 0x00, 0x00 }; + +struct slver sm3_ctx_mgr_flush_base_slver_0000; +struct slver sm3_ctx_mgr_flush_base_slver = { 0x2305, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c 
b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c new file mode 100644 index 000000000..d74a4c882 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c @@ -0,0 +1,54 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include <stdint.h> +#include <string.h> +#include "sm3_mb.h" +#include "memcpy_inline.h" + +extern void sm3_ctx_mgr_init_base(SM3_HASH_CTX_MGR * mgr); +extern SM3_HASH_CTX *sm3_ctx_mgr_submit_base(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags); +extern SM3_HASH_CTX *sm3_ctx_mgr_flush_base(SM3_HASH_CTX_MGR * mgr); + +void sm3_ctx_mgr_init(SM3_HASH_CTX_MGR * mgr) +{ + return sm3_ctx_mgr_init_base(mgr); +} + +SM3_HASH_CTX *sm3_ctx_mgr_submit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + return sm3_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags); +} + +SM3_HASH_CTX *sm3_ctx_mgr_flush(SM3_HASH_CTX_MGR * mgr) +{ + return sm3_ctx_mgr_flush_base(mgr); +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm new file mode 100644 index 000000000..0f2a0f39a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm @@ -0,0 +1,65 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define constants +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define STS_UNKNOWN 0 +%define STS_BEING_PROCESSED 1 +%define STS_COMPLETED 2 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Threshold constants +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; if number of lanes in use <= threshold, using sb func +%define SM3_SB_THRESHOLD_SSE 1 +%define SM3_SB_THRESHOLD_AVX 1 +%define SM3_SB_THRESHOLD_AVX2 1 +%define SM3_SB_THRESHOLD_AVX512 1 +%define SM3_NI_SB_THRESHOLD_SSE 4 ; shani is faster than sse sha256_mb +%define SM3_NI_SB_THRESHOLD_AVX512 6 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define SHA256_JOB structure +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; SHA256_JOB + +;;; name size align +FIELD _buffer, 8, 8 ; pointer to buffer +FIELD _len, 8, 8 ; length in bytes +FIELD _result_digest, 8*4, 64 ; Digest (output) +FIELD 
_status, 4, 4 +FIELD _user_data, 8, 8 + +%assign _SM3_JOB_size _FIELD_OFFSET +%assign _SM3_JOB_align _STRUCT_ALIGN diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c new file mode 100644 index 000000000..fbbb2a1a7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c @@ -0,0 +1,145 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#define ISAL_UNIT_TEST +#include <stdio.h> +#include <stdlib.h> +#include "sm3_mb.h" +#include "endian_helper.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS (SM3_MAX_LANES - 1) +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS]; + +// Compare against reference function +extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest); + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +uint8_t lens_print_and_check(SM3_HASH_CTX_MGR * mgr) +{ + static int32_t last_lens[SM3_MAX_LANES] = { 0 }; + int32_t len; + uint8_t num_unchanged = 0; + int i; + for (i = 0; i < SM3_MAX_LANES; i++) { + len = (int32_t) mgr->mgr.lens[i]; + // len[i] in mgr consists of byte_length<<4 | lane_index + len = (len >= 16) ? (len >> 4 << 6) : 0; + printf("\t%d", len); + if (last_lens[i] > 0 && last_lens[i] == len) + num_unchanged += 1; + last_lens[i] = len; + } + printf("\n"); + return num_unchanged; +} + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS]; + uint32_t i, j, fail = 0; + unsigned char *bufs[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + uint8_t num_ret, num_unchanged = 0; + int ret; + + printf("sm3_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sm3_ctx_mgr_init(mgr); + + srand(TEST_SEED); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + lens[i] = TEST_LEN / SM3_MAX_LANES * (i + 1); + bufs[i] = (unsigned char *)malloc(lens[i]); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], lens[i]); + } + + for (i 
= 0; i < TEST_BUFS; i++) { + // Init ctx contexts + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sm3_ossl(bufs[i], lens[i], digest_ref[i]); + + // Run sb_sm3 test + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + printf("Changes of lens inside mgr:\n"); + lens_print_and_check(mgr); + while (sm3_ctx_mgr_flush(mgr)) { + num_ret = lens_print_and_check(mgr); + num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret; + } + printf("Info of sm3_mb lens prints over\n"); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref[i])[j])) { + fail++; + printf("Test%d fixed size, digest%d " + "fail 0x%08X <=> 0x%08X \n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ref[i])[j])); + } + } + } + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf("Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm new file mode 100644 index 000000000..a2319ba14 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm @@ -0,0 +1,77 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define SM3 Out Of Order Data Structures +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; LANE_DATA +;;; name size align +FIELD _job_in_lane, 8, 8 ; pointer to job object +END_FIELDS + +%assign _LANE_DATA_size _FIELD_OFFSET +%assign _LANE_DATA_align _STRUCT_ALIGN + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; SM3_ARGS_X16 +;;; name size align +FIELD _digest, 4*8*16, 4 ; transposed digest +FIELD _data_ptr, 8*16, 8 ; array of pointers to data +END_FIELDS + +%assign _SM3_ARGS_X4_size _FIELD_OFFSET +%assign _SM3_ARGS_X4_align _STRUCT_ALIGN +%assign _SM3_ARGS_X8_size _FIELD_OFFSET +%assign _SM3_ARGS_X8_align _STRUCT_ALIGN +%assign _SM3_ARGS_X16_size _FIELD_OFFSET +%assign _SM3_ARGS_X16_align _STRUCT_ALIGN + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +START_FIELDS ; MB_MGR +;;; name size align +FIELD _args, _SM3_ARGS_X4_size, _SM3_ARGS_X4_align +FIELD _lens, 4*16, 8 +FIELD _unused_lanes, 8, 8 +FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align +FIELD _num_lanes_inuse, 4, 4 +END_FIELDS + +%assign _MB_MGR_size _FIELD_OFFSET +%assign _MB_MGR_align _STRUCT_ALIGN + +_args_digest equ _args + _digest +_args_data_ptr equ _args + _data_ptr diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm new file mode 100644 index 000000000..b87bdcba8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm @@ -0,0 +1,258 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sm3_job.asm" +%include "sm3_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sm3_mb_x8_avx2 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define tmp4 rdx +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%else + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define tmp4 rsi +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif + +; Common register definitions + +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx must be a register not clobberred by sm3_mb_x8_avx2 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; 
SM3_JOB* sm3_mb_mgr_flush_avx2(SM3_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sm3_mb_mgr_flush_avx2, function +sm3_mb_mgr_flush_avx2: + endbranch + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + ; use num_lanes_inuse to judge all lanes are empty + cmp dword [state + _num_lanes_inuse], 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [four] + cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [five] + cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [six] + cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [seven] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 8 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 
0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqa xmm0, [state + _lens + 0*16] + vmovdqa xmm1, [state + _lens + 1*16] + + vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A} + vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F} + vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + +mb_processing: + + vpand xmm2, xmm2, [rel clear_low_nibble] + vpshufd xmm2, xmm2, 0 + + vpsubd xmm0, xmm0, xmm2 + vpsubd xmm1, xmm1, xmm2 + + vmovdqa [state + _lens + 0*16], xmm0 + vmovdqa [state + _lens + 1*16], xmm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sm3_mb_x8_avx2 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + 
_XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 +four: dq 4 +five: dq 5 +six: dq 6 +seven: dq 7 diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm new file mode 100644 index 000000000..7feada49f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm @@ -0,0 +1,276 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sm3_job.asm" +%include "sm3_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + + + +%ifdef HAVE_AS_KNOWS_AVX512 + +extern sm3_mb_x16_avx512 +;extern sm3_opt_x1 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg1 rdi ; rcx + %define arg2 rsi ; rdx + %define tmp4 rdx +%else + %define arg1 rcx + %define arg2 rdx + %define tmp4 rsi +%endif + + +; Common register definitions + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define idx rbp + +%define num_lanes_inuse r9 +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + + + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*8 +_ALIGN_SIZE equ 8 + +_XMM_SAVE equ 0 +_GPR_SAVE equ 
_XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + + +; SM3_JOB* sm3_mb_mgr_flush_avx512(SM3_MB_JOB_MGR *state) +; arg 1 : rcx : state +mk_global sm3_mb_mgr_flush_avx512, function +sm3_mb_mgr_flush_avx512: + endbranch + + ; Save the stack + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + cmp num_lanes_inuse, 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx +%assign I 1 +%rep 15 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [APPEND(lane_,I)] +%assign I (I+1) +%endrep + + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 16 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1} + vpminud ymm2, ymm2, ymm3 ; ymm2 has 
{x,x,H2,G2,x,x,D2,C2} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3} + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + ; flush may check here and call x1 + +mb_processing: + + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sm3_mb_x16_avx512 + ; state and idx are intact + + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + + +; return back stack +return: +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + 
vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + add rsp, STACK_SPACE + + ret + + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 +lane_1: dq 1 +lane_2: dq 2 +lane_3: dq 3 +lane_4: dq 4 +lane_5: dq 5 +lane_6: dq 6 +lane_7: dq 7 +lane_8: dq 8 +lane_9: dq 9 +lane_10: dq 10 +lane_11: dq 11 +lane_12: dq 12 +lane_13: dq 13 +lane_14: dq 14 +lane_15: dq 15 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sm3_mb_mgr_flush_avx512 +no_sm3_mb_mgr_flush_avx512: +%endif + +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm new file mode 100644 index 000000000..ae95faa89 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm @@ -0,0 +1,247 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sm3_job.asm" +%include "memcpy.asm" +%include "sm3_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sm3_mb_x8_avx2 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define idx r8 +%define last_len r8 +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp + +%define tmp r9 + +%define lane_data r10 + + +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*8 + 16*10 + 8 + +; SM3_JOB* sm3_mb_mgr_submit_avx2(SM3_MB_JOB_MGR *state, SM3_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sm3_mb_mgr_submit_avx2, function +sm3_mb_mgr_submit_avx2: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp + mov [rsp + 8*4], r12 + mov [rsp + 8*5], r13 + mov [rsp + 8*6], r14 + mov [rsp + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + mov unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + 
_ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + mov [state + _lens + 4*lane], DWORD(len) + + mov [lane_data + _job_in_lane], job + + ; Load digest words from result_digest + vmovdqu xmm0, [job + _result_digest + 0*16] + vmovdqu xmm1, [job + _result_digest + 1*16] + vmovd [state + _args_digest + 4*lane + 0*4*8], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*4*8], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*4*8], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*4*8], xmm0, 3 + vmovd [state + _args_digest + 4*lane + 4*4*8], xmm1 + vpextrd [state + _args_digest + 4*lane + 5*4*8], xmm1, 1 + vpextrd [state + _args_digest + 4*lane + 6*4*8], xmm1, 2 + vpextrd [state + _args_digest + 4*lane + 7*4*8], xmm1, 3 + + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + cmp unused_lanes, 0xf + jne return_null + +start_loop: + ; Find min length + vmovdqa xmm0, [state + _lens + 0*16] + vmovdqa xmm1, [state + _lens + 1*16] + + vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A} + vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F} + vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + vpand xmm2, xmm2, [rel clear_low_nibble] + vpshufd xmm2, xmm2, 0 + + vpsubd xmm0, xmm0, xmm2 + vpsubd xmm1, xmm1, xmm2 + + vmovdqa [state + _lens + 0*16], xmm0 + vmovdqa [state + _lens + 1*16], xmm1 + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sm3_mb_x8_avx2 + + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + 
_job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 8*8 + 16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + mov r12, [rsp + 8*4] + mov r13, [rsp + 8*5] + mov r14, [rsp + 8*6] + mov r15, [rsp + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm new file mode 100644 index 000000000..7b7b21287 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm @@ -0,0 +1,273 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sm3_job.asm" +%include "memcpy.asm" +%include "sm3_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +; +; SM3_JOB* sm3_mb_mgr_submit_avx512 (SM3_MB_JOB_MGR *state, SM3_JOB* job); +; + +%ifdef HAVE_AS_KNOWS_AVX512 + +;todo sm3_mb_x16_avx512 +extern sm3_mb_x16_avx512 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +%define arg1 rdi ; state +%define arg2 rsi ; job + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx ; state +%define arg2 rdx ; job + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 ; +%define len2 arg2 ; + offset +%define p2 arg2 ; need + offset + +%define idx r8 +%define last_len r8 +%define p r11 +%define start_offset r11 +%define num_lanes_inuse r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp + +%define tmp r9 + +%define lane_data r10 + +; todo make sure +; STACK_SPACE needs to be an odd multiple of 8 +%define STACK_SPACE 8*8 + 16*10 + 8 + +mk_global sm3_mb_mgr_submit_avx512, function +sm3_mb_mgr_submit_avx512: + endbranch + + ; save these registers + sub rsp, STACK_SPACE + ; rsp contain stack ptr , mov to stack bottom + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp ; unuse 1 2 + mov [rsp + 8*4], r12 + mov [rsp + 8*5], r13 + mov [rsp + 8*6], r14 + mov [rsp + 8*7], r15 + ;mov rbx,rbp,r12,r13,r14,r15 to stack +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + mov 
unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + ; mov args to rbx and then mov rbx to rbp + ; unused_lanes - rbx , lane - rbp both have already backup + and lane, 0xF + ; unless lane is 0x789abcdef, and return 0 + + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + shl len, 4 + or len, lane + mov [state + _lens + 4*lane], DWORD(len) + + mov [lane_data + _job_in_lane], job + + ; Load digest words from result_digest + vmovdqu xmm0, [job + _result_digest + 0*16] + vmovdqu xmm1, [job + _result_digest + 1*16] + vmovd [state + _args_digest + 4*lane + 0*4*16], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*4*16], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*4*16], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*4*16], xmm0, 3 + vmovd [state + _args_digest + 4*lane + 4*4*16], xmm1 + vpextrd [state + _args_digest + 4*lane + 5*4*16], xmm1, 1 + vpextrd [state + _args_digest + 4*lane + 6*4*16], xmm1, 2 + vpextrd [state + _args_digest + 4*lane + 7*4*16], xmm1, 3 + + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + add num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + ; eq jump + cmp num_lanes_inuse, 16 + jne return_null + +start_loop: + ; Find min length, ymm0 holds ahead 8, ymm1 holds rear 8 + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3} + vpminud ymm2, 
ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF + shr len2, 4 + jz len_is_0 + + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sm3_mb_x16_avx512 + + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +; restore stack +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 8*8 + 16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 
8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + mov r12, [rsp + 8*4] + mov r13, [rsp + 8*5] + mov r14, [rsp + 8*6] + mov r15, [rsp + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=32 + +align 32 +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 + + + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sm3_mb_mgr_submit_avx512 +no_sm3_mb_mgr_submit_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c new file mode 100644 index 000000000..b904ba0ca --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c @@ -0,0 +1,160 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#define ISAL_UNIT_TEST +#include <stdio.h> +#include <stdlib.h> +#include "sm3_mb.h" +#include "endian_helper.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 200 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS]; + +extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest); + +// Generates pseudo-random data +static void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, fail = 0; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + int ret; + + printf("multibinary_sm3 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN); + + srand(TEST_SEED); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sm3_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], 
TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // SSL test + sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]); + + // sb_sm3 test + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sm3_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sm3_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Random buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run SSL test + sm3_ossl(bufs[i], lens[i], digest_ssl[i]); + + // Run sb_sm3 test + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sm3_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sm3_ssl rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c new file mode 100644 index 000000000..3671a3b79 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c @@ -0,0 
+1,206 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#define ISAL_UNIT_TEST +#include <stdio.h> +#include <stdlib.h> +#include "sm3_mb.h" +#include "endian_helper.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS]; + +// Compare against reference function +extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest); + +// Generates pseudo-random data +static void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS]; + uint32_t i, j, fail = 0; + unsigned char *bufs[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + uint8_t *tmp_buf; + int ret; + + printf("multibinary_sm3 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sm3_ctx_mgr_init(mgr); + + srand(TEST_SEED); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contexts + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sm3_ossl(bufs[i], TEST_LEN, digest_ref[i]); + + // Run sb_sm3 test + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sm3_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref[i])[j])) { + fail++; + printf("Test%d fixed 
size, digest%d " + "fail 0x%08X <=> 0x%08X \n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ref[i])[j])); + } + } + } + + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sm3_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Use buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run reference test + sm3_ossl(bufs[i], lens[i], digest_ref[i]); + + // Run sm3_mb test + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sm3_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref[i])[j])) { + fail++; + printf("Test%d, digest%d fail " + "0x%08X <=> 0x%08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ref[i])[j])); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + // Test at the end of buffer + jobs = rand() % TEST_BUFS; + tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs); + if (!tmp_buf) { + printf("malloc failed, end test aborted.\n"); + return 1; + } + + rand_buffer(tmp_buf, jobs); + + sm3_ctx_mgr_init(mgr); + + // Extend to the end of allocated buffer to construct jobs + for (i = 0; i < jobs; i++) { + bufs[i] = (uint8_t *) & tmp_buf[i]; + lens[i] = jobs - i; + + // Reference test + sm3_ossl(bufs[i], lens[i], digest_ref[i]); + + // sb_sm3 test + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sm3_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref[i])[j])) { + fail++; + printf("End test 
failed at offset %d - result: 0x%08X" + ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ref[i])[j])); + } + } + } + + putchar('.'); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sm3 rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c new file mode 100644 index 000000000..64e583ffc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c @@ -0,0 +1,298 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#define ISAL_UNIT_TEST +#include <stdio.h> +#include <stdlib.h> +#include "sm3_mb.h" +#include "endian_helper.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define UPDATE_SIZE 13*SM3_BLOCK_SIZE +#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SM3_BLOCK_SIZE)) + +#ifdef DEBUG +# define debug_char(x) putchar(x) +#else +# define debug_char(x) do {} while (0) +#endif + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS]; +extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest); + +// Generates pseudo-random data +static void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, fail = 0; + int len_done, len_rem, len_rand; + unsigned char *bufs[TEST_BUFS]; + unsigned char *buf_ptr[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int joblen, jobs, t; + int ret; + + printf("multibinary_sm3_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + srand(TEST_SEED); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + 
printf("posix_memalign failed test aborted\n"); + return 1; + } + + sm3_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocte and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + buf_ptr[i] = bufs[i]; + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sm3_ossl(bufs[i], TEST_LEN, digest_ref[i]); + } + + // Run sb_sm3 tests + for (i = 0; i < TEST_BUFS;) { + len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_done == 0) + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_FIRST); + else if (len_rem <= UPDATE_SIZE) + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + // Add jobs while available or finished + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + } + + // Start flushing finished jobs, end on last flushed + ctx = sm3_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sm3_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + + len_done = (int)((unsigned long)buf_ptr[i] + - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_rem <= UPDATE_SIZE) + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + if (ctx == NULL) + ctx = sm3_ctx_mgr_flush(mgr); + } + + // Check digests + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { 
+ if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref[i])[j])) { + fail++; + printf("Test%d fixed size, digest%d fail %8X <=> %8X", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ref[i])[j])); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + for (i = 0; i < jobs; i++) { + joblen = rand() % (TEST_LEN); + rand_buffer(bufs[i], joblen); + lens[i] = joblen; + buf_ptr[i] = bufs[i]; + sm3_ossl(bufs[i], lens[i], digest_ref[i]); + } + + sm3_ctx_mgr_init(mgr); + + // Run sm3_sb jobs + i = 0; + while (i < jobs) { + // Submit a new job + len_rand = SM3_BLOCK_SIZE + + SM3_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS); + + if (lens[i] > len_rand) + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_FIRST); + else + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], lens[i], HASH_ENTIRE); + + // Returned ctx could be: + // - null context (we are just getting started and lanes aren't full yet), or + // - finished already (an ENTIRE we submitted or a previous LAST is returned), or + // - an unfinished ctx, we will resubmit + + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } else { + // unfinished ctx returned, choose another random update length and submit either + // UPDATE or LAST depending on the amount of buffer remaining + while ((ctx != NULL) && !(hash_ctx_complete(ctx))) { + j = (unsigned long)(ctx->user_data); // Get index of the returned ctx + buf_ptr[j] = bufs[j] + ctx->total_length; + len_rand = (rand() % SM3_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + len_rem = lens[j] - ctx->total_length; + + if (len_rem <= len_rand) // submit the rest of the job as LAST + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rem, HASH_LAST); + else // submit the random update length as UPDATE + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rand, + 
HASH_UPDATE); + } // Either continue submitting any contexts returned here as UPDATE/LAST, or + // go back to submitting new jobs using the index i. + + i++; + } + } + + // Start flushing finished jobs, end on last flushed + ctx = sm3_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sm3_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer + len_rem = lens[i] - ctx->total_length; + len_rand = (rand() % SM3_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + debug_char('+'); + if (len_rem <= len_rand) + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_UPDATE); + + if (ctx == NULL) + ctx = sm3_ctx_mgr_flush(mgr); + } + + // Check result digest + for (i = 0; i < jobs; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ref[i])[j])) { + fail++; + printf("Test%d, digest%d fail %8X <=> %8X\n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ref[i])[j])); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sm3_update rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c new file mode 100644 index 000000000..c409530c7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c @@ -0,0 +1,250 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "sm3_mb.h" + +typedef struct { + const char *msg; + uint32_t resultDigest[SM3_DIGEST_NWORDS]; +} TestData; + +static TestData test_data[] = { + { + .msg = "abc", + .resultDigest = {0xf4f0c766, 0xd9edee62, 0x6bd4f2d1, 0xe2e410dc, + 0x87c46741, 0xa2f7f25c, 0x2ba07d29, 0xe0a84b8f} + }, + { + .msg = "abcdabcdabcdabcdabcdabcdabcdabcd" "abcdabcdabcdabcdabcdabcdabcdabcd", + .resultDigest = {0xf99fbede, 0xa1b87522, 0x89486038, 0x4d5a8ec1, + 0xe570db6f, 0x65577e38, 0xa3cb3d29, 0x32570c9c} + + }, + { + .msg = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", + .resultDigest = {0xc56c9b63, 0x379e4de6, 0x92b190a3, 0xeaa14fdf, + 0x74ab2007, 0xb992f67f, 0x664e8cf3, 0x058c7bad} + }, + + {.msg = "0123456789:;<=>?@ABCDEFGHIJKLMNO", + .resultDigest = {0x076833d0, 0xd089ec39, 0xad857685, 0x8089797a, + 0x9df9e8fd, 0x4126eb9a, 0xf38c22e8, 0x054bb846}}, + { + .msg = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<", + .resultDigest = {0x6cb9d38e, 0x846ac99e, 0x6d05634b, 0x3fe1bb26, + 0x90368c4b, 0xee8c4299, 0x08c0e96a, 0x2233cdc7} + }, + { + .msg = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR", + .resultDigest = {0x83758189, 0x050f14d1, 0x91d8a730, 0x4a2825e4, + 0x11723273, 0x2114ee3f, 0x18cac172, 0xa9c5b07a} + }, + { + .msg = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?", + .resultDigest = {0xb80f8aba, 0x55e96119, 0x851ac77b, 0xae31b3a5, + 0x1333e764, 0xc86ac40d, 0x34878db1, 0x7da873f6}, + }, + { + .msg = + 
"0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU", + .resultDigest = {0xbd5736a7, 0x55977d13, 0xa950c78a, 0x71eeb7cb, + 0xe9ef0ba5, 0x95a9302e, 0x155e5c33, 0xad96ce3c} + }, + { + .msg = "", + .resultDigest = {0x831db21a, 0x7fa1cf55, 0x4819618e, 0x8f1ae831, + 0xc7c8be22, 0x74fbfe28, 0xeb35d07e, 0x2baa8250} + + }, + +}; + +#define MSGS sizeof(test_data)/sizeof(TestData) +#define NUM_JOBS 1000 + +#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS + +int main(void) +{ + + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL; + uint32_t i, j, k, t, checked = 0; + uint32_t *good; + int ret; + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail"); + return -1; + } + sm3_ctx_mgr_init(mgr); + // Init contexts before first use + for (i = 0; i < MSGS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + for (i = 0; i < MSGS; i++) { + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], test_data[i].msg, + strlen((char *)test_data[i].msg), HASH_ENTIRE); + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = test_data[t].resultDigest; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." 
+ " Error code: %d", ctx->error); + return -1; + } + + } + } + + while (1) { + ctx = sm3_ctx_mgr_flush(mgr); + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = test_data[t].resultDigest; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + // do larger test in pseudo-random order + + // Init contexts before first use + for (i = 0; i < NUM_JOBS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + checked = 0; + for (i = 0; i < NUM_JOBS; i++) { + j = PSEUDO_RANDOM_NUM(i); + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + test_data[j].msg, strlen((char *)test_data[j].msg), + HASH_ENTIRE); + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = test_data[k].resultDigest; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. 
Error code: %d", ctx->error); + return -1; + } + + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + } + } + while (1) { + ctx = sm3_ctx_mgr_flush(mgr); + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = test_data[k].resultDigest; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + if (checked != NUM_JOBS) { + printf("only tested %d rather than %d\n", checked, NUM_JOBS); + return -1; + } + + printf(" multibinary_sm3 test: Pass\n"); + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c new file mode 100644 index 000000000..ed4d9a092 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c @@ -0,0 +1,128 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include "sm3_mb.h" +#include "test.h" + +// Set number of outstanding jobs +#define TEST_BUFS 32 + +#ifdef CACHED_TEST +// Loop many times over same data +# define TEST_LEN 4*1024 +# define TEST_LOOPS 10000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. 
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS + +extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest); +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS]; + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + int ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail"); + return -1; + } + sm3_ctx_mgr_init(mgr); + + // Start OpenSSL tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("sm3_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + + while (sm3_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_sm3" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + 
to_le32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + + printf("Multi-buffer sm3 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sm3_ossl_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c new file mode 100644 index 000000000..025fd90ed --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c @@ -0,0 +1,133 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include "sm3_mb.h" +#include "test.h" + +// Set number of outstanding jobs +#define TEST_BUFS SM3_MAX_LANES + +#ifdef CACHED_TEST +// Loop many times over same data +# define TEST_LEN 4*1024 +# define TEST_LOOPS 10000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS + +extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest); + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS]; + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + uint32_t nlanes; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + int ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail"); + return 
-1; + } + sm3_ctx_mgr_init(mgr); + + // Start OpenSSL tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("sm3_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb shortage tests + for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) { + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < nlanes; i++) + sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, + HASH_ENTIRE); + + while (sm3_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_sm3" TEST_TYPE_STR " with %d lanes: ", nlanes); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < nlanes; i++) { + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_le32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_le32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + } + + printf("Multi-buffer sm3 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sm3_ossl_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm new file mode 100644 index 000000000..3b300fa80 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm @@ -0,0 +1,1035 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sm3_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + + +%ifdef HAVE_AS_KNOWS_AVX512 + +[bits 64] +default rel +section .text + +; Define Stack Layout +START_FIELDS +;;; name size align +FIELD _DIGEST_SAVE, 8*64, 64 +FIELD _rsp, 8, 8 +%assign STACK_SPACE _FIELD_OFFSET + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg1 rcx ; arg0 preserved + %define arg2 rdx ; arg1 + %define reg3 r8 ; arg2 preserved + %define reg4 r9 ; arg3 + %define var1 rdi + %define var2 rsi + %define local_func_decl(func_name) global func_name + %else + %define arg1 rdi ; arg0 + %define arg2 rsi ; arg1 + %define var1 rdx ; arg2 + %define var2 rcx ; arg3 + %define local_func_decl(func_name) mk_global func_name, function, internal +%endif + +%define state arg1 +%define num_blks arg2 + +%define IN (state + _data_ptr) ; rdi + 8*16 +%define DIGEST state ; rdi +%define SIZE num_blks ; rsi + +%define IDX var1 +%define TBL var2 + +%define APPEND(a,b) a %+ b + + +%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define E zmm4 +%define F zmm5 +%define G zmm6 +%define H zmm7 + +; +; 4 ZMM for tmp data +; +%define TMP0 zmm8 +%define TMP1 zmm9 +%define TMP2 zmm10 +%define TMP3 zmm11 + +; +; Word W[] will be expand to array size 64 +; Word WB[] will be expand to array size 68 +; WB[j] : +; tmp = WB[j - 16] ^ WB[j - 9] ^ rol32(WB[j - 3], 15); +; WB[j] = P1(tmp) ^ (rol32(WB[j - 13], 7)) ^ WB[j - 6]; +; W[j]: +; W[j] = WB[j] xor WB[j+4] +; +; so we used zmm12~31 20 numbers ZMM to keep WB +; it is because once we calc W[j] value, we need +; WB[j - 16] to WB[j + 4] , it is 20 WB number. 
+; +; And also we keep the lane into ZMM12~ZMM27 +; once we calc WB value, lane will not work +; +%define WB0 zmm12 +%define WB1 zmm13 +%define WB2 zmm14 +%define WB3 zmm15 +%define WB4 zmm16 +%define WB5 zmm17 +%define WB6 zmm18 +%define WB7 zmm19 + +%define WB8 zmm20 +%define WB9 zmm21 +%define WB10 zmm22 +%define WB11 zmm23 +%define WB12 zmm24 +%define WB13 zmm25 +%define WB14 zmm26 +%define WB15 zmm27 + +%define WB16 zmm28 +%define WB17 zmm29 +%define WB18 zmm30 +%define WB19 zmm31 + + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 rax + +; +; same as sha256 +; +%macro TRANSPOSE16 18 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define %%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%t0 %17 +%define %%t1 %18 + + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2} + + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0} + + ; use r2 in place of t0 + vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 
f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2} + + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0} + + ; use r6 in place of t0 + vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0} + vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2} + vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0} + vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2} + + vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 113 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1} + vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 114 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2} + vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 115 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3} + vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 112 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0} + + ; use r10 in place of t0 + vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 a1 m0} + vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 a3 m2} + vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 013 012 p9 p8 09 08 p5 p4 05 04 p1 p0 01 00} + vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 015 014 p11 p10 011 010 p7 p6 07 06 p3 p2 03 02} + + vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 013 n13 m13 p9 09 n9 m9 p5 05 n5 m5 p1 01 n1 m1} + vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = 
{p14 014 n14 m14 p10 010 n10 m10 p6 06 n6 m6 p2 02 n2 m2} + vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 015 n15 m15 p11 011 n11 m11 p7 07 n7 m7 p3 03 n3 m3} + vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 012 n12 m12 p8 08 n8 m8 p4 04 n4 m4 p0 00 n0 m0} + + vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0} + vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4} + + vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1} + vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5} + + vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6} + + vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3} + vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7} + + vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0} + vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4} + + vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1} + vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5} + + vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r11, 
%%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2} + vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6} + + vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3} + vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7} + + ;; At this point r8 and r12 can be used as scratch registers + + vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} + vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} + + vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} + vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} + + vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} + vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + + vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} + + vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} + vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} + + vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} + vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + + vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} + vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 
c3 b3 a3} + + vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} + vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} + + vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + +%endmacro + + +%macro ROTATE_ARGS 0 + %xdefine TMP_ D + %xdefine D C + %xdefine C B + %xdefine B A + %xdefine A TMP3 + %xdefine TMP3 TMP_ + + %xdefine TMP2_ H + %xdefine H G + %xdefine G F + %xdefine F E + %xdefine E TMP0 + %xdefine TMP0 TMP2_ +%endmacro + +; +; P() Save in TMP0 +; used TMP1 +%macro P 1 +%define %%A %1 + vprold TMP0,%%A,9 + vprold TMP1,%%A,17 + + vpternlogd TMP0,TMP1,%%A,0x96 + +%endmacro + +; +; P1() Save in TMP0 +; used TMP1 +%macro P1 1 +%define %%A %1 + + vprold TMP0,%%A,15 + vprold TMP1,%%A,23 + + vpternlogd TMP0,TMP1,%%A,0x96 +%endmacro + +; +; FF_16() Save in TMP0 +; +%macro FF_16 3 +%define %%X %1 +%define %%Y %2 +%define %%Z %3 + ; I < 16 return (X ^ Y ^ Z) + vmovups TMP0,%%X + vpternlogd TMP0,%%Y,%%Z,0x96 +%endmacro + + +; +; FF_64() Save in TMP0 +; used TMP1 +%macro FF_64 3 + +%define %%X %1 +%define %%Y %2 +%define %%Z %3 + ; I > 16 return (x & y) | (x & z) | (y & z) + ; Same as (x & y) | (z & (x | y)) + vporq TMP0,%%X,%%Y + vpandq TMP0,%%Z + vpandq TMP1,%%X,%%Y + vporq TMP0,TMP1 +%endmacro + + +; +; GG() Save in TMP0 +; used TMP1 +%macro GG_16 3 +%define %%X %1 +%define %%Y %2 +%define %%Z %3 + ; I < 16 return (x ^ y ^ z) + vmovups TMP0,%%X + vpternlogd TMP0,%%Y,%%Z,0x96 +%endmacro + +%macro GG_64 3 + +%define %%X %1 +%define %%Y %2 +%define %%Z %3 + + ; I > 16 return (x & y) | ((~x) & z) + vpandq TMP0,%%X,%%Y + vpandnd TMP1,%%X,%%Z + vporq TMP0,TMP1 +%endmacro + +;; void sm3_mb_x16_avx512(SM3_MB_ARGS_X16, uint32_t size) +; arg 1 : pointer to input data +; arg 2 : size (in blocks) ;; assumed to be >= 1 +local_func_decl(sm3_mb_x16_avx512) +sm3_mb_x16_avx512: + endbranch + + mov 
rax, rsp + sub rsp, STACK_SPACE + and rsp, ~63 ; align stack to multiple of 64 + mov [rsp + _rsp], rax + + lea TBL, [TABLE] + + ;; Initialize digests + vmovups A, [DIGEST + 0*64] ; mov unsigned + vmovups B, [DIGEST + 1*64] + vmovups C, [DIGEST + 2*64] + vmovups D, [DIGEST + 3*64] + vmovups E, [DIGEST + 4*64] + vmovups F, [DIGEST + 5*64] + vmovups G, [DIGEST + 6*64] + vmovups H, [DIGEST + 7*64] + + xor IDX, IDX + +%assign cur_loop 0 +lloop: + ;; start message expand + ;; Transpose input data + mov inp0, [IN + 0*8] + mov inp1, [IN + 1*8] + mov inp2, [IN + 2*8] + mov inp3, [IN + 3*8] + mov inp4, [IN + 4*8] + mov inp5, [IN + 5*8] + mov inp6, [IN + 6*8] + mov inp7, [IN + 7*8] + + ;; stored B(i) to W(1)...W(15) + ;; in zmm16....zmm31 + + vmovups WB0,[inp0+IDX] + vmovups WB1,[inp1+IDX] + vmovups WB2,[inp2+IDX] + vmovups WB3,[inp3+IDX] + vmovups WB4,[inp4+IDX] + vmovups WB5,[inp5+IDX] + vmovups WB6,[inp6+IDX] + vmovups WB7,[inp7+IDX] + + mov inp0, [IN + 8*8] + mov inp1, [IN + 9*8] + mov inp2, [IN +10*8] + mov inp3, [IN +11*8] + mov inp4, [IN +12*8] + mov inp5, [IN +13*8] + mov inp6, [IN +14*8] + mov inp7, [IN +15*8] + + vmovups WB8, [inp0+IDX] + vmovups WB9, [inp1+IDX] + vmovups WB10,[inp2+IDX] + vmovups WB11,[inp3+IDX] + vmovups WB12,[inp4+IDX] + vmovups WB13,[inp5+IDX] + vmovups WB14,[inp6+IDX] + vmovups WB15,[inp7+IDX] + + vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A + vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B + vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C + vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D + vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E + vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F + vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G + vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H + + add IDX, 64 + + ; flat shuffle + TRANSPOSE16 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, TMP0, TMP1 + + ; little endian to big endian + vmovdqa32 TMP0, [SHUF_MASK] + vpshufb WB0,TMP0 + vpshufb WB1,TMP0 + vpshufb WB2,TMP0 + vpshufb WB3,TMP0 + vpshufb WB4,TMP0 + vpshufb 
WB5,TMP0 + vpshufb WB6,TMP0 + vpshufb WB7,TMP0 + vpshufb WB8,TMP0 + vpshufb WB9,TMP0 + vpshufb WB10,TMP0 + vpshufb WB11,TMP0 + vpshufb WB12,TMP0 + vpshufb WB13,TMP0 + vpshufb WB14,TMP0 + vpshufb WB15,TMP0 + +%assign I 0 +%rep 12 + %assign J I+4 + + ; (A <<< 12) + ; store in TMP0 + vprold TMP0,A,12 + + ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7 + ; (T(j) <<< j) store in TBL + ; SS1 store in TMP2 + vmovdqa32 TMP2, [TBL + (I*64)] + vpaddd TMP2,E + + vpaddd TMP2,TMP0 + vprold TMP2,7 + + ; SS2 = SS1 ^ (A <<< 12) + ; SS2 store in TMP3 + vpxord TMP3,TMP2,TMP0 + + ; TT2 = GG(E,F,G) + H + SS1 + WB(I) + GG_16 E,F,G + vpaddd TMP2,TMP0 + vpaddd TMP2,H + + vpaddd TMP2,APPEND(WB,I) + + ; TT1 = FF(A,B,C) + D + SS2 + W(I) + ; TT1 store in TMP3 + FF_16 A,B,C + vpaddd TMP3,TMP0 + vpaddd TMP3,D + ; W(I) = WB(I) ^ W(I+4) + vpxord TMP0,APPEND(WB,I),APPEND(WB,J) + vpaddd TMP3,TMP0 + + + ; D = C + ; C = B <<< 9 + ; B = A + ; A = TT1 + ; H = G + ; G = F <<< 19 + ; F = E + ; E = P(TT2) + vmovups D,C + vprold B,9 + vmovups C,B + vmovups B,A + vmovups A,TMP3 + vmovups H,G + vprold F,19 + vmovups G,F + vmovups F,E + P TMP2 + vmovups E,TMP0 + + ;vprold B,9 + ;vprold F,19 + ;P TMP2 + ;ROTATE_ARGS + + %assign I (I+1) +%endrep + + +;tmp = WB[j - 16] ^ WB[j - 9] ^ rol32(WB[j - 3], 15); +;WB[j] = P1(tmp) ^ (rol32(WB[j - 13], 7)) ^ WB[j - 6]; + +; round 12-16 here +%rep 4 + %assign J I+4 + + %assign J_3 J-3 + %assign J_16 J-16 + %assign J_9 J-9 + %assign J_13 J-13 + %assign J_6 J-6 + + ; clac WB(I+4) + vprold APPEND(WB,J),APPEND(WB,J_3),15 + vpxord APPEND(WB,J),APPEND(WB,J_16) + vpxord APPEND(WB,J),APPEND(WB,J_9) + + P1 APPEND(WB,J) + + vprold APPEND(WB,J),APPEND(WB,J_13),7 + vpxord APPEND(WB,J),TMP0 + vpxord APPEND(WB,J),APPEND(WB,J_6) + + ; (A <<< 12) + ; store in TMP0 + vprold TMP0,A,12 + + ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7 + ; (T(j) <<< j) store in TBL + ; SS1 store in TMP2 + vmovdqa32 TMP2, [TBL + (I*64)] + vpaddd TMP2,E + + vpaddd TMP2,TMP0 + vprold TMP2,7 + + ; SS2 = SS1 ^ (A 
<<< 12) + ; SS2 store in TMP3 + vpxord TMP3,TMP2,TMP0 + + ; TT2 = GG(E,F,G) + H + SS1 + WB(I) + GG_16 E,F,G + vpaddd TMP2,TMP0 + vpaddd TMP2,H + + vpaddd TMP2,APPEND(WB,I) + + ; TT1 = FF(A,B,C) + D + SS2 + W(I) + ; TT1 store in TMP3 + FF_16 A,B,C + vpaddd TMP3,TMP0 + vpaddd TMP3,D + ; W(I) = WB(I) ^ W(I+4) + vpxord TMP0,APPEND(WB,I),APPEND(WB,J) + vpaddd TMP3,TMP0 + + ; D = C + ; C = B <<< 9 + ; B = A + ; A = TT1 + ; H = G + ; G = F <<< 19 + ; F = E + ; E = P(TT2) + vmovups D,C + vprold B,9 + vmovups C,B + vmovups B,A + vmovups A,TMP3 + vmovups H,G + vprold F,19 + vmovups G,F + vmovups F,E + P TMP2 + vmovups E,TMP0 + + %assign I (I+1) +%endrep + +%rep 48 + %assign J (((I+4) % 20) + 20) + + %assign J_3 ((J-3) % 20) + %assign J_16 ((J-16) % 20) + %assign J_9 ((J-9) % 20) + %assign J_13 ((J-13) % 20) + %assign J_6 ((J-6) % 20) + + %assign I_20 (I % 20) + %assign J (((I+4) % 20)) + + vprold APPEND(WB,J),APPEND(WB,J_3),15 + vpxord APPEND(WB,J),APPEND(WB,J_16) + vpxord APPEND(WB,J),APPEND(WB,J_9) + + P1 APPEND(WB,J) + + vprold APPEND(WB,J),APPEND(WB,J_13),7 + vpxord APPEND(WB,J),TMP0 + vpxord APPEND(WB,J),APPEND(WB,J_6) + + ; (A <<< 12) + ; store in TMP0 + vprold TMP0,A,12 + + ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7 + ; (T(j) <<< j) store in TBL + ; SS1 store in TMP2 + vmovdqa32 TMP2, [TBL + (I*64)] + vpaddd TMP2,E + + vpaddd TMP2,TMP0 + vprold TMP2,7 + + ; SS2 = SS1 ^ (A <<< 12) + ; SS2 store in TMP3 + vpxord TMP3,TMP2,TMP0 + + ; TT2 = GG(E,F,G) + H + SS1 + WB(I) + GG_64 E,F,G + vpaddd TMP2,TMP0 + vpaddd TMP2,H + + vpaddd TMP2,APPEND(WB,I_20) + + ; TT1 = FF(A,B,C) + D + SS2 + W(I) + ; TT1 store in TMP3 + FF_64 A,B,C + vpaddd TMP3,TMP0 + vpaddd TMP3,D + ; W(I) = WB(I) ^ W(I+4) + vpxord TMP0,APPEND(WB,I_20),APPEND(WB,J) + vpaddd TMP3,TMP0 + + ; D = C + ; C = B <<< 9 + ; B = A + ; A = TT1 + ; H = G + ; G = F <<< 19 + ; F = E + ; E = P(TT2) + vmovups D,C + vprold B,9 + vmovups C,B + vmovups B,A + vmovups A,TMP3 + vmovups H,G + vprold F,19 + vmovups G,F + vmovups F,E + 
P TMP2 + vmovups E,TMP0 + + %assign I (I+1) +%endrep + ; Xor old digest + vpxord A, A, [rsp + _DIGEST_SAVE + 64*0] + vpxord B, B, [rsp + _DIGEST_SAVE + 64*1] + vpxord C, C, [rsp + _DIGEST_SAVE + 64*2] + vpxord D, D, [rsp + _DIGEST_SAVE + 64*3] + vpxord E, E, [rsp + _DIGEST_SAVE + 64*4] + vpxord F, F, [rsp + _DIGEST_SAVE + 64*5] + vpxord G, G, [rsp + _DIGEST_SAVE + 64*6] + vpxord H, H, [rsp + _DIGEST_SAVE + 64*7] + + %assign cur_loop cur_loop+1 + sub SIZE, 1 + je last_loop + + jmp lloop + + +last_loop: + +%assign I 0 +%rep 8 + mov inp0, [IN + (2*I)*8] + mov inp1, [IN + (2*I +1)*8] + add inp0, IDX + add inp1, IDX + mov [IN + (2*I)*8], inp0 + mov [IN + (2*I+1)*8], inp1 +%assign I (I+1) +%endrep + ; Write out digest + vmovups [DIGEST + 0*64], A + vmovups [DIGEST + 1*64], B + vmovups [DIGEST + 2*64], C + vmovups [DIGEST + 3*64], D + vmovups [DIGEST + 4*64], E + vmovups [DIGEST + 5*64], F + vmovups [DIGEST + 6*64], G + vmovups [DIGEST + 7*64], H + + + mov rsp, [rsp + _rsp] + ret + + +section .data +align 64 +TABLE: + dq 0x79cc451979cc4519,0x79cc451979cc4519 + dq 0x79cc451979cc4519,0x79cc451979cc4519 + dq 0x79cc451979cc4519,0x79cc451979cc4519 + dq 0x79cc451979cc4519,0x79cc451979cc4519 + dq 0xf3988a32f3988a32,0xf3988a32f3988a32 + dq 0xf3988a32f3988a32,0xf3988a32f3988a32 + dq 0xf3988a32f3988a32,0xf3988a32f3988a32 + dq 0xf3988a32f3988a32,0xf3988a32f3988a32 + dq 0xe7311465e7311465,0xe7311465e7311465 + dq 0xe7311465e7311465,0xe7311465e7311465 + dq 0xe7311465e7311465,0xe7311465e7311465 + dq 0xe7311465e7311465,0xe7311465e7311465 + dq 0xce6228cbce6228cb,0xce6228cbce6228cb + dq 0xce6228cbce6228cb,0xce6228cbce6228cb + dq 0xce6228cbce6228cb,0xce6228cbce6228cb + dq 0xce6228cbce6228cb,0xce6228cbce6228cb + dq 0x9cc451979cc45197,0x9cc451979cc45197 + dq 0x9cc451979cc45197,0x9cc451979cc45197 + dq 0x9cc451979cc45197,0x9cc451979cc45197 + dq 0x9cc451979cc45197,0x9cc451979cc45197 + dq 0x3988a32f3988a32f,0x3988a32f3988a32f + dq 0x3988a32f3988a32f,0x3988a32f3988a32f + dq 
0x3988a32f3988a32f,0x3988a32f3988a32f + dq 0x3988a32f3988a32f,0x3988a32f3988a32f + dq 0x7311465e7311465e,0x7311465e7311465e + dq 0x7311465e7311465e,0x7311465e7311465e + dq 0x7311465e7311465e,0x7311465e7311465e + dq 0x7311465e7311465e,0x7311465e7311465e + dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc + dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc + dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc + dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc + dq 0xcc451979cc451979,0xcc451979cc451979 + dq 0xcc451979cc451979,0xcc451979cc451979 + dq 0xcc451979cc451979,0xcc451979cc451979 + dq 0xcc451979cc451979,0xcc451979cc451979 + dq 0x988a32f3988a32f3,0x988a32f3988a32f3 + dq 0x988a32f3988a32f3,0x988a32f3988a32f3 + dq 0x988a32f3988a32f3,0x988a32f3988a32f3 + dq 0x988a32f3988a32f3,0x988a32f3988a32f3 + dq 0x311465e7311465e7,0x311465e7311465e7 + dq 0x311465e7311465e7,0x311465e7311465e7 + dq 0x311465e7311465e7,0x311465e7311465e7 + dq 0x311465e7311465e7,0x311465e7311465e7 + dq 0x6228cbce6228cbce,0x6228cbce6228cbce + dq 0x6228cbce6228cbce,0x6228cbce6228cbce + dq 0x6228cbce6228cbce,0x6228cbce6228cbce + dq 0x6228cbce6228cbce,0x6228cbce6228cbce + dq 0xc451979cc451979c,0xc451979cc451979c + dq 0xc451979cc451979c,0xc451979cc451979c + dq 0xc451979cc451979c,0xc451979cc451979c + dq 0xc451979cc451979c,0xc451979cc451979c + dq 0x88a32f3988a32f39,0x88a32f3988a32f39 + dq 0x88a32f3988a32f39,0x88a32f3988a32f39 + dq 0x88a32f3988a32f39,0x88a32f3988a32f39 + dq 0x88a32f3988a32f39,0x88a32f3988a32f39 + dq 0x11465e7311465e73,0x11465e7311465e73 + dq 0x11465e7311465e73,0x11465e7311465e73 + dq 0x11465e7311465e73,0x11465e7311465e73 + dq 0x11465e7311465e73,0x11465e7311465e73 + dq 0x228cbce6228cbce6,0x228cbce6228cbce6 + dq 0x228cbce6228cbce6,0x228cbce6228cbce6 + dq 0x228cbce6228cbce6,0x228cbce6228cbce6 + dq 0x228cbce6228cbce6,0x228cbce6228cbce6 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 
0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 
0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a + dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a + dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a + dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a + dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 + dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 + dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 + dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 + dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 + dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 + dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 + dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 + dq 0xd43cec53d43cec53,0xd43cec53d43cec53 + dq 0xd43cec53d43cec53,0xd43cec53d43cec53 + dq 0xd43cec53d43cec53,0xd43cec53d43cec53 + dq 0xd43cec53d43cec53,0xd43cec53d43cec53 + dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 + dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 + dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 + dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 + dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f + dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f + dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f + dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f + dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e + dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e + dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e + dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e + dq 0x43cec53d43cec53d,0x43cec53d43cec53d + dq 0x43cec53d43cec53d,0x43cec53d43cec53d + dq 0x43cec53d43cec53d,0x43cec53d43cec53d + dq 0x43cec53d43cec53d,0x43cec53d43cec53d + dq 
0x879d8a7a879d8a7a,0x879d8a7a879d8a7a + dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a + dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a + dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a + dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 + dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 + dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 + dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 + dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea + dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea + dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea + dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea + dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 + dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 + dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 + dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 + dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 + dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 + dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 + dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 + dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 + dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 + dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 + dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 + dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 + dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 + dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 + dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 + dq 0xcec53d43cec53d43,0xcec53d43cec53d43 + dq 0xcec53d43cec53d43,0xcec53d43cec53d43 + dq 0xcec53d43cec53d43,0xcec53d43cec53d43 + dq 0xcec53d43cec53d43,0xcec53d43cec53d43 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 
0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 
0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + + + +PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000 + dq 0x0000000000000001 + dq 0x0000000000000008 + dq 0x0000000000000009 + dq 0x0000000000000004 + dq 0x0000000000000005 + dq 0x000000000000000C + dq 0x000000000000000D + +PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002 + dq 0x0000000000000003 + dq 0x000000000000000A + dq 0x000000000000000B + dq 0x0000000000000006 + dq 0x0000000000000007 + dq 0x000000000000000E + dq 0x000000000000000F + +SHUF_MASK: dq 0x0405060700010203,0x0c0d0e0f08090a0b + dq 0x0405060700010203,0x0c0d0e0f08090a0b + dq 0x0405060700010203,0x0c0d0e0f08090a0b + dq 0x0405060700010203,0x0c0d0e0f08090a0b + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sm3_mb_x16_avx512 +no_sm3_mb_x16_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm new file mode 100644 index 000000000..0c2c9cdee --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm @@ -0,0 +1,711 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sm3_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute oct SM3 using SSE-256 / AVX2 +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rsi, rdi, r9-r15; eax;ymm0-15 +;; Windows clobbers: rax rdx rsi rdi r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp r8 +;; +;; Linux clobbers: rax rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp r8 +;; +;; clobbers ymm0-15 + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux definitions + %define arg1 rdi + %define arg2 rsi + %define reg3 rcx + %define reg4 rdx +%else + ; Windows definitions + %define arg1 rcx + %define arg2 rdx + %define reg3 rsi + %define reg4 rdi +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 +%define SIZE INP_SIZE ; rsi + +%define IDX rax +%define TBL 
reg3 + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 reg4 + +%define APPEND(a,b) a %+ b + +%define WB0 ymm0 +%define WB1 ymm1 +%define WB2 ymm2 +%define WB3 ymm3 +%define WB4 ymm4 +%define WB5 ymm5 +%define WB6 ymm6 +%define WB7 ymm7 +%define WB8 ymm8 +%define WB9 ymm9 +%define WB10 ymm10 +%define WB11 ymm11 +%define WB12 ymm12 +%define WB13 ymm13 +%define WB14 ymm14 +%define WB15 ymm15 + +%define WBTMP0 ymm8 +%define WBTMP1 ymm9 + +%define WBTMP2 ymm0 +%define WBTMP3 ymm1 + +%define A ymm0 +%define B ymm1 +%define C ymm2 +%define D ymm3 +%define E ymm4 +%define F ymm5 +%define G ymm6 +%define H ymm7 + +%define TMP0 ymm8 +%define TMP1 ymm9 +%define TMP2 ymm10 + +; W(j) = WB(j) + WB(j+4) +; Keep WB(j) - W(j+4) to reduce momory read +%define Wj0 ymm11 +%define Wj1 ymm12 +%define Wj2 ymm13 +%define Wj3 ymm14 +%define Wj4 ymm15 + + +%define SZ8 8*SM3_DIGEST_WORD_SIZE ; Size of one vector register +%define PTR_SZ 8 +%define SM3_DIGEST_WORD_SIZE 4 +%define MAX_SM3_LANES 8 +%define NUM_SM3_DIGEST_WORDS 8 +%define SM3_DIGEST_ROW_SIZE (MAX_SM3_LANES * SM3_DIGEST_WORD_SIZE) + +; Define stack usage + +;; Assume stack aligned to 32 bytes before call +;; Therefore FRAMESZ mod 32 must be 32-8 = 24 +struc stack_frame + .data resb 16*SZ8 + .digest resb 8*SZ8 + .wbtmp resb 69*SZ8 + .rsp resb 8 +endstruc +%define FRAMESZ stack_frame_size +%define _DIGEST stack_frame.digest +%define _WBTMP stack_frame.wbtmp +%define _RSP_SAVE stack_frame.rsp + +%define YTMP0 rsp + _WBTMP + 0*SZ8 +%define YTMP1 rsp + _WBTMP + 1*SZ8 +%define YTMP2 rsp + _WBTMP + 2*SZ8 +%define YTMP3 rsp + _WBTMP + 3*SZ8 +%define YTMP4 rsp + _WBTMP + 4*SZ8 + +%define YTMPI rsp + _WBTMP + I*SZ8 +%define YTMPI_1 rsp + _WBTMP + (I - 1)*SZ8 +%define YTMPI_2 rsp + _WBTMP + (I - 2)*SZ8 +%define YTMPI_4 rsp + _WBTMP + (I - 4)*SZ8 +%define YTMPI5 rsp + _WBTMP + (I + 5)*SZ8 + + +%define VMOVPS vmovups + +;;;;;;;; +; same as sha256 
+;;;;;;;; +%macro TRANSPOSE8 10 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%t0 %9 +%define %%t1 %10 + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0} + + ; use r2 in place of t0 + ; process bottom half (r4..r7) {e...h} + vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2} + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0} + + vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6 + vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2 + vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5 + vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1 + vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7 + vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3 + vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4 + vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0 +%endmacro + +%macro ROTATE_W 0 + + %xdefine TMP_ Wj0 + %xdefine Wj0 Wj1 + %xdefine Wj1 Wj2 + %xdefine Wj2 Wj3 + %xdefine Wj3 Wj4 + + %xdefine Wj4 TMP_ + +%endmacro + +; ROTATE A,B,C,D +%macro ROTATE_ARGS_AD 0 + + %xdefine TMP_ D + %xdefine D C + %xdefine C B + 
%xdefine B A + %xdefine A TMP2 + %xdefine TMP2 TMP_ + +%endmacro + +%macro ROTATE_ARGS_EH 0 + + %xdefine TMP_ H + %xdefine H G + %xdefine G F + %xdefine F E + %xdefine E TMP0 + %xdefine TMP0 TMP_ + +%endmacro + +%macro ROLD 3 + +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpslld %%tmp, %%reg, %%imm + vpsrld %%reg, %%reg, (32-(%%imm)) + vpor %%reg, %%reg, %%tmp + +%endmacro + +%macro ROLD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpslld %%tmp, %%src, %%imm + vpsrld %%reg, %%src, (32-(%%imm)) + vpor %%reg, %%reg, %%tmp +%endmacro + +;; void sm3_x8_avx2(SM3_ARGS *args, uint64_t bytes); +;; arg 1 : STATE : pointer to input data +;; arg 2 : INP_SIZE : size of input in blocks +mk_global sm3_mb_x8_avx2,function,internal +align 16 +sm3_mb_x8_avx2: + endbranch + ; general registers preserved in outer calling routine + ; outer calling routine saves all the YMM registers + + ; save rsp, allocate 32-byte aligned for local variables + mov IDX, rsp + sub rsp, FRAMESZ + and rsp, ~31 + mov [rsp + _RSP_SAVE], IDX + + lea TBL,[TABLE] + + ;; load the address of each of the 8 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ] + mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ] + mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ] + mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ] + mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ] + mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ] + mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ] + mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ] + + xor IDX, IDX + +%assign cur_loop 0 +lloop: + + ; + ; Pre calculate the WB 0..68 an W 0..64 + ; It will better than calculate WB/W in round method + ; + ; ps : SHA256(AVX2) calculate WB/W in round method + ; + ; Pre calculation memory io time: + ; read : 68 + 3 * 52(read WB) + ; write : 52(write WB17..68) + ; Round method calculation memory io time: + ; read : 48 * 6(read 6 number of WB each round) + ; write : 52 + 64(same as upper) 
+ ; + VMOVPS WB0,[inp0+IDX] + VMOVPS WB1,[inp1+IDX] + VMOVPS WB2,[inp2+IDX] + VMOVPS WB3,[inp3+IDX] + VMOVPS WB4,[inp4+IDX] + VMOVPS WB5,[inp5+IDX] + VMOVPS WB6,[inp6+IDX] + VMOVPS WB7,[inp7+IDX] + + TRANSPOSE8 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WBTMP0, WBTMP1 + vmovdqa WBTMP0, [SHUF_MASK] + vpshufb WB0,WBTMP0 + vpshufb WB1,WBTMP0 + vpshufb WB2,WBTMP0 + vpshufb WB3,WBTMP0 + vpshufb WB4,WBTMP0 + vpshufb WB5,WBTMP0 + vpshufb WB6,WBTMP0 + vpshufb WB7,WBTMP0 + + vmovdqa [YTMP0], WB0 + vmovdqa [YTMP1], WB1 + + VMOVPS WB8,[inp0+IDX + 32] + VMOVPS WB9,[inp1+IDX + 32] + VMOVPS WB10,[inp2+IDX + 32] + VMOVPS WB11,[inp3+IDX + 32] + VMOVPS WB12,[inp4+IDX + 32] + VMOVPS WB13,[inp5+IDX + 32] + VMOVPS WB14,[inp6+IDX + 32] + VMOVPS WB15,[inp7+IDX + 32] + + TRANSPOSE8 WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, WBTMP2, WBTMP3 + vmovdqa WBTMP2, [SHUF_MASK] + vpshufb WB8,WBTMP2 + vpshufb WB9,WBTMP2 + vpshufb WB10,WBTMP2 + vpshufb WB11,WBTMP2 + vpshufb WB12,WBTMP2 + vpshufb WB13,WBTMP2 + vpshufb WB14,WBTMP2 + vpshufb WB15,WBTMP2 + +; WB0 WB1 already saved +%assign I 2 +%rep 14 + vmovdqa [YTMPI], APPEND(WB,I) +%assign I (I+1) +%endrep + + vmovdqa WB0 , [YTMP0] + vmovdqa WB1 , [YTMP1] + +; Calculate WB 16...67 +%rep 52 + %assign J (I % 16) + %assign J_1 ((I-1) % 16) ;tmp to use + %assign J_2 ((I-2) % 16) ;tmp to use + %assign J_3 ((I-3) % 16) + %assign J_4 ((I-4) % 16) ;tmp to use + %assign J_9 ((I-9) % 16) + %assign J_13 ((I-13) % 16) + %assign J_6 ((I-6) % 16) + + ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J_3) + vpxor APPEND(WB,J),APPEND(WB,J_2) + vpxor APPEND(WB,J),APPEND(WB,J_9) + + ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J) + ROLD_nd APPEND(WB,J_1),23,APPEND(WB,J_4),APPEND(WB,J) + vpxor APPEND(WB,J),APPEND(WB,J_2) + vpxor APPEND(WB,J),APPEND(WB,J_1) + + ROLD_nd APPEND(WB,J_2),7,APPEND(WB,J_1),APPEND(WB,J_13) + vpxor APPEND(WB,J),APPEND(WB,J_2) + vpxor APPEND(WB,J),APPEND(WB,J_6) + + vmovdqa [YTMPI], APPEND(WB,J) + + vmovdqa APPEND(WB,J_1), [YTMPI_1] + 
vmovdqa APPEND(WB,J_2), [YTMPI_2] + vmovdqa APPEND(WB,J_4), [YTMPI_4] + + %assign I (I+1) +%endrep + + add IDX, 4*4*4 + + ; Every round need load A-H + ; Because we pre calculate the WB + vmovdqu A,[STATE + 0*SM3_DIGEST_ROW_SIZE] + vmovdqu B,[STATE + 1*SM3_DIGEST_ROW_SIZE] + vmovdqu C,[STATE + 2*SM3_DIGEST_ROW_SIZE] + vmovdqu D,[STATE + 3*SM3_DIGEST_ROW_SIZE] + vmovdqu E,[STATE + 4*SM3_DIGEST_ROW_SIZE] + vmovdqu F,[STATE + 5*SM3_DIGEST_ROW_SIZE] + vmovdqu G,[STATE + 6*SM3_DIGEST_ROW_SIZE] + vmovdqu H,[STATE + 7*SM3_DIGEST_ROW_SIZE] + + vmovdqa Wj0, [YTMP0] + vmovdqa Wj1, [YTMP1] + vmovdqa Wj2, [YTMP2] + vmovdqa Wj3, [YTMP3] + vmovdqa Wj4, [YTMP4] + + +%assign I 0 +%rep 16 + + ; SS1 - TMP1 + ROLD_nd TMP0,12,TMP1,A + vmovdqa TMP1, [TBL + (I*32)] + vpaddd TMP1,E + vpaddd TMP1,TMP0 + ROLD TMP1,7,TMP2 + + ; SS2 - TMP2 + vpxor TMP2,TMP1,TMP0 + + ; TT1 + vpxor TMP0,A,B + vpxor TMP0,C + vpaddd TMP2,TMP0 + vpaddd TMP2,D + vpxor TMP0,Wj0,Wj4 + vpaddd TMP2,TMP0 + + ROLD B,9,TMP0 + + ; Rotate a,b,c,d first + ; after P0(TT2) , Wj0 will be relase + ROTATE_ARGS_AD + + ; P0(TT2) + vpxor TMP0,E,F + vpxor TMP0,G + vpaddd TMP0,H + vpaddd TMP0,TMP1 + vpaddd TMP0,Wj0 + + ROLD_nd TMP1,9,TMP2,TMP0 + ROLD_nd Wj0,17,TMP2,TMP0 + + vpxor TMP0,TMP1 + vpxor TMP0,Wj0 + + ROLD F,19,TMP2 + + ROTATE_ARGS_EH + + ROTATE_W + + vmovdqa Wj4, [YTMPI5] + %assign I (I+1) +%endrep + +%rep 48 + ; SS1 - TMP1 + ROLD_nd TMP0,12,TMP1,A + vmovdqa TMP1, [TBL + (I*32)] + vpaddd TMP1,E + vpaddd TMP1,TMP0 + ROLD TMP1,7,TMP2 + + ; SS2 - TMP2 + vpxor TMP2,TMP1,TMP0 + + ; SS2 + D first + ; D will be release + ; FF16/GG16 diff with FF64/GG64 + ; So the register which keep D should be release before calculate TT1 + vpaddd TMP2,D + + ; TT1 + vpor TMP0,A,B + vpand TMP0,C + vpand D,A,B + vpor TMP0,D + + vpaddd TMP2,TMP0 + vpxor TMP0,Wj0,Wj4 + vpaddd TMP2,TMP0 + + ROLD B,9,TMP0 + + ROTATE_ARGS_AD + + ; P0(TT2) + vpaddd TMP1,H + vpaddd TMP1,Wj0 + + vpand TMP0,E,F + vpandn Wj0,E,G + vpor TMP0,Wj0 + + vpaddd TMP0,TMP1 + + 
ROLD_nd TMP1,9,TMP2,TMP0 + ROLD_nd Wj0,17,TMP2,TMP0 + + vpxor TMP0,TMP1 + vpxor TMP0,Wj0 + + ROLD F,19,TMP2 + + ROTATE_ARGS_EH + + ROTATE_W + vmovdqa Wj4, [YTMPI5] + %assign I (I+1) +%endrep + + vpxor A, A, [STATE + 0*SM3_DIGEST_ROW_SIZE] + vpxor B, B, [STATE + 1*SM3_DIGEST_ROW_SIZE] + vpxor C, C, [STATE + 2*SM3_DIGEST_ROW_SIZE] + vpxor D, D, [STATE + 3*SM3_DIGEST_ROW_SIZE] + vpxor E, E, [STATE + 4*SM3_DIGEST_ROW_SIZE] + vpxor F, F, [STATE + 5*SM3_DIGEST_ROW_SIZE] + vpxor G, G, [STATE + 6*SM3_DIGEST_ROW_SIZE] + vpxor H, H, [STATE + 7*SM3_DIGEST_ROW_SIZE] + + ; Write back to memory (state object) the transposed digest + vmovdqu [STATE + 0*SM3_DIGEST_ROW_SIZE],A + vmovdqu [STATE + 1*SM3_DIGEST_ROW_SIZE],B + vmovdqu [STATE + 2*SM3_DIGEST_ROW_SIZE],C + vmovdqu [STATE + 3*SM3_DIGEST_ROW_SIZE],D + vmovdqu [STATE + 4*SM3_DIGEST_ROW_SIZE],E + vmovdqu [STATE + 5*SM3_DIGEST_ROW_SIZE],F + vmovdqu [STATE + 6*SM3_DIGEST_ROW_SIZE],G + vmovdqu [STATE + 7*SM3_DIGEST_ROW_SIZE],H + + sub SIZE, 1 + je last_loop + jmp lloop + +last_loop: + + + ; update input pointers + add inp0, IDX + mov [STATE + _args_data_ptr + 0*8], inp0 + add inp1, IDX + mov [STATE + _args_data_ptr + 1*8], inp1 + add inp2, IDX + mov [STATE + _args_data_ptr + 2*8], inp2 + add inp3, IDX + mov [STATE + _args_data_ptr + 3*8], inp3 + add inp4, IDX + mov [STATE + _args_data_ptr + 4*8], inp4 + add inp5, IDX + mov [STATE + _args_data_ptr + 5*8], inp5 + add inp6, IDX + mov [STATE + _args_data_ptr + 6*8], inp6 + add inp7, IDX + mov [STATE + _args_data_ptr + 7*8], inp7 + + ;;;;;;;;;;;;;;;; + ;; Postamble + mov rsp, [rsp + _RSP_SAVE] + ret + + +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +align 64 +global TABLE +TABLE: + dq 0x79cc451979cc4519,0x79cc451979cc4519 + dq 0x79cc451979cc4519,0x79cc451979cc4519 + dq 0xf3988a32f3988a32,0xf3988a32f3988a32 + dq 0xf3988a32f3988a32,0xf3988a32f3988a32 + dq 0xe7311465e7311465,0xe7311465e7311465 + dq 
0xe7311465e7311465,0xe7311465e7311465 + dq 0xce6228cbce6228cb,0xce6228cbce6228cb + dq 0xce6228cbce6228cb,0xce6228cbce6228cb + dq 0x9cc451979cc45197,0x9cc451979cc45197 + dq 0x9cc451979cc45197,0x9cc451979cc45197 + dq 0x3988a32f3988a32f,0x3988a32f3988a32f + dq 0x3988a32f3988a32f,0x3988a32f3988a32f + dq 0x7311465e7311465e,0x7311465e7311465e + dq 0x7311465e7311465e,0x7311465e7311465e + dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc + dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc + dq 0xcc451979cc451979,0xcc451979cc451979 + dq 0xcc451979cc451979,0xcc451979cc451979 + dq 0x988a32f3988a32f3,0x988a32f3988a32f3 + dq 0x988a32f3988a32f3,0x988a32f3988a32f3 + dq 0x311465e7311465e7,0x311465e7311465e7 + dq 0x311465e7311465e7,0x311465e7311465e7 + dq 0x6228cbce6228cbce,0x6228cbce6228cbce + dq 0x6228cbce6228cbce,0x6228cbce6228cbce + dq 0xc451979cc451979c,0xc451979cc451979c + dq 0xc451979cc451979c,0xc451979cc451979c + dq 0x88a32f3988a32f39,0x88a32f3988a32f39 + dq 0x88a32f3988a32f39,0x88a32f3988a32f39 + dq 0x11465e7311465e73,0x11465e7311465e73 + dq 0x11465e7311465e73,0x11465e7311465e73 + dq 0x228cbce6228cbce6,0x228cbce6228cbce6 + dq 0x228cbce6228cbce6,0x228cbce6228cbce6 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 
0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a + dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a + dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 + dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 + dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 + dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 + dq 0xd43cec53d43cec53,0xd43cec53d43cec53 + dq 0xd43cec53d43cec53,0xd43cec53d43cec53 + dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 + dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 + dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f + dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f + dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e + dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e + dq 0x43cec53d43cec53d,0x43cec53d43cec53d + dq 0x43cec53d43cec53d,0x43cec53d43cec53d + dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a + dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a + dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 + dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 + dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea + dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea + dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 + dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 + dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 + dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 + dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 + dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 + dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 + dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 + dq 0xcec53d43cec53d43,0xcec53d43cec53d43 + dq 0xcec53d43cec53d43,0xcec53d43cec53d43 + dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 
0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xec53d43cec53d43c,0xec53d43cec53d43c + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0x53d43cec53d43cec,0x53d43cec53d43cec + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 + +SHUF_MASK: dq 0x0405060700010203,0x0c0d0e0f08090a0b + dq 0x0405060700010203,0x0c0d0e0f08090a0b diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm new file mode 100644 index 000000000..482876539 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm @@ -0,0 +1,81 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" +%include "multibinary.asm" +default rel +[bits 64] + +extern sm3_ctx_mgr_init_base +extern sm3_ctx_mgr_submit_base +extern sm3_ctx_mgr_flush_base + +extern sm3_ctx_mgr_init_avx2 +extern sm3_ctx_mgr_submit_avx2 +extern sm3_ctx_mgr_flush_avx2 + +%ifdef HAVE_AS_KNOWS_AVX512 + extern sm3_ctx_mgr_init_avx512 + extern sm3_ctx_mgr_submit_avx512 + extern sm3_ctx_mgr_flush_avx512 +%endif + +;;; *_mbinit are initial values for *_dispatched; is updated on first call. +;;; Therefore, *_dispatch_init is only executed on first call. + +; Initialise symbols +mbin_interface sm3_ctx_mgr_init +mbin_interface sm3_ctx_mgr_submit +mbin_interface sm3_ctx_mgr_flush + +;; have not imlement see/avx yet +%ifdef HAVE_AS_KNOWS_AVX512 + mbin_dispatch_init6 sm3_ctx_mgr_init, sm3_ctx_mgr_init_base, \ + sm3_ctx_mgr_init_base, sm3_ctx_mgr_init_base, sm3_ctx_mgr_init_avx2, \ + sm3_ctx_mgr_init_avx512 + mbin_dispatch_init6 sm3_ctx_mgr_submit, sm3_ctx_mgr_submit_base, \ + sm3_ctx_mgr_submit_base, sm3_ctx_mgr_submit_base, sm3_ctx_mgr_submit_avx2, \ + sm3_ctx_mgr_submit_avx512 + mbin_dispatch_init6 sm3_ctx_mgr_flush, sm3_ctx_mgr_flush_base, \ + sm3_ctx_mgr_flush_base, sm3_ctx_mgr_flush_base, sm3_ctx_mgr_flush_avx2, \ + sm3_ctx_mgr_flush_avx512 +%else + mbin_dispatch_init sm3_ctx_mgr_init, sm3_ctx_mgr_init_base, \ + sm3_ctx_mgr_init_base,sm3_ctx_mgr_init_avx2 + mbin_dispatch_init sm3_ctx_mgr_submit, sm3_ctx_mgr_submit_base, \ + sm3_ctx_mgr_submit_base,sm3_ctx_mgr_submit_avx2 + mbin_dispatch_init sm3_ctx_mgr_flush, sm3_ctx_mgr_flush_base, \ + sm3_ctx_mgr_flush_base,sm3_ctx_mgr_flush_avx2 +%endif + +;;; func core, ver, snum +slversion sm3_ctx_mgr_init, 00, 00, 2300 +slversion sm3_ctx_mgr_submit, 00, 00, 2301 +slversion sm3_ctx_mgr_flush, 00, 00, 2302 + diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c new file mode 100644 index 
000000000..be56350b3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c @@ -0,0 +1,207 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#define ISAL_UNIT_TEST +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "sm3_mb.h" +#include "endian_helper.h" + +typedef uint32_t digest_sm3[SM3_DIGEST_NWORDS]; + +#define MSGS 2 +#define NUM_JOBS 1000 + +#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS + +static uint8_t msg1[] = "abc"; +static uint8_t msg2[] = "abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd"; + +/* small endian */ +static digest_sm3 exp_result_digest1 = { 0x66c7f0f4, 0x62eeedd9, 0xd1f2d46b, 0xdc10e4e2, + 0x4167c487, 0x5cf2f7a2, 0x297da02b, 0x8f4ba8e0 +}; + +/* small endian */ +static digest_sm3 exp_result_digest2 = { 0xdebe9ff9, 0x2275b8a1, 0x38604889, 0xc18e5a4d, + 0x6fdb70e5, 0x387e5765, 0x293dcba3, 0x9c0c5732 +}; + +static uint8_t *msgs[MSGS] = { msg1, msg2 }; + +static uint32_t *exp_result_digest[MSGS] = { + exp_result_digest1, exp_result_digest2 +}; + +int main(void) +{ + SM3_HASH_CTX_MGR *mgr = NULL; + SM3_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL; + uint32_t i, j, k, t, checked = 0; + uint32_t *good; + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sm3_ctx_mgr_init(mgr); + + // Init contexts before first use + for (i = 0; i < MSGS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + for (i = 0; i < MSGS; i++) { + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = exp_result_digest[t]; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], + byteswap32(good[j])); + return -1; + } + } + + if (ctx->error) { + 
printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + + } + } + + while (1) { + ctx = sm3_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = exp_result_digest[t]; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], + byteswap32(good[j])); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + // do larger test in pseudo-random order + + // Init contexts before first use + for (i = 0; i < NUM_JOBS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + checked = 0; + for (i = 0; i < NUM_JOBS; i++) { + j = PSEUDO_RANDOM_NUM(i); + ctx = sm3_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE); + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = exp_result_digest[k]; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], + byteswap32(good[j])); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. 
Error code: %d", ctx->error); + return -1; + } + } + } + while (1) { + ctx = sm3_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = exp_result_digest[k]; + checked++; + for (j = 0; j < SM3_DIGEST_NWORDS; j++) { + if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], + byteswap32(good[j])); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + if (checked != NUM_JOBS) { + printf("only tested %d rather than %d\n", checked, NUM_JOBS); + return -1; + } + + printf(" multibinary_sm3 test: Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c new file mode 100644 index 000000000..4c0c54436 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c @@ -0,0 +1,45 @@ +/********************************************************************** + Copyright(c) 2011-2019 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <openssl/evp.h> + +void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest) +{ + EVP_MD_CTX *md_ctx; + const EVP_MD *md; + unsigned int md_len; + + md = EVP_sm3(); + md_ctx = EVP_MD_CTX_new(); + EVP_DigestInit_ex(md_ctx, md, NULL); + EVP_DigestUpdate(md_ctx, buf, length); + EVP_DigestFinal_ex(md_ctx, digest, &md_len); + EVP_MD_CTX_free(md_ctx); +} |