diff options
Diffstat (limited to '')
-rw-r--r-- | src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S | 576 |
1 files changed, 576 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S
new file mode 100644
index 000000000..975a07c7a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S
@@ -0,0 +1,576 @@
+/**********************************************************************
+  Copyright(c) 2020 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+	.arch armv8.2-a
+	.text
+	.align	2
+	.p2align 3,,7
+
+.macro	declare_var_vector_reg name:req,reg:req
+	q\name\()	.req	q\reg
+	v\name\()	.req	v\reg
+	s\name\()	.req	s\reg
+.endm
+
+	job0		.req	x0
+	job1		.req	x1
+	job2		.req	x2
+	job3		.req	x3
+	len		.req	x4
+
+	job0_data	.req	x5
+	job1_data	.req	x6
+	job2_data	.req	x7
+	job3_data	.req	x9
+
+	job0_digest	.req	x0
+	job1_digest	.req	x1
+	job2_digest	.req	x2
+	job3_digest	.req	x3
+	job0_tmp	.req	x10
+	job1_tmp	.req	x11
+	job2_tmp	.req	x12
+	job3_tmp	.req	x13
+	const_adr	.req	x14
+
+
+	declare_var_vector_reg	msg0,0
+	declare_var_vector_reg	msg1,1
+	declare_var_vector_reg	msg2,2
+	declare_var_vector_reg	msg3,3
+	declare_var_vector_reg	msg4,4
+	declare_var_vector_reg	msg5,5
+	declare_var_vector_reg	msg6,6
+	declare_var_vector_reg	msg7,7
+	declare_var_vector_reg	msg8,8
+	declare_var_vector_reg	msg9,9
+	declare_var_vector_reg	msg10,10
+	declare_var_vector_reg	msg11,11
+	declare_var_vector_reg	msg12,12
+	declare_var_vector_reg	msg13,13
+	declare_var_vector_reg	msg14,14
+	declare_var_vector_reg	msg15,15
+	declare_var_vector_reg	msg16,16
+
+
+	declare_var_vector_reg	dig_A,24
+	declare_var_vector_reg	dig_B,25
+	declare_var_vector_reg	dig_C,26
+	declare_var_vector_reg	dig_D,27
+	declare_var_vector_reg	dig_E,28
+	declare_var_vector_reg	dig_F,29
+	declare_var_vector_reg	dig_G,30
+	declare_var_vector_reg	dig_H,31
+
+	declare_var_vector_reg	TT1,17
+	declare_var_vector_reg	TT2,18
+	declare_var_vector_reg	SS1,19
+	declare_var_vector_reg	SS2,20
+	declare_var_vector_reg	tmp0,21
+	declare_var_vector_reg	word_pair,23
+	declare_var_vector_reg	Tj,22
+
+
+.macro rol32 target:req,reg:req,bit:req
+	ushr	v\target\().4s,v\reg\().4s,32 - \bit
+	sli	v\target\().4s,v\reg\().4s,\bit
+.endm
+
+// round 0-11
+.macro sm3_round_0 round:req,wp:req
+
+	ushr	vtmp0.4s,vdig_A.4s,32 - 12
+
+	add	vSS1.4s,vdig_E.4s,vTj.4s
+	sli	vtmp0.4s,vdig_A.4s,12
+	rev32	vmsg\round\().16b,vmsg\round\().16b
+	rev32	vmsg\wp\().16b,vmsg\wp\().16b
+	add	vTT1.4s,vSS1.4s,vtmp0.4s	//SS1 Done
+	rol32	SS1,TT1,7
+	eor	vSS2.16b,vSS1.16b,vtmp0.16b	//SS2 Done
+	eor	vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
+
+	eor	vTT1.16b,vdig_A.16b,vdig_B.16b
+	eor	vTT2.16b,vdig_E.16b,vdig_F.16b
+	eor	vTT1.16b,vTT1.16b,vdig_C.16b
+	eor	vTT2.16b,vTT2.16b,vdig_G.16b
+
+	add	vSS1.4s,vSS1.4s,vmsg\round\().4s
+	add	vSS2.4s,vSS2.4s,vword_pair.4s
+	add	vTT1.4s,vTT1.4s,vdig_D.4s
+	add	vTT2.4s,vTT2.4s,vdig_H.4s
+	ushr	vtmp0.4s,vTj.4s,32-1
+	add	vTT1.4s,vTT1.4s,vSS2.4s	//TT1 Done
+	sli	vtmp0.4s,vTj.4s,1
+	add	vTT2.4s,vTT2.4s,vSS1.4s	//TT2 Done
+	mov	vTj.16b,vtmp0.16b
+	//D=C
+	mov	vdig_D.16b,vdig_C.16b
+	//C = ROTL32(B, 9);
+	ushr	vdig_C.4s,vdig_B.4s,32 - 9
+	sli	vdig_C.4s,vdig_B.4s,9
+	//B=A
+	mov	vdig_B.16b,vdig_A.16b
+	//A=TT1
+	mov	vdig_A.16b,vTT1.16b
+	// H=G
+	mov	vdig_H.16b,vdig_G.16b
+	//G = ROTL32(F,19)
+	rol32	dig_G,dig_F,19
+	//F = E
+	mov	vdig_F.16b,vdig_E.16b
+	// E=Target, TT2=src, TT1,SS1,SS2 is free
+	// E = P0(TT2);
+	ushr	vSS2.4s, vTT2.4s, 32 - 9
+	ushr	vSS1.4s, vTT2.4s, 32 - 17
+	sli	vSS2.4s, vTT2.4s, 9
+	sli	vSS1.4s, vTT2.4s, 17
+	eor	vdig_E.16b, vTT2.16b, vSS1.16b
+	eor	vdig_E.16b, vdig_E.16b, vSS2.16b
+
+.endm
+
+
+.macro sm3_round_4 round:req,wp:req
+
+	ushr	vtmp0.4s,vdig_A.4s,32 - 12
+	add	vSS1.4s,vdig_E.4s,vTj.4s
+	sli	vtmp0.4s,vdig_A.4s,12
+	rev32	vmsg\wp\().16b,vmsg\wp\().16b
+	add	vTT1.4s,vSS1.4s,vtmp0.4s	//SS1 Done
+	rol32	SS1,TT1,7
+	eor	vSS2.16b,vSS1.16b,vtmp0.16b	//SS2 Done
+	eor	vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
+
+	eor	vTT1.16b,vdig_A.16b,vdig_B.16b
+	eor	vTT2.16b,vdig_E.16b,vdig_F.16b
+	eor	vTT1.16b,vTT1.16b,vdig_C.16b
+	eor	vTT2.16b,vTT2.16b,vdig_G.16b
+	add	vSS1.4s,vSS1.4s,vmsg\round\().4s
+	add	vSS2.4s,vSS2.4s,vword_pair.4s
+	add	vTT1.4s,vTT1.4s,vdig_D.4s
+	add	vTT2.4s,vTT2.4s,vdig_H.4s
+	ushr	vtmp0.4s,vTj.4s,32-1
+	add	vTT1.4s,vTT1.4s,vSS2.4s	//TT1 Done
+	sli	vtmp0.4s,vTj.4s,1
+	add	vTT2.4s,vTT2.4s,vSS1.4s	//TT2 Done
+	mov	vTj.16b,vtmp0.16b
+	//D=C
+	mov	vdig_D.16b,vdig_C.16b
+	//C = ROTL32(B, 9);
+	ushr	vdig_C.4s,vdig_B.4s,32 - 9
+	sli	vdig_C.4s,vdig_B.4s,9
+	//B=A
+	mov	vdig_B.16b,vdig_A.16b
+	//A=TT1
+	mov	vdig_A.16b,vTT1.16b
+	// H=G
+	mov	vdig_H.16b,vdig_G.16b
+	//G = ROTL32(F,19)
+	rol32	dig_G,dig_F,19
+	//F = E
+	mov	vdig_F.16b,vdig_E.16b
+	// E=Target, TT2=src, TT1,SS1,SS2 is free
+	// E = P0(TT2);
+	ushr	vSS2.4s, vTT2.4s, 32 - 9
+	ushr	vSS1.4s, vTT2.4s, 32 - 17
+	sli	vSS2.4s, vTT2.4s, 9
+	sli	vSS1.4s, vTT2.4s, 17
+	eor	vdig_E.16b, vTT2.16b, vSS1.16b
+	eor	vdig_E.16b, vdig_E.16b, vSS2.16b
+
+.endm
+
+//round 12-15
+.macro sm3_round_12 round:req,plus_4:req,m0,m1,m2,m3,m4
+	rol32	msg\plus_4,msg\m2,15
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+	rol32	tmp0,msg\plus_4,15
+	rol32	word_pair,msg\plus_4,23
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+	rol32	tmp0,msg\m3,7
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+	ushr	vtmp0.4s,vdig_A.4s,32 - 12
+	sli	vtmp0.4s,vdig_A.4s,12
+	add	vSS1.4s,vdig_E.4s,vTj.4s
+	add	vSS2.4s,vSS1.4s,vtmp0.4s	//SS1 Done
+	rol32	SS1,SS2,7
+	eor	vSS2.16b,vSS1.16b,vtmp0.16b	//SS2 Done
+	eor	vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+	eor	vTT1.16b,vdig_A.16b,vdig_B.16b
+	eor	vTT1.16b,vTT1.16b,vdig_C.16b
+	eor	vTT2.16b,vdig_E.16b,vdig_F.16b
+	eor	vTT2.16b,vTT2.16b,vdig_G.16b
+	add	vSS1.4s,vSS1.4s,vmsg\round\().4s
+	add	vSS2.4s,vSS2.4s,vword_pair.4s
+	add	vTT1.4s,vTT1.4s,vdig_D.4s
+	add	vTT2.4s,vTT2.4s,vdig_H.4s
+	ushr	vtmp0.4s,vTj.4s,32-1
+	add	vTT1.4s,vTT1.4s,vSS2.4s	//TT1 Done
+	sli	vtmp0.4s,vTj.4s,1
+	add	vTT2.4s,vTT2.4s,vSS1.4s	//TT2 Done
+	mov	vTj.16b,vtmp0.16b
+	//D=C
+	mov	vdig_D.16b,vdig_C.16b
+	//C = ROTL32(B, 9);
+	ushr	vdig_C.4s,vdig_B.4s,32 - 9
+	sli	vdig_C.4s,vdig_B.4s,9
+	//B=A
+	mov	vdig_B.16b,vdig_A.16b
+	//A=TT1
+	mov	vdig_A.16b,vTT1.16b
+	// H=G
+	mov	vdig_H.16b,vdig_G.16b
+	//G = ROTL32(F,19)
+	rol32	dig_G,dig_F,19
+	//F = E
+	mov	vdig_F.16b,vdig_E.16b
+	// E=Target, TT2=src, TT1,SS1,SS2 is free
+	// E = P0(TT2);
+	ushr	vSS2.4s, vTT2.4s, 32 - 9
+	ushr	vSS1.4s, vTT2.4s, 32 - 17
+	sli	vSS2.4s, vTT2.4s, 9
+	sli	vSS1.4s, vTT2.4s, 17
+	eor	vdig_E.16b, vTT2.16b, vSS1.16b
+	eor	vdig_E.16b, vdig_E.16b, vSS2.16b
+.endm
+
+// round 16-62
+.macro sm3_round_16 round:req,plus_4:req,m0,m1,m2,m3,m4
+	rol32	msg\plus_4,msg\m2,15
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+	rol32	tmp0,msg\plus_4,15
+	rol32	word_pair,msg\plus_4,23
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+	rol32	tmp0,msg\m3,7
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+	ushr	vtmp0.4s,vdig_A.4s,32 - 12
+	sli	vtmp0.4s,vdig_A.4s,12
+	add	vSS1.4s,vdig_E.4s,vTj.4s
+	add	vSS2.4s,vSS1.4s,vtmp0.4s	//SS1 Done
+	rol32	SS1,SS2,7
+	eor	vSS2.16b,vSS1.16b,vtmp0.16b	//SS2 Done
+	eor	vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+	mov	vTT2.16b,vdig_E.16b
+	orr	vTT1.16b,vdig_B.16b,vdig_C.16b
+	and	vtmp0.16b,vdig_B.16b,vdig_C.16b
+	bsl	vTT2.16b,vdig_F.16b,vdig_G.16b
+	and	vTT1.16b,vTT1.16b,vdig_A.16b
+	add	vSS1.4s,vSS1.4s,vmsg\round\().4s
+	orr	vTT1.16b,vTT1.16b,vtmp0.16b
+	add	vSS2.4s,vSS2.4s,vword_pair.4s
+	add	vTT1.4s,vTT1.4s,vdig_D.4s
+	add	vTT2.4s,vTT2.4s,vdig_H.4s
+	ushr	vtmp0.4s,vTj.4s,32-1
+	add	vTT1.4s,vTT1.4s,vSS2.4s	//TT1 Done
+	sli	vtmp0.4s,vTj.4s,1
+	add	vTT2.4s,vTT2.4s,vSS1.4s	//TT2 Done
+	mov	vTj.16b,vtmp0.16b
+	//D=C
+	mov	vdig_D.16b,vdig_C.16b
+	//C = ROTL32(B, 9);
+	ushr	vdig_C.4s,vdig_B.4s,32 - 9
+	sli	vdig_C.4s,vdig_B.4s,9
+	//B=A
+	mov	vdig_B.16b,vdig_A.16b
+	//A=TT1
+	mov	vdig_A.16b,vTT1.16b
+	// H=G
+	mov	vdig_H.16b,vdig_G.16b
+	//G = ROTL32(F,19)
+	rol32	dig_G,dig_F,19
+	//F = E
+	mov	vdig_F.16b,vdig_E.16b
+	// E=Target, TT2=src, TT1,SS1,SS2 is free
+	// E = P0(TT2);
+	ushr	vSS2.4s, vTT2.4s, 32 - 9
+	ushr	vSS1.4s, vTT2.4s, 32 - 17
+	sli	vSS2.4s, vTT2.4s, 9
+	sli	vSS1.4s, vTT2.4s, 17
+	eor	vdig_E.16b, vTT2.16b, vSS1.16b
+	eor	vdig_E.16b, vdig_E.16b, vSS2.16b
+.endm
+
+//round 63
+.macro sm3_round_63 round:req,plus_4:req,m0,m1,m2,m3,m4
+	rol32	msg\plus_4,msg\m2,15
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+	rol32	tmp0,msg\plus_4,15
+	rol32	word_pair,msg\plus_4,23
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+	rol32	tmp0,msg\m3,7
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+	eor	vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+	ushr	vtmp0.4s,vdig_A.4s,32 - 12
+	sli	vtmp0.4s,vdig_A.4s,12
+	add	vSS1.4s,vdig_E.4s,vTj.4s
+	add	vSS2.4s,vSS1.4s,vtmp0.4s	//SS1 Done
+	rol32	SS1,SS2,7
+	eor	vSS2.16b,vSS1.16b,vtmp0.16b	//SS2 Done
+	eor	vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+
+	ldp	qmsg0,qmsg1,[sp,dig_off+ 0]
+	mov	vTT2.16b,vdig_E.16b
+	ldp	qmsg2,qmsg3,[sp,dig_off+ 32]
+	orr	vTT1.16b,vdig_B.16b,vdig_C.16b
+	ldp	qmsg4,qmsg5,[sp,dig_off+ 64]
+	and	vtmp0.16b,vdig_B.16b,vdig_C.16b
+	bsl	vTT2.16b,vdig_F.16b,vdig_G.16b
+	ldp	qmsg6,qmsg7,[sp,dig_off+ 96]
+	and	vTT1.16b,vTT1.16b,vdig_A.16b
+	add	vSS1.4s,vSS1.4s,vmsg\round\().4s
+	orr	vTT1.16b,vTT1.16b,vtmp0.16b
+	add	vSS2.4s,vSS2.4s,vword_pair.4s
+	add	vTT1.4s,vTT1.4s,vdig_D.4s
+	add	vTT2.4s,vTT2.4s,vdig_H.4s
+	add	vTT1.4s,vTT1.4s,vSS2.4s	//TT1 Done
+	add	vTT2.4s,vTT2.4s,vSS1.4s	//TT2 Done
+	//D=C
+	eor	vdig_D.16b,vdig_C.16b,vmsg3.16b
+	//C = ROTL32(B, 9);
+	ushr	vdig_C.4s,vdig_B.4s,32 - 9
+	sli	vdig_C.4s,vdig_B.4s,9
+	eor	vdig_C.16b,vdig_C.16b,vmsg2.16b
+	//B=A
+	eor	vdig_B.16b,vdig_A.16b,vmsg1.16b
+	stp	qdig_C,qdig_D,[sp,dig_off+ 32]
+	//A=TT1
+	eor	vdig_A.16b,vTT1.16b,vmsg0.16b
+	// H=G
+	eor	vdig_H.16b,vdig_G.16b,vmsg7.16b
+	stp	qdig_A,qdig_B,[sp,dig_off+ 0]
+	//G = ROTL32(F,19)
+	rol32	dig_G,dig_F,19
+	eor	vdig_G.16b,vdig_G.16b,vmsg6.16b
+	//F = E
+	eor	vdig_F.16b,vdig_E.16b,vmsg5.16b
+	stp	qdig_G,qdig_H,[sp,dig_off+ 96]
+	// E=Target, TT2=src, TT1,SS1,SS2 is free
+	// E = P0(TT2);
+	ushr	vSS2.4s, vTT2.4s, 32 - 9
+	ushr	vSS1.4s, vTT2.4s, 32 - 17
+	sli	vSS2.4s, vTT2.4s, 9
+	sli	vSS1.4s, vTT2.4s, 17
+	eor	vdig_E.16b, vTT2.16b, vSS1.16b
+	eor	vdig_E.16b, vdig_E.16b, vSS2.16b
+	eor	vdig_E.16b, vdig_E.16b, vmsg4.16b
+	stp	qdig_E,qdig_F,[sp,dig_off+ 64]
+.endm
+
+	.set	dig_off, 80
+
+#define STACK_SIZE 224
+	.global	sm3_mb_asimd_x4
+	.type	sm3_mb_asimd_x4, %function
+sm3_mb_asimd_x4:
+	stp	x29,x30, [sp,-STACK_SIZE]!
+	cmp	len,0
+	//push d8~d15
+	ldr	job0_data, [job0],64
+	stp	d8,d9, [sp,16]
+	ldr	job1_data, [job1],64
+	stp	d10,d11,[sp,32]
+	ldr	job2_data, [job2],64
+	stp	d12,d13,[sp,48]
+	ldr	job3_data, [job3],64
+	stp	d14,d15,[sp,64]
+	ble	.exit_func
+
+	mov	job0_tmp,job0_digest
+	mov	job1_tmp,job1_digest
+	mov	job2_tmp,job2_digest
+	mov	job3_tmp,job3_digest
+	//load digests
+	ld4	{vdig_A.s-vdig_D.s}[0],[job0_tmp],16
+	ld4	{vdig_A.s-vdig_D.s}[1],[job1_tmp],16
+	ld4	{vdig_A.s-vdig_D.s}[2],[job2_tmp],16
+	adrp	const_adr, .consts
+	ld4	{vdig_A.s-vdig_D.s}[3],[job3_tmp],16
+	add	const_adr, const_adr, #:lo12:.consts
+	ld4	{vdig_E.s-vdig_H.s}[0],[job0_tmp]
+	rev32	vdig_A.16b,vdig_A.16b
+	ld4	{vdig_E.s-vdig_H.s}[1],[job1_tmp]
+	rev32	vdig_B.16b,vdig_B.16b
+	ld4	{vdig_E.s-vdig_H.s}[2],[job2_tmp]
+	rev32	vdig_C.16b,vdig_C.16b
+	ld4	{vdig_E.s-vdig_H.s}[3],[job3_tmp]
+	rev32	vdig_D.16b,vdig_D.16b
+	stp	qdig_A,qdig_B,[sp,dig_off+ 0]
+	rev32	vdig_E.16b,vdig_E.16b
+	rev32	vdig_F.16b,vdig_F.16b
+	stp	qdig_C,qdig_D,[sp,dig_off+ 32]
+	rev32	vdig_G.16b,vdig_G.16b
+	rev32	vdig_H.16b,vdig_H.16b
+	stp	qdig_E,qdig_F,[sp,dig_off+ 64]
+	stp	qdig_G,qdig_H,[sp,dig_off+ 96]
+
+.start_loop:
+	ld4	{vmsg0.s-vmsg3.s}[0],[job0_data],16
+	ld4	{vmsg0.s-vmsg3.s}[1],[job1_data],16
+	ld4	{vmsg0.s-vmsg3.s}[2],[job2_data],16
+	ld4	{vmsg0.s-vmsg3.s}[3],[job3_data],16
+	ld4	{vmsg4.s-vmsg7.s}[0],[job0_data],16
+	ld4	{vmsg4.s-vmsg7.s}[1],[job1_data],16
+	ld4	{vmsg4.s-vmsg7.s}[2],[job2_data],16
+	ld4	{vmsg4.s-vmsg7.s}[3],[job3_data],16
+	ld4	{vmsg8.s-vmsg11.s}[0],[job0_data],16
+	ldr	qTj,[const_adr]
+
+	sm3_round_0	 0, 4
+
+	ld4	{vmsg8.s-vmsg11.s}[1],[job1_data],16
+	sm3_round_0	 1, 5
+
+	ld4	{vmsg8.s-vmsg11.s}[2],[job2_data],16
+	sm3_round_0	 2, 6
+	ld4	{vmsg8.s-vmsg11.s}[3],[job3_data],16
+	sm3_round_0	 3, 7
+
+	ld4	{vmsg12.s-vmsg15.s}[0],[job0_data],16
+
+	sm3_round_4	 4, 8
+	ld4	{vmsg12.s-vmsg15.s}[1],[job1_data],16
+	sm3_round_4	 5, 9
+	ld4	{vmsg12.s-vmsg15.s}[2],[job2_data],16
+	sm3_round_4	 6,10
+	ld4	{vmsg12.s-vmsg15.s}[3],[job3_data],16
+	sm3_round_4	 7,11
+	sm3_round_4	 8,12
+	sm3_round_4	 9,13
+	sm3_round_4	10,14
+	sm3_round_4	11,15
+
+	sm3_round_12	12,16, 0, 7,13, 3,10	//12
+	sm3_round_12	13, 0, 1, 8,14, 4,11	//13
+	sm3_round_12	14, 1, 2, 9,15, 5,12	//14
+	sm3_round_12	15, 2, 3,10,16, 6,13	//15
+
+	ldr	qTj,[const_adr,16]
+	sm3_round_16	16, 3, 4,11, 0, 7,14	//16
+#if 0
+	stp	sdig_A,sdig_B,[job0_digest]
+	stp	sdig_C,sdig_D,[job0_digest,8]
+	stp	sdig_E,sdig_F,[job0_digest,16]
+	stp	sdig_G,sdig_H,[job0_digest,24]
+	b	.exit_func
+#endif
+	sm3_round_16	 0, 4, 5,12, 1, 8,15	//17
+
+	sm3_round_16	 1, 5, 6,13, 2, 9,16	//18
+	sm3_round_16	 2, 6, 7,14, 3,10, 0	//19
+	sm3_round_16	 3, 7, 8,15, 4,11, 1	//20
+	sm3_round_16	 4, 8, 9,16, 5,12, 2	//21
+	sm3_round_16	 5, 9,10, 0, 6,13, 3	//22
+	sm3_round_16	 6,10,11, 1, 7,14, 4	//23
+	sm3_round_16	 7,11,12, 2, 8,15, 5	//24
+	sm3_round_16	 8,12,13, 3, 9,16, 6	//25
+	sm3_round_16	 9,13,14, 4,10, 0, 7	//26
+	sm3_round_16	10,14,15, 5,11, 1, 8	//27
+	sm3_round_16	11,15,16, 6,12, 2, 9	//28
+	sm3_round_16	12,16, 0, 7,13, 3,10	//29
+	sm3_round_16	13, 0, 1, 8,14, 4,11	//30
+	sm3_round_16	14, 1, 2, 9,15, 5,12	//31
+	sm3_round_16	15, 2, 3,10,16, 6,13	//32
+	sm3_round_16	16, 3, 4,11, 0, 7,14	//33
+	sm3_round_16	 0, 4, 5,12, 1, 8,15	//34
+	sm3_round_16	 1, 5, 6,13, 2, 9,16	//35
+	sm3_round_16	 2, 6, 7,14, 3,10, 0	//36
+	sm3_round_16	 3, 7, 8,15, 4,11, 1	//37
+	sm3_round_16	 4, 8, 9,16, 5,12, 2	//38
+	sm3_round_16	 5, 9,10, 0, 6,13, 3	//39
+	sm3_round_16	 6,10,11, 1, 7,14, 4	//40
+	sm3_round_16	 7,11,12, 2, 8,15, 5	//41
+	sm3_round_16	 8,12,13, 3, 9,16, 6	//42
+	sm3_round_16	 9,13,14, 4,10, 0, 7	//43
+	sm3_round_16	10,14,15, 5,11, 1, 8	//44
+	sm3_round_16	11,15,16, 6,12, 2, 9	//45
+	sm3_round_16	12,16, 0, 7,13, 3,10	//46
+	sm3_round_16	13, 0, 1, 8,14, 4,11	//47
+	sm3_round_16	14, 1, 2, 9,15, 5,12	//48
+	sm3_round_16	15, 2, 3,10,16, 6,13	//49
+	sm3_round_16	16, 3, 4,11, 0, 7,14	//50
+	sm3_round_16	 0, 4, 5,12, 1, 8,15	//51
+	sm3_round_16	 1, 5, 6,13, 2, 9,16	//52
+	sm3_round_16	 2, 6, 7,14, 3,10, 0	//53
+	sm3_round_16	 3, 7, 8,15, 4,11, 1	//54
+	sm3_round_16	 4, 8, 9,16, 5,12, 2	//55
+	sm3_round_16	 5, 9,10, 0, 6,13, 3	//56
+	sm3_round_16	 6,10,11, 1, 7,14, 4	//57
+	sm3_round_16	 7,11,12, 2, 8,15, 5	//58
+	sm3_round_16	 8,12,13, 3, 9,16, 6	//59
+	sm3_round_16	 9,13,14, 4,10, 0, 7	//60
+	sm3_round_16	10,14,15, 5,11, 1, 8	//61
+	sm3_round_16	11,15,16, 6,12, 2, 9	//62
+	sm3_round_63	12,16, 0, 7,13, 3,10	//63
+
+	subs	len,len,1
+	bne	.start_loop
+
+	//save digests with big endian
+	rev32	vdig_A.16b,vdig_A.16b
+	rev32	vdig_B.16b,vdig_B.16b
+	rev32	vdig_C.16b,vdig_C.16b
+	rev32	vdig_D.16b,vdig_D.16b
+	st4	{vdig_A.s-vdig_D.s}[0],[job0_digest],16
+	rev32	vdig_E.16b,vdig_E.16b
+	rev32	vdig_F.16b,vdig_F.16b
+	st4	{vdig_A.s-vdig_D.s}[1],[job1_digest],16
+	rev32	vdig_G.16b,vdig_G.16b
+	rev32	vdig_H.16b,vdig_H.16b
+	st4	{vdig_A.s-vdig_D.s}[2],[job2_digest],16
+	st4	{vdig_A.s-vdig_D.s}[3],[job3_digest],16
+	st4	{vdig_E.s-vdig_H.s}[0],[job0_digest]
+	st4	{vdig_E.s-vdig_H.s}[1],[job1_digest]
+	st4	{vdig_E.s-vdig_H.s}[2],[job2_digest]
+	st4	{vdig_E.s-vdig_H.s}[3],[job3_digest]
+
+.exit_func:
+	ldp	d8, d9, [sp,16]
+	ldp	d10,d11,[sp,32]
+	ldp	d12,d13,[sp,48]
+	ldp	d14,d15,[sp,64]
+	ldp	x29, x30, [sp], STACK_SIZE
+	ret
+.consts:
+	.word	0x79cc4519
+	.word	0x79cc4519
+	.word	0x79cc4519
+	.word	0x79cc4519
+	.word	0x9d8a7a87
+	.word	0x9d8a7a87
+	.word	0x9d8a7a87
+	.word	0x9d8a7a87
+	.size	sm3_mb_asimd_x4, .-sm3_mb_asimd_x4
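For reference, here is a minimal scalar sketch in C of the SM3 compression step that each 32-bit lane of the code above computes, assuming the standard SM3 definitions (GB/T 32905-2016). The names (rotl32, p0, p1, sm3_expand, sm3_round) are illustrative only, not part of the isa-l API. Lane i of every .4s vector holds the state of job i, so each eor/add/ushr/sli in the assembly retires one round step for four independent hashes at once.

	#include <stdint.h>

	static inline uint32_t rotl32(uint32_t x, int n)	/* the rol32 macro above */
	{
		return (x << n) | (x >> (32 - n));
	}

	static inline uint32_t p0(uint32_t x)	/* the "E = P0(TT2)" step */
	{
		return x ^ rotl32(x, 9) ^ rotl32(x, 17);
	}

	static inline uint32_t p1(uint32_t x)	/* message-expansion permutation */
	{
		return x ^ rotl32(x, 15) ^ rotl32(x, 23);
	}

	/* W[i] for i >= 16; the sm3_round_12/_16 macros fold this into the
	 * round and keep only a 17-word sliding window (msg0..msg16). */
	static uint32_t sm3_expand(const uint32_t w[], int i)
	{
		return p1(w[i - 16] ^ w[i - 9] ^ rotl32(w[i - 3], 15))
		       ^ rotl32(w[i - 13], 7) ^ w[i - 6];
	}

	/* One round j on state st[8] = {A,B,C,D,E,F,G,H}, with message words
	 * w = W[j] and w4 = W[j+4].  tj must already be ROTL32(T, j), which
	 * the assembly maintains by rotating vTj left by one every round,
	 * starting from the .consts values (T = 0x79cc4519 for j < 16,
	 * 0x9d8a7a87 afterwards). */
	static void sm3_round(uint32_t st[8], uint32_t w, uint32_t w4,
			      uint32_t tj, int j)
	{
		uint32_t a = st[0], b = st[1], c = st[2], d = st[3];
		uint32_t e = st[4], f = st[5], g = st[6], h = st[7];

		uint32_t ss1 = rotl32(rotl32(a, 12) + e + tj, 7);
		uint32_t ss2 = ss1 ^ rotl32(a, 12);

		/* FF/GG: plain XOR for rounds 0-15 (sm3_round_0/_4),
		 * majority and choose for rounds 16-63 (the orr/and/bsl
		 * block in sm3_round_16) */
		uint32_t ff = j < 16 ? a ^ b ^ c : (a & b) | (a & c) | (b & c);
		uint32_t gg = j < 16 ? e ^ f ^ g : (e & f) | (~e & g);

		uint32_t tt1 = ff + d + ss2 + (w ^ w4);	/* W'[j] = W[j] ^ W[j+4] */
		uint32_t tt2 = gg + h + ss1 + w;

		st[3] = c;		/* D = C             */
		st[2] = rotl32(b, 9);	/* C = ROTL32(B, 9)  */
		st[1] = a;		/* B = A             */
		st[0] = tt1;		/* A = TT1           */
		st[7] = g;		/* H = G             */
		st[6] = rotl32(f, 19);	/* G = ROTL32(F, 19) */
		st[5] = e;		/* F = E             */
		st[4] = p0(tt2);	/* E = P0(TT2)       */
	}

After round 63, sm3_round_63 XORs the new state into the previous digest saved on the stack at dig_off (the eor ... vmsg0..vmsg7 lines), which is SM3's block feed-forward; the final rev32/st4 sequence then writes each lane's digest back to its job in big-endian word order.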