diff options
Diffstat (limited to '')
53 files changed, 12431 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am new file mode 100644 index 000000000..3f3c589ad --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am @@ -0,0 +1,130 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+######################################################################## + +lsrc_x86_64 += sha1_mb/sha1_ctx_sse.c \ + sha1_mb/sha1_ctx_avx.c \ + sha1_mb/sha1_ctx_avx2.c \ + sha1_mb/sha1_ctx_base.c + +lsrc_x86_64 += sha1_mb/sha1_mb_mgr_init_sse.c \ + sha1_mb/sha1_mb_mgr_init_avx2.c + +lsrc_x86_64 += sha1_mb/sha1_mb_mgr_submit_sse.asm \ + sha1_mb/sha1_mb_mgr_submit_avx.asm \ + sha1_mb/sha1_mb_mgr_submit_avx2.asm \ + sha1_mb/sha1_mb_mgr_flush_sse.asm \ + sha1_mb/sha1_mb_mgr_flush_avx.asm \ + sha1_mb/sha1_mb_mgr_flush_avx2.asm \ + sha1_mb/sha1_mb_x4_sse.asm \ + sha1_mb/sha1_mb_x4_avx.asm \ + sha1_mb/sha1_mb_x8_avx2.asm \ + sha1_mb/sha1_multibinary.asm + +lsrc_x86_64 += sha1_mb/sha1_ctx_avx512.c \ + sha1_mb/sha1_mb_mgr_init_avx512.c \ + sha1_mb/sha1_mb_mgr_submit_avx512.asm \ + sha1_mb/sha1_mb_mgr_flush_avx512.asm \ + sha1_mb/sha1_mb_x16_avx512.asm + +lsrc_x86_64 += sha1_mb/sha1_opt_x1.asm + +lsrc_x86_64 += sha1_mb/sha1_ni_x1.asm \ + sha1_mb/sha1_ni_x2.asm \ + sha1_mb/sha1_ctx_sse_ni.c \ + sha1_mb/sha1_ctx_avx512_ni.c \ + sha1_mb/sha1_mb_mgr_submit_sse_ni.asm \ + sha1_mb/sha1_mb_mgr_flush_sse_ni.asm \ + sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm + +lsrc_x86_32 += $(lsrc_x86_64) + +lsrc_aarch64 += sha1_mb/sha1_ctx_base.c \ + sha1_mb/sha1_ref.c \ + sha1_mb/aarch64/sha1_mb_multibinary.S \ + sha1_mb/aarch64/sha1_ctx_ce.c \ + sha1_mb/aarch64/sha1_mb_x1_ce.S \ + sha1_mb/aarch64/sha1_mb_x2_ce.S \ + sha1_mb/aarch64/sha1_mb_mgr_ce.c \ + sha1_mb/aarch64/sha1_ctx_asimd.c \ + sha1_mb/aarch64/sha1_aarch64_x1.S \ + sha1_mb/aarch64/sha1_mb_asimd_x4.S \ + sha1_mb/aarch64/sha1_mb_mgr_asimd.c \ + sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c + + + +lsrc_base_aliases += sha1_mb/sha1_ctx_base_aliases.c \ + sha1_mb/sha1_ctx_base.c \ + sha1_mb/sha1_ref.c + +src_include += -I $(srcdir)/sha1_mb + +extern_hdrs += include/sha1_mb.h \ + include/multi_buffer.h + +other_src += include/datastruct.asm \ + include/multibinary.asm \ + sha1_mb/sha1_job.asm \ + sha1_mb/sha1_mb_mgr_datastruct.asm \ + 
include/reg_sizes.asm \ + sha1_mb/sha1_ref.c \ + include/memcpy_inline.h \ + include/memcpy.asm \ + include/intrinreg.h + +check_tests += sha1_mb/sha1_mb_test \ + sha1_mb/sha1_mb_rand_test \ + sha1_mb/sha1_mb_rand_update_test \ + sha1_mb/sha1_mb_flush_test + +unit_tests += sha1_mb/sha1_mb_rand_ssl_test + +perf_tests += sha1_mb/sha1_mb_vs_ossl_perf \ + sha1_mb/sha1_mb_vs_ossl_shortage_perf + +examples += sha1_mb/sha1_multi_buffer_example + + +sha1_mb_rand_test: sha1_ref.o +sha1_mb_sha1_mb_rand_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la + +sha1_mb_rand_update_test: sha1_ref.o +sha1_mb_sha1_mb_rand_update_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la + +sha1_mb_flush_test: sha1_ref.o +sha1_mb_sha1_mb_flush_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la + +sha1_mb_rand_ssl_test: LDLIBS += -lcrypto +sha1_mb_sha1_mb_rand_ssl_test_LDFLAGS = -lcrypto + +sha1_mb_vs_ossl_perf: LDLIBS += -lcrypto +sha1_mb_sha1_mb_vs_ossl_perf_LDFLAGS = -lcrypto + +sha1_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto +sha1_mb_sha1_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S new file mode 100644 index 000000000..55d6f932f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S @@ -0,0 +1,294 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + .arch armv8-a + + input_data .req x0 + num_blocks .req w1 + digest .req x2 + + // x2 is reused intentionally between digest/tmp + // due to running out of registers + TMP .req x2 + TMPW .req w2 + sha1key_adr .req x3 + WK .req w3 + WF .req w4 + WA .req w5 + WB .req w6 + WC .req w7 + WD .req w8 + WE .req w9 + WORD0 .req w10 + WORD1 .req w11 + WORD2 .req w12 + WORD3 .req w13 + WORD4 .req w14 + WORD5 .req w15 + WORD6 .req w16 + WORD7 .req w17 + WORD8 .req w18 + WORD9 .req w19 + WORD10 .req w20 + WORD11 .req w21 + WORD12 .req w22 + WORD13 .req w23 + WORD14 .req w24 + WORD15 .req w25 + AA .req w26 + BB .req w27 + CC .req w28 + DD .req w29 + EE .req w30 + + TT .req w0 + +.macro save_stack + stp x16,x17,[sp, -128]! 
+ stp x18,x19,[sp, 16] + stp x20,x21,[sp, 32] + stp x22,x23,[sp, 48] + stp x24,x25,[sp, 64] + stp x26,x27,[sp, 80] + stp x28,x29,[sp, 96] + str x30,[sp, 112] + // have to reuse x2, which is digest address + str x2,[sp, 120] +.endm + +.macro restore_stack + ldp x18,x19,[sp, 16] + ldp x20,x21,[sp, 32] + ldp x22,x23,[sp, 48] + ldp x24,x25,[sp, 64] + ldp x26,x27,[sp, 80] + ldp x28,x29,[sp, 96] + ldr x30,[sp, 112] + ldr x2,[sp, 120] + ldp x16,x17,[sp],128 +.endm +// macro F = (D ^ (B & (C ^ D))) +.macro FUNC_F0 + eor WF, WC, WD + and WF, WB, WF + eor WF, WD, WF +.endm + +// F = (B ^ C ^ D) +.macro FUNC_F1 + eor WF, WB, WC + eor WF, WF, WD +.endm + +// F = ((B & C) | (B & D) | (C & D)) +.macro FUNC_F2 + and TMPW, WB, WC + and WF, WB, WD + orr WF, WF, TMPW + and TMPW, WC, WD + orr WF, WF, TMPW +.endm + +// F = (B ^ C ^ D) +.macro FUNC_F3 + FUNC_F1 +.endm + +.altmacro +.macro load_next_word windex + .if \windex < 16 + load_word_at \windex + .endif +.endm + +.macro SHA1_STEP_00_15 windex:req + rev WORD\windex\(),WORD\windex\() + next_word=\windex+1 + load_next_word %next_word + + ror TMPW,WA,#32-5 + add WE,WE,TMPW + add WE,WE,WK + FUNC_F0 + ror WB,WB,#32-30 + add WE,WE,WORD\windex\() + add WE,WE,WF +.endm + +.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req + eor TMPW,\reg_14,\reg_8 + eor \reg_16,\reg_16,\reg_3 + eor \reg_16,\reg_16,TMPW + + ror TMPW,WA,#32-5 + ror \reg_16,\reg_16, #32 - 1 + + add WE,WE,TMPW + add WE,WE,WK + \func_f + ror WB,WB,#32-30 + add WE,WE,\reg_16 + add WE,WE,WF +.endm + +.macro SWAP_STATES + .unreq TT + TT .req WE + .unreq WE + WE .req WD + .unreq WD + WD .req WC + .unreq WC + WC .req WB + .unreq WB + WB .req WA + .unreq WA + WA .req TT +.endm + +.altmacro +.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req + SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\() +.endm + +.macro exec_step windex:req + .if \windex <= 15 + SHA1_STEP_00_15 
windex + .else + idx14=((\windex - 14) & 15) + idx8=((\windex - 8) & 15) + idx3=((\windex - 3) & 15) + idx16=(\windex & 15) + .if \windex <= 19 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 20 && \windex <= 39 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 40 && \windex <= 59 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 60 && \windex <= 79 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16 + .endif + .endif + + SWAP_STATES +.endm + +.macro exec_steps idx:req,more:vararg + exec_step \idx + .ifnb \more + exec_steps \more + .endif +.endm + +.altmacro + +.macro load_two_words_at idx0:req,idx1:req + ldp WORD\idx0\(),WORD\idx1\(),[input_data],8 +.endm + +.macro load_word_at idx:req + .if \idx % 2 == 0 + idx1=\idx+1 + load_two_words_at \idx,%idx1 + .endif +.endm + +/* + * void sha1_aarch64_x1(uint32_t *input_data, int num_blocks, uint32_t digest[5]) + */ + .global sha1_aarch64_x1 + .type sha1_aarch64_x1, %function +sha1_aarch64_x1: + cmp num_blocks, #0 + beq .return + + ldp WA,WB,[digest] + ldp WC,WD,[digest,8] + ldr WE,[digest,16] + save_stack + +.block_loop: + mov AA, WA + mov BB, WB + mov CC, WC + mov DD, WD + mov EE, WE + + load_word_at 0 + + adr sha1key_adr, KEY_0 + ldr WK, [sha1key_adr] + exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 + + // 20 ~ 39 + adr sha1key_adr, KEY_1 + ldr WK, [sha1key_adr] + exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39 + + // 40 ~ 59 + adr sha1key_adr, KEY_2 + ldr WK, [sha1key_adr] + exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59 + + // 60 ~ 79 + adr sha1key_adr, KEY_3 + ldr WK, [sha1key_adr] + exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79 + + add WA, AA, WA + add WB, BB, WB + add WC, CC, WC + add WD, DD, WD + add WE, EE, WE + + subs num_blocks, num_blocks, 1 + bne .block_loop + + restore_stack + 
+ stp WA,WB,[digest] + stp WC,WD,[digest,8] + str WE,[digest,16] + +.return: + ret + + .size sha1_aarch64_x1, .-sha1_aarch64_x1 + .section .rodata.cst16,"aM",@progbits,16 + .align 4 // operand is a power-of-two exponent on AArch64 gas: 2^4 = 16 bytes, matching the cst16 entry size +KEY_0: + .word 0x5a827999 // round constant loaded for steps 0-19 +KEY_1: + .word 0x6ed9eba1 // round constant loaded for steps 20-39 +KEY_2: + .word 0x8f1bbcdc // round constant loaded for steps 40-59 +KEY_3: + .word 0xca62c1d6 // round constant loaded for steps 60-79 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S new file mode 100644 index 000000000..c8b8dd982 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S @@ -0,0 +1,269 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + + .arch armv8-a + +// macro F = (D ^ (B & (C ^ D))) +.macro FUNC_F0 + eor VF.16b, VC.16b, VD.16b + and VF.16b, VB.16b, VF.16b + eor VF.16b, VD.16b, VF.16b +.endm + +// F = (B ^ C ^ D) +.macro FUNC_F1 + eor VF.16b, VB.16b, VC.16b + eor VF.16b, VF.16b, VD.16b +.endm + +// F = ((B & C) | (B & D) | (C & D)) +.macro FUNC_F2 + and vT0.16b, VB.16b, VC.16b + and vT1.16b, VB.16b, VD.16b + and vT2.16b, VC.16b, VD.16b + orr VF.16b, vT0.16b, vT1.16b + orr VF.16b, VF.16b, vT2.16b +.endm + +// F = (B ^ C ^ D) +.macro FUNC_F3 + FUNC_F1 +.endm + +.altmacro +.macro load_next_word windex + .if \windex < 16 + load_x4_word \windex + .endif +.endm + +// FUNC_F0 is merged into STEP_00_15 for efficiency +.macro SHA1_STEP_00_15_F0 windex:req + rev32 WORD\windex\().16b,WORD\windex\().16b + next_word=\windex+1 + load_next_word %next_word + // e = (a leftrotate 5) + f + e + k + w[i] + ushr VT.4s, VA.4s, 32 - 5 + add VE.4s, VE.4s, VK.4s + sli VT.4s, VA.4s, 5 + eor VF.16b, VC.16b, VD.16b + add VE.4s, VE.4s, WORD\windex\().4s + and VF.16b, VB.16b, VF.16b + add VE.4s, VE.4s, VT.4s + eor VF.16b, VD.16b, VF.16b + ushr VT.4s, VB.4s, 32 - 30 + add VE.4s, VE.4s, VF.4s + sli VT.4s, VB.4s, 30 +.endm + +.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req + eor vT0.16b,\reg_3\().16b,\reg_8\().16b + eor VT.16b,\reg_14\().16b,\reg_16\().16b + eor vT0.16b,vT0.16b,VT.16b + // e = (a 
leftrotate 5) + f + e + k + w[i] + ushr VT.4s, vT0.4s, 32 - 1 + add VE.4s, VE.4s, VK.4s + ushr vT1.4s, VA.4s, 32 - 5 + sli VT.4s, vT0.4s, 1 + add VE.4s, VE.4s, VT.4s + sli vT1.4s, VA.4s, 5 + mov \reg_16\().16b,VT.16b + add VE.4s, VE.4s, vT1.4s + ushr VT.4s, VB.4s, 32 - 30 + \func_f + add VE.4s, VE.4s, VF.4s + sli VT.4s, VB.4s, 30 +.endm + + VA .req v0 + VB .req v1 + VC .req v2 + VD .req v3 + VE .req v4 + VT .req v5 + VF .req v6 + VK .req v7 + WORD0 .req v8 + WORD1 .req v9 + WORD2 .req v10 + WORD3 .req v11 + WORD4 .req v12 + WORD5 .req v13 + WORD6 .req v14 + WORD7 .req v15 + WORD8 .req v16 + WORD9 .req v17 + WORD10 .req v18 + WORD11 .req v19 + WORD12 .req v20 + WORD13 .req v21 + WORD14 .req v22 + WORD15 .req v23 + vT0 .req v24 + vT1 .req v25 + vT2 .req v26 + vAA .req v27 + vBB .req v28 + vCC .req v29 + vDD .req v30 + vEE .req v31 + TT .req v0 + sha1key_adr .req x15 + +.macro SWAP_STATES + // shifted VB is held in VT after each step + .unreq TT + TT .req VE + .unreq VE + VE .req VD + .unreq VD + VD .req VC + .unreq VC + VC .req VT + .unreq VT + VT .req VB + .unreq VB + VB .req VA + .unreq VA + VA .req TT +.endm + +.altmacro +.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req + SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\() +.endm + +.macro exec_step windex:req + .if \windex <= 15 + SHA1_STEP_00_15_F0 windex + .else + idx14=((\windex - 14) & 15) + idx8=((\windex - 8) & 15) + idx3=((\windex - 3) & 15) + idx16=(\windex & 15) + .if \windex <= 19 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 20 && \windex <= 39 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 40 && \windex <= 59 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16 + .endif + .if \windex >= 60 && \windex <= 79 + SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16 + .endif + .endif + + SWAP_STATES + + .if 
\windex == 79 + // after 80 steps, the registers ABCDET have shifted from + // their original order of 012345 to 341520 + // swap back both the .req aliases (assembly time) and the register contents (run time) + mov v0.16b,v3.16b + .unreq VA + VA .req v0 + + mov vT0.16b,v2.16b + mov v2.16b,v1.16b + mov v1.16b,v4.16b + .unreq VB + VB .req v1 + .unreq VC + VC .req v2 + + mov v3.16b,v5.16b + .unreq VD + VD .req v3 + + mov v4.16b,vT0.16b + .unreq VE + VE .req v4 + + .unreq VT + VT .req v5 + .endif +.endm + +.macro exec_steps idx:req,more:vararg + exec_step \idx + .ifnb \more + exec_steps \more + .endif +.endm + +.macro sha1_single + load_x4_word 0 + + mov vAA.16B, VA.16B + mov vBB.16B, VB.16B + mov vCC.16B, VC.16B + mov vDD.16B, VD.16B + mov vEE.16B, VE.16B + + adr sha1key_adr, KEY_0 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 + + // 20 ~ 39 + adr sha1key_adr, KEY_1 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39 + + // 40 ~ 59 + adr sha1key_adr, KEY_2 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59 + + // 60 ~ 79 + adr sha1key_adr, KEY_3 + ld1 {VK.4s}, [sha1key_adr] + exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79 + + add VA.4s, vAA.4s, VA.4s + add VB.4s, vBB.4s, VB.4s + add VC.4s, vCC.4s, VC.4s + add VD.4s, vDD.4s, VD.4s + add VE.4s, vEE.4s, VE.4s +.endm + +.macro sha1_asimd_save_stack + stp d8,d9,[sp, -64]! 
+ stp d10,d11,[sp, 16] + stp d12,d13,[sp, 32] + stp d14,d15,[sp, 48] +.endm + +.macro sha1_asimd_restore_stack + ldp d10,d11,[sp, 16] + ldp d12,d13,[sp, 32] + ldp d14,d15,[sp, 48] + ldp d8,d9,[sp],64 +.endm diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c new file mode 100644 index 000000000..9a9952ff6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c @@ -0,0 +1,250 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdint.h> +#include <string.h> +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" +void sha1_mb_mgr_init_asimd(SHA1_MB_JOB_MGR * state); +SHA1_JOB *sha1_mb_mgr_submit_asimd(SHA1_MB_JOB_MGR * state, SHA1_JOB * job); +SHA1_JOB *sha1_mb_mgr_flush_asimd(SHA1_MB_JOB_MGR * state); +static inline void hash_init_digest(SHA1_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len); +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx); + +void sha1_ctx_mgr_init_asimd(SHA1_HASH_CTX_MGR * mgr) +{ + sha1_mb_mgr_init_asimd(&mgr->mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_asimd(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. 
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_asimd(SHA1_HASH_CTX_MGR * mgr) +{ + SHA1_HASH_CTX *ctx; + + while (1) { + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_asimd(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. 
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA1_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} +// Write the final padding into padblock after the (total_len % SHA1_BLOCK_SIZE) buffered bytes: 0x80 marker, then the big-endian bit length at the end of the last block. Returns the number of extra blocks to hash. +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 + + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + memset(&padblock[i - 16], 0, sizeof(uint64_t)); // upper 64 bits of a 128-bit length field are zero 
+#endif + // Store the bit length with memcpy rather than a uint64_t* cast: no aliasing or alignment assumptions on the byte buffer + memcpy(&padblock[i - 8], &(uint64_t) { to_be64((uint64_t) total_len << 3) }, sizeof(uint64_t)); + + return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha1_ctx_mgr_init_asimd_slver_02020142; +struct slver sha1_ctx_mgr_init_asimd_slver = { 0x0142, 0x02, 0x02 }; + +struct slver sha1_ctx_mgr_submit_asimd_slver_02020143; +struct slver sha1_ctx_mgr_submit_asimd_slver = { 0x0143, 0x02, 0x02 }; + +struct slver sha1_ctx_mgr_flush_asimd_slver_02020144; +struct slver sha1_ctx_mgr_flush_asimd_slver = { 0x0144, 0x02, 0x02 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c new file mode 100644 index 000000000..e40a344ff --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c @@ -0,0 +1,250 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdint.h> +#include <string.h> +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" +void sha1_mb_mgr_init_ce(SHA1_MB_JOB_MGR * state); +SHA1_JOB *sha1_mb_mgr_submit_ce(SHA1_MB_JOB_MGR * state, SHA1_JOB * job); +SHA1_JOB *sha1_mb_mgr_flush_ce(SHA1_MB_JOB_MGR * state); +static inline void hash_init_digest(SHA1_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len); +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx); + +void sha1_ctx_mgr_init_ce(SHA1_HASH_CTX_MGR * mgr) +{ + sha1_mb_mgr_init_ce(&mgr->mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_ce(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. 
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_ce(SHA1_HASH_CTX_MGR * mgr) +{ + SHA1_HASH_CTX *ctx; + + while (1) { + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_ce(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. 
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA1_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 + + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + 
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha1_ctx_mgr_init_ce_slver_02020142; +struct slver sha1_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 }; + +struct slver sha1_ctx_mgr_submit_ce_slver_02020143; +struct slver sha1_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 }; + +struct slver sha1_ctx_mgr_flush_ce_slver_02020144; +struct slver sha1_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c new file mode 100644 index 000000000..0942c1a95 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c @@ -0,0 +1,93 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <aarch64_multibinary.h> + +DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_submit) +{ + + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA1) + return PROVIDER_INFO(sha1_ctx_mgr_submit_ce); + + if (auxval & HWCAP_ASIMD) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through + case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through + case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through + return PROVIDER_INFO(sha1_ctx_mgr_submit_asimd); + default: + break; + } + } + + return PROVIDER_BASIC(sha1_ctx_mgr_submit); + +} + +DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_init) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA1) + return PROVIDER_INFO(sha1_ctx_mgr_init_ce); + + if (auxval & HWCAP_ASIMD) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through + case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through + case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through + return PROVIDER_INFO(sha1_ctx_mgr_init_asimd); + default: + break; + } + } + + return PROVIDER_BASIC(sha1_ctx_mgr_init); + +} + +DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_flush) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA1) + return PROVIDER_INFO(sha1_ctx_mgr_flush_ce); + + if (auxval & HWCAP_ASIMD) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall 
through + case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through + case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through + return PROVIDER_INFO(sha1_ctx_mgr_flush_asimd); + default: + break; + } + } + + return PROVIDER_BASIC(sha1_ctx_mgr_flush); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S new file mode 100644 index 000000000..012b15c14 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S @@ -0,0 +1,192 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

	.arch armv8-a

#include "sha1_asimd_common.S"

/*
 * internal_load windex
 *
 * For windex 0 the macro reads 64 bytes from each of data0..data3 into
 * WORD0..WORD15 (post-incrementing the four data pointers).  For windex
 * 0, 4, 8 and 12 it then stores one group of four registers, lane by lane
 * with st4, into the 64 bytes at dataptr.
 * NOTE(review): WORDn and the callers of load_x4_word come from
 * sha1_asimd_common.S, which is not visible here.
 */
.macro internal_load windex
	// load 64-bytes from each address to maximize usage of cache line
	.if \windex == 0
		mov	tmp,dataptr
		ld1	{WORD0.4s},[data0],16
		ld1	{WORD4.4s},[data0],16
		ld1	{WORD8.4s},[data0],16
		ld1	{WORD12.4s},[data0],16

		ld1	{WORD1.4s},[data1],16
		ld1	{WORD5.4s},[data1],16
		ld1	{WORD9.4s},[data1],16
		ld1	{WORD13.4s},[data1],16

		ld1	{WORD2.4s},[data2],16
		ld1	{WORD6.4s},[data2],16
		ld1	{WORD10.4s},[data2],16
		ld1	{WORD14.4s},[data2],16

		ld1	{WORD3.4s},[data3],16
		ld1	{WORD7.4s},[data3],16
		ld1	{WORD11.4s},[data3],16
		ld1	{WORD15.4s},[data3],16

		st4	{WORD0.s,WORD1.s,WORD2.s,WORD3.s}[0],[tmp],16
		st4	{WORD0.s,WORD1.s,WORD2.s,WORD3.s}[1],[tmp],16
		st4	{WORD0.s,WORD1.s,WORD2.s,WORD3.s}[2],[tmp],16
		st4	{WORD0.s,WORD1.s,WORD2.s,WORD3.s}[3],[tmp],16
	.endif

	.if \windex == 4
		mov	tmp,dataptr
		st4	{WORD4.s,WORD5.s,WORD6.s,WORD7.s}[0],[tmp],16
		st4	{WORD4.s,WORD5.s,WORD6.s,WORD7.s}[1],[tmp],16
		st4	{WORD4.s,WORD5.s,WORD6.s,WORD7.s}[2],[tmp],16
		st4	{WORD4.s,WORD5.s,WORD6.s,WORD7.s}[3],[tmp],16
	.endif

	.if \windex == 8
		mov	tmp,dataptr
		st4	{WORD8.s,WORD9.s,WORD10.s,WORD11.s}[0],[tmp],16
		st4	{WORD8.s,WORD9.s,WORD10.s,WORD11.s}[1],[tmp],16
		st4	{WORD8.s,WORD9.s,WORD10.s,WORD11.s}[2],[tmp],16
		st4	{WORD8.s,WORD9.s,WORD10.s,WORD11.s}[3],[tmp],16
	.endif

	.if \windex == 12
		mov	tmp,dataptr
		st4	{WORD12.s,WORD13.s,WORD14.s,WORD15.s}[0],[tmp],16
		st4	{WORD12.s,WORD13.s,WORD14.s,WORD15.s}[1],[tmp],16
		st4	{WORD12.s,WORD13.s,WORD14.s,WORD15.s}[2],[tmp],16
		st4	{WORD12.s,WORD13.s,WORD14.s,WORD15.s}[3],[tmp],16
	.endif
.endm

/*
 * load_x4_word idx
 *
 * Runs internal_load for idx, then reloads WORD<idx> from dataptr
 * (post-incremented by 16).
 */
.macro load_x4_word idx:req
	internal_load \idx
	ld1 {WORD\idx\().16b},[dataptr],16
.endm

/*
 * void sha1_mb_asimd_x4(SHA1_JOB *j0, SHA1_JOB*j1, SHA1_JOB*j2, SHA1_JOB *j3, int blocks)
 *
 * For each job the data pointer is read from offset 0 and the five digest
 * words from offset 64; job N occupies vector lane N of VA..VE.  The updated
 * digests are written back to the same offsets before returning.  Nothing is
 * done when blocks == 0.
 */
	job0	.req	x0
	job1	.req	x1
	job2	.req	x2
	job3	.req	x3
	num_blocks	.req	w4
	tmp	.req	x5
	data0	.req	x6
	data1	.req	x7
	data2	.req	x8
	data3	.req	x9
	databuf	.req	x10
	dataptr	.req	x11
	savedsp	.req	x12

	.global sha1_mb_asimd_x4
	.type sha1_mb_asimd_x4, %function
sha1_mb_asimd_x4:
	cmp	num_blocks, #0
	beq	.return
	sha1_asimd_save_stack
	// carve a 64-byte aligned scratch buffer (at least 256 bytes) below sp
	mov	savedsp,sp
	sub	databuf,sp,256
	mov	tmp,63
	bic	databuf,databuf,tmp
	mov	sp,databuf

	// load digest words and data pointer of each job into its lane
	add	tmp,job0,64
	ld4	{VA.s,VB.s,VC.s,VD.s}[0],[tmp],#16
	ld1	{VE.s}[0],[tmp]
	ldr	data0,[job0]

	add	tmp,job1,64
	ld4	{VA.s,VB.s,VC.s,VD.s}[1],[tmp],#16
	ld1	{VE.s}[1],[tmp]
	ldr	data1,[job1]

	add	tmp,job2,64
	ld4	{VA.s,VB.s,VC.s,VD.s}[2],[tmp],#16
	ld1	{VE.s}[2],[tmp]
	ldr	data2,[job2]

	add	tmp,job3,64
	ld4	{VA.s,VB.s,VC.s,VD.s}[3],[tmp],#16
	ld1	{VE.s}[3],[tmp]
	ldr	data3,[job3]

.block_loop:
	// one sha1_single expansion per block; dataptr restarts at the scratch buffer
	mov	dataptr,databuf
	sha1_single
	subs	num_blocks, num_blocks, 1
	bne	.block_loop

	// write the digests back, lane N to job N
	add	tmp,job0,64
	st4	{VA.s,VB.s,VC.s,VD.s}[0],[tmp],#16
	st1	{VE.s}[0],[tmp]

	add	tmp,job1,64
	st4	{VA.s,VB.s,VC.s,VD.s}[1],[tmp],#16
	st1	{VE.s}[1],[tmp]

	add	tmp,job2,64
	st4	{VA.s,VB.s,VC.s,VD.s}[2],[tmp],#16
	st1	{VE.s}[2],[tmp]

	add	tmp,job3,64
	st4	{VA.s,VB.s,VC.s,VD.s}[3],[tmp],#16
	st1	{VE.s}[3],[tmp]

	mov	sp,savedsp
	sha1_asimd_restore_stack
.return:
	ret

	.size sha1_mb_asimd_x4, .-sha1_mb_asimd_x4
	.section .rodata.cst16,"aM",@progbits,16
	// .balign takes a byte count.  The previous ".align 16" is interpreted
	// as a power-of-two exponent on this target, i.e. 64 KiB alignment for
	// a 64-byte table of 16-byte entries.
	.balign 16
KEY_0:
	.word 0x5a827999
	.word 0x5a827999
	.word 0x5a827999
	.word 0x5a827999
KEY_1:
	.word 0x6ed9eba1
	.word 0x6ed9eba1
	.word 0x6ed9eba1
	.word 0x6ed9eba1
KEY_2:
	.word 0x8f1bbcdc
	.word 0x8f1bbcdc
	.word 0x8f1bbcdc
	.word 0x8f1bbcdc
KEY_3:
	.word 0xca62c1d6
	.word 0xca62c1d6
	.word 0xca62c1d6
	.word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c new file mode 100644 index 000000000..4b34e7b53 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c @@ -0,0 +1,217 @@
/**********************************************************************
  Copyright(c) 2021 Arm Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Arm Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <stddef.h> +#include <sha1_mb.h> +#include <assert.h> +#include "endian_helper.h" + +extern void sha1_aarch64_x1(const uint8_t * data, int num_blocks, uint32_t digest[]); +static inline void sha1_job_x1(SHA1_JOB * job, int blocks) +{ + sha1_aarch64_x1(job->buffer, blocks, job->result_digest); +} + +#ifndef min +#define min(a,b) (((a) < (b)) ? (a) : (b)) +#endif + +#define SHA1_MB_ASIMD_MAX_LANES 4 +void sha1_mb_asimd_x4(SHA1_JOB *, SHA1_JOB *, SHA1_JOB *, SHA1_JOB *, int); + +#define LANE_IS_NOT_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FREE(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL) +#define LANE_IS_INVALID(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL) + +void sha1_mb_mgr_init_asimd(SHA1_MB_JOB_MGR * state) +{ + unsigned int i; + + state->unused_lanes = 0xf; + state->num_lanes_inuse = 0; + for (i = 0; i < SHA1_MB_ASIMD_MAX_LANES; i++) { + state->unused_lanes <<= 4; + state->unused_lanes |= SHA1_MB_ASIMD_MAX_LANES - 1 - i; + state->lens[i] = i; + state->ldata[i].job_in_lane = 0; + } + + // lanes > SHA1_MB_ASIMD_MAX_LANES is invalid lane + for (; i < SHA1_MAX_LANES; i++) { + state->lens[i] = 0xf; + state->ldata[i].job_in_lane 
= 0; + } +} + +static int sha1_mb_mgr_do_jobs(SHA1_MB_JOB_MGR * state) +{ + int lane_idx, len, i, lanes, blocks; + int lane_idx_array[SHA1_MAX_LANES]; + + if (state->num_lanes_inuse == 0) { + return -1; + } + lanes = 0, len = 0; + for (i = 0; i < SHA1_MAX_LANES && lanes < state->num_lanes_inuse; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + if (lanes) + len = min(len, state->lens[i]); + else + len = state->lens[i]; + lane_idx_array[lanes] = i; + lanes++; + } + } + + if (lanes == 0) + return -1; + lane_idx = len & 0xf; + len = len & (~0xf); + blocks = len >> 4; + + /* for less-than-3-lane job, ASIMD really does not have much advantage + * compared to scalar due to wasted >= 50% capacity + * therefore we only run ASIMD for 3/4 lanes of data + */ + if (lanes == SHA1_MB_ASIMD_MAX_LANES) { + sha1_mb_asimd_x4(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, + state->ldata[lane_idx_array[2]].job_in_lane, + state->ldata[lane_idx_array[3]].job_in_lane, blocks); + } else if (lanes == 3) { + /* in case of 3 lanes, apparently ASIMD will still operate as if + * there were four lanes of data in processing (waste 25% capacity) + * theoretically we can let ASIMD implementation know the number of lanes + * so that it could "at least" save some memory loading time + * but in practice, we can just pass lane 0 as dummy for similar + * cache performance + */ + SHA1_JOB dummy; + dummy.buffer = state->ldata[lane_idx_array[0]].job_in_lane->buffer; + dummy.len = state->ldata[lane_idx_array[0]].job_in_lane->len; + sha1_mb_asimd_x4(state->ldata[lane_idx_array[0]].job_in_lane, + &dummy, + state->ldata[lane_idx_array[1]].job_in_lane, + state->ldata[lane_idx_array[2]].job_in_lane, blocks); + } else { + sha1_job_x1(state->ldata[lane_idx_array[0]].job_in_lane, blocks); + if (lanes >= 2) { + sha1_job_x1(state->ldata[lane_idx_array[1]].job_in_lane, blocks); + } + } + + // only return the min length job + for (i = 0; i < SHA1_MAX_LANES; i++) { + if 
(LANE_IS_NOT_FINISHED(state, i)) { + state->lens[i] -= len; + state->ldata[i].job_in_lane->len -= len; + state->ldata[i].job_in_lane->buffer += len << 2; + } + } + return lane_idx; + +} + +static SHA1_JOB *sha1_mb_mgr_free_lane(SHA1_MB_JOB_MGR * state) +{ + int i; + SHA1_JOB *ret = NULL; + + for (i = 0; i < SHA1_MB_ASIMD_MAX_LANES; i++) { + if (LANE_IS_FINISHED(state, i)) { + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->num_lanes_inuse--; + ret = state->ldata[i].job_in_lane; + ret->status = STS_COMPLETED; + state->ldata[i].job_in_lane = NULL; + break; + } + } + return ret; +} + +static void sha1_mb_mgr_insert_job(SHA1_MB_JOB_MGR * state, SHA1_JOB * job) +{ + int lane_idx; + // add job into lanes + lane_idx = state->unused_lanes & 0xf; + // fatal error + assert(lane_idx < SHA1_MB_ASIMD_MAX_LANES); + state->lens[lane_idx] = (job->len << 4) | lane_idx; + state->ldata[lane_idx].job_in_lane = job; + state->unused_lanes >>= 4; + state->num_lanes_inuse++; +} + +SHA1_JOB *sha1_mb_mgr_submit_asimd(SHA1_MB_JOB_MGR * state, SHA1_JOB * job) +{ +#ifndef NDEBUG + int lane_idx; +#endif + SHA1_JOB *ret; + + // add job into lanes + sha1_mb_mgr_insert_job(state, job); + + ret = sha1_mb_mgr_free_lane(state); + if (ret != NULL) { + return ret; + } + // submit will wait all lane has data + if (state->num_lanes_inuse < SHA1_MB_ASIMD_MAX_LANES) + return NULL; +#ifndef NDEBUG + lane_idx = sha1_mb_mgr_do_jobs(state); + assert(lane_idx != -1); +#else + sha1_mb_mgr_do_jobs(state); +#endif + + // ~ i = lane_idx; + ret = sha1_mb_mgr_free_lane(state); + return ret; +} + +SHA1_JOB *sha1_mb_mgr_flush_asimd(SHA1_MB_JOB_MGR * state) +{ + SHA1_JOB *ret; + ret = sha1_mb_mgr_free_lane(state); + if (ret) { + return ret; + } + + sha1_mb_mgr_do_jobs(state); + return sha1_mb_mgr_free_lane(state); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c new file mode 100644 index 000000000..1dfd67d0c 
--- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c @@ -0,0 +1,208 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <stddef.h> +#include <sha1_mb.h> +#include <assert.h> + +#ifndef max +#define max(a,b) (((a) > (b)) ? (a) : (b)) +#endif + +#ifndef min +#define min(a,b) (((a) < (b)) ? 
(a) : (b)) +#endif + +#define SHA1_MB_CE_MAX_LANES 2 +#if SHA1_MB_CE_MAX_LANES >=2 +void sha1_mb_ce_x2(SHA1_JOB *, SHA1_JOB *, int); +#endif +void sha1_mb_ce_x1(SHA1_JOB *, int); + +#define LANE_IS_NOT_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FREE(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL) +#define LANE_IS_INVALID(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL) +void sha1_mb_mgr_init_ce(SHA1_MB_JOB_MGR * state) +{ + unsigned int i; + + state->unused_lanes = 0xf; + state->num_lanes_inuse = 0; + for (i = 0; i < SHA1_MB_CE_MAX_LANES; i++) { + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->lens[i] = i; + state->ldata[i].job_in_lane = 0; + } + + //lanes > SHA1_MB_CE_MAX_LANES is invalid lane + for (; i < SHA1_MAX_LANES; i++) { + state->lens[i] = 0xf; + state->ldata[i].job_in_lane = 0; + } +} + +static int sha1_mb_mgr_do_jobs(SHA1_MB_JOB_MGR * state) +{ + int lane_idx, len, i, lanes; + + int lane_idx_array[SHA1_MAX_LANES]; + + if (state->num_lanes_inuse == 0) { + return -1; + } +#if SHA1_MB_CE_MAX_LANES == 2 + if (state->num_lanes_inuse == 2) { + len = min(state->lens[0], state->lens[1]); + lane_idx = len & 0xf; + len &= ~0xf; + + sha1_mb_ce_x2(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, len >> 4); + + } else +#endif + { + lanes = 0, len = 0; + for (i = 0; i < SHA1_MAX_LANES && lanes < state->num_lanes_inuse; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + if (lanes) + len = min(len, state->lens[i]); + else + len = state->lens[i]; + lane_idx_array[lanes] = i; + lanes++; + } + } + if (lanes == 0) + return -1; + lane_idx = len & 0xf; + len = len & (~0xf); + +#if SHA1_MB_CE_MAX_LANES >=2 + if (lanes == 2) { + sha1_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane, + 
state->ldata[lane_idx_array[1]].job_in_lane, len >> 4); + } else +#endif + { + sha1_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4); + } + } + //only return the min length job + for (i = 0; i < SHA1_MAX_LANES; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + state->lens[i] -= len; + state->ldata[i].job_in_lane->len -= len; + state->ldata[i].job_in_lane->buffer += len << 2; + } + } + + return lane_idx; + +} + +static SHA1_JOB *sha1_mb_mgr_free_lane(SHA1_MB_JOB_MGR * state) +{ + int i; + SHA1_JOB *ret = NULL; + + for (i = 0; i < SHA1_MB_CE_MAX_LANES; i++) { + if (LANE_IS_FINISHED(state, i)) { + + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->num_lanes_inuse--; + ret = state->ldata[i].job_in_lane; + ret->status = STS_COMPLETED; + state->ldata[i].job_in_lane = NULL; + break; + } + } + return ret; +} + +static void sha1_mb_mgr_insert_job(SHA1_MB_JOB_MGR * state, SHA1_JOB * job) +{ + int lane_idx; + //add job into lanes + lane_idx = state->unused_lanes & 0xf; + //fatal error + assert(lane_idx < SHA1_MB_CE_MAX_LANES); + state->lens[lane_idx] = (job->len << 4) | lane_idx; + state->ldata[lane_idx].job_in_lane = job; + state->unused_lanes >>= 4; + state->num_lanes_inuse++; +} + +SHA1_JOB *sha1_mb_mgr_submit_ce(SHA1_MB_JOB_MGR * state, SHA1_JOB * job) +{ +#ifndef NDEBUG + int lane_idx; +#endif + SHA1_JOB *ret; + + //add job into lanes + sha1_mb_mgr_insert_job(state, job); + + ret = sha1_mb_mgr_free_lane(state); + if (ret != NULL) { + return ret; + } + //submit will wait all lane has data + if (state->num_lanes_inuse < SHA1_MB_CE_MAX_LANES) + return NULL; +#ifndef NDEBUG + lane_idx = sha1_mb_mgr_do_jobs(state); + assert(lane_idx != -1); +#else + sha1_mb_mgr_do_jobs(state); +#endif + + //~ i = lane_idx; + ret = sha1_mb_mgr_free_lane(state); + return ret; +} + +SHA1_JOB *sha1_mb_mgr_flush_ce(SHA1_MB_JOB_MGR * state) +{ + SHA1_JOB *ret; + ret = sha1_mb_mgr_free_lane(state); + if (ret) { + return ret; + } + + sha1_mb_mgr_do_jobs(state); + return 
sha1_mb_mgr_free_lane(state); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S new file mode 100644 index 000000000..bb1929d76 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S @@ -0,0 +1,36 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************/


#include "aarch64_multibinary.h"


/*
 * One mbin_interface invocation per public sha1_ctx_mgr_* entry point.
 * NOTE(review): mbin_interface comes from aarch64_multibinary.h; presumably
 * it emits the public symbol that is resolved through the dispatcher of the
 * same name declared with DEFINE_INTERFACE_DISPATCHER -- confirm there.
 */
mbin_interface sha1_ctx_mgr_submit
mbin_interface sha1_ctx_mgr_init
mbin_interface sha1_ctx_mgr_flush
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S new file mode 100644 index 000000000..22f736793 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S @@ -0,0 +1,194 @@
/**********************************************************************
  Copyright(c) 2019 Arm Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Arm Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + .align 2 + .p2align 3,,7 + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + \name\()_q .req q\reg + \name\()_v .req v\reg + \name\()_s .req s\reg +.endm + +/** +macros for rounds 4-67 +*/ +.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req,k:req + sha1h \e0\()_s, \abcd\()_s + \inst \abcd\()_q,\e1\()_s,\tmp1\()_v.4s + add \tmp1\()_v.4s,\msg3\()_v.4s,\k\()_v.4s + sha1su1 \msg0\()_v.4s,\msg3\()_v.4s + sha1su0 \msg1\()_v.4s,\msg2\()_v.4s,\msg3\()_v.4s +.endm + + +/* +Variable list +*/ + + declare_var_vector_reg key_0,28 + declare_var_vector_reg key_1,29 + declare_var_vector_reg key_2,30 + declare_var_vector_reg key_3,31 + + +/* +digest variables +*/ + declare_var_vector_reg abcd,0 + declare_var_vector_reg e0,1 + declare_var_vector_reg e1,2 + declare_var_vector_reg abcd_saved,3 + declare_var_vector_reg e0_saved,4 +/* +Message variables +*/ + declare_var_vector_reg msg_0,16 + declare_var_vector_reg msg_1,17 + declare_var_vector_reg msg_2,18 + declare_var_vector_reg msg_3,19 +/* +Temporary variables +*/ + declare_var_vector_reg tmp_0,5 + declare_var_vector_reg tmp_1,6 + +/* + void sha1_mb_ce_x1(SHA1_JOB * job, int len); +*/ +/* +Arguments list +*/ + job .req x0 + len .req w1 + data .req x2 + tmp .req x3 + .global sha1_mb_ce_x1 + .type sha1_mb_ce_x1, %function +sha1_mb_ce_x1: +
ldr data, [job] + ldr abcd_q, [job, 64] + ldr e0_s, [job, 80] + adr tmp, KEY + ld1 {key_0_v.4s-key_3_v.4s},[tmp] + +start_loop: + + //load msgs + ld1 {msg_0_v.4s-msg_3_v.4s},[data] + + //adjust loop parameter + add data,data,64 + sub len, len, #1 + cmp len, 0 + //backup digest + mov abcd_saved_v.16b,abcd_v.16b + mov e0_saved_v.16b,e0_v.16b + + rev32 msg_0_v.16b,msg_0_v.16b + rev32 msg_1_v.16b,msg_1_v.16b + add tmp_0_v.4s,msg_0_v.4s,key_0_v.4s + rev32 msg_2_v.16b,msg_2_v.16b + add tmp_1_v.4s,msg_1_v.4s,key_0_v.4s + rev32 msg_3_v.16b,msg_3_v.16b + + /* rounds 0-3 */ + sha1h e1_s,abcd_s + sha1c abcd_q,e0_s,tmp_0_v.4s + add tmp_0_v.4s,msg_2_v.4s,key_0_v.4s + sha1su0 msg_0_v.4s,msg_1_v.4s,msg_2_v.4s + + sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_0 /* rounds 4-7 */ + sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_0 + sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 12-15 */ + sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_1 + sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 20-23 */ + sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_1 + sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 + sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2 + sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_2 /* rounds 36-39 */ + sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_2 + sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_2 + sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2 + sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_3 /* rounds 52-55 */ + sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_3 + sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_3 + sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_3 
+ + /* rounds 68-71 */ + sha1h e0_s,abcd_s + sha1p abcd_q,e1_s,tmp_1_v.4s + add tmp_1_v.4s,msg_3_v.4s,key_3_v.4s + sha1su1 msg_0_v.4s,msg_3_v.4s + + /* rounds 72-75 */ + sha1h e1_s,abcd_s + sha1p abcd_q,e0_s,tmp_0_v.4s + + /* rounds 76-79 */ + sha1h e0_s,abcd_s + sha1p abcd_q,e1_s,tmp_1_v.4s + + + + add abcd_v.4s,abcd_v.4s,abcd_saved_v.4s + add e0_v.2s,e0_v.2s,e0_saved_v.2s + + + bgt start_loop + str abcd_q, [job, 64] + str e0_s, [job, 80] + + ret + + .size sha1_mb_ce_x1, .-sha1_mb_ce_x1 + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +KEY: + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 + .word 0x5a827999 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S new file mode 100644 index 000000000..93f653ad2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S @@ -0,0 +1,253 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + .align 2 + .p2align 3,,7 + +/* +Macros +*/ + +/* +declare_var_vector_reg name,reg: define name_q, name_v and name_s as aliases +of qN, vN and sN, where N is the register number given as reg. +*/ +.macro declare_var_vector_reg name:req,reg:req + \name\()_q .req q\reg + \name\()_v .req v\reg + \name\()_s .req s\reg +.endm + +/** +Macro for rounds 4-67: one 4-round step applied to both lanes. +Every register argument is given without its lane prefix; the body prepends +l0_ and l1_ and issues each instruction once per lane, lane 0 first. +inst is the round instruction chosen by the caller (sha1c, sha1p or sha1m in this file). +tmp0 is accepted as a parameter but is not referenced in the macro body. +*/ +.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req,k:req + sha1h l0_\e0\()_s, l0_\abcd\()_s + sha1h l1_\e0\()_s, l1_\abcd\()_s + + \inst l0_\abcd\()_q,l0_\e1\()_s,l0_\tmp1\()_v.4s + \inst l1_\abcd\()_q,l1_\e1\()_s,l1_\tmp1\()_v.4s + + add l0_\tmp1\()_v.4s,l0_\msg3\()_v.4s,\k\()_v.4s + add l1_\tmp1\()_v.4s,l1_\msg3\()_v.4s,\k\()_v.4s + + sha1su1 l0_\msg0\()_v.4s,l0_\msg3\()_v.4s + sha1su1 l1_\msg0\()_v.4s,l1_\msg3\()_v.4s + + sha1su0 l0_\msg1\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s + sha1su0 l1_\msg1\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s +.endm + + +/* +Variable list +*/ + + declare_var_vector_reg key_0,28 + declare_var_vector_reg key_1,29 + declare_var_vector_reg key_2,30 + declare_var_vector_reg key_3,31 + + +/* +lane variables +*/ + declare_var_vector_reg l0_abcd,0 + declare_var_vector_reg l0_e0,1 + declare_var_vector_reg l0_e1,2 + 
declare_var_vector_reg l0_abcd_saved,3 + declare_var_vector_reg l0_e0_saved,4 + declare_var_vector_reg l0_tmp_0,5 + declare_var_vector_reg l0_tmp_1,6 + declare_var_vector_reg l0_msg_0,16 + declare_var_vector_reg l0_msg_1,17 + declare_var_vector_reg l0_msg_2,18 + declare_var_vector_reg l0_msg_3,19 + + declare_var_vector_reg l1_abcd,7 + declare_var_vector_reg l1_e0,8 + declare_var_vector_reg l1_e1,9 + declare_var_vector_reg l1_abcd_saved,24 + declare_var_vector_reg l1_e0_saved,25 + declare_var_vector_reg l1_tmp_0,26 + declare_var_vector_reg l1_tmp_1,27 + declare_var_vector_reg l1_msg_0,20 + declare_var_vector_reg l1_msg_1,21 + declare_var_vector_reg l1_msg_2,22 + declare_var_vector_reg l1_msg_3,23 + +/* + void sha1_mb_ce_x2(SHA1_JOB * job_0, SHA1_JOB * job_1,int len); + + For each job: reads the message pointer from [job], the 16-byte abcd state + from [job, 64] and the 32-bit e state from [job, 80]; consumes len 64-byte + blocks from each message pointer and stores the updated state back to the + same job offsets. The loop branch (bgt) sits at the bottom of the loop, so + at least one block per lane is always consumed. +*/ + l0_job .req x0 + l1_job .req x1 + len .req w2 + + l0_data .req x3 + l1_data .req x4 + tmp .req x5 + .global sha1_mb_ce_x2 + .type sha1_mb_ce_x2, %function +sha1_mb_ce_x2: + //push d8,d9 to stack + /* v8 and v9 are used below as l1_e0 and l1_e1; the frame is 256 bytes although only d8/d9 are stored in it */ + stp d8, d9, [sp, -256]! 
+ + adr tmp, KEY + ld1 {key_0_v.4s-key_3_v.4s},[tmp] + ldr l0_data, [l0_job] + ldr l1_data, [l1_job] + ldr l0_abcd_q, [l0_job, 64] + ldr l0_e0_s, [l0_job, 80] + ldr l1_abcd_q, [l1_job, 64] + ldr l1_e0_s, [l1_job, 80] + +start_loop: + + //load msgs + ld1 {l0_msg_0_v.4s-l0_msg_3_v.4s},[l0_data] + ld1 {l1_msg_0_v.4s-l1_msg_3_v.4s},[l1_data] + + //adjust loop parameter + add l0_data,l0_data,64 + add l1_data,l1_data,64 + sub len, len, #1 + cmp len, 0 + /* the bgt at the bottom of the loop branches on this comparison */ + //backup digest + mov l0_abcd_saved_v.16b, l0_abcd_v.16b + mov l0_e0_saved_v.16b, l0_e0_v.16b + mov l1_abcd_saved_v.16b, l1_abcd_v.16b + mov l1_e0_saved_v.16b, l1_e0_v.16b + + rev32 l0_msg_0_v.16b, l0_msg_0_v.16b + rev32 l0_msg_1_v.16b, l0_msg_1_v.16b + add l0_tmp_0_v.4s, l0_msg_0_v.4s, key_0_v.4s + rev32 l0_msg_2_v.16b, l0_msg_2_v.16b + add l0_tmp_1_v.4s, l0_msg_1_v.4s, key_0_v.4s + rev32 l0_msg_3_v.16b, l0_msg_3_v.16b + + rev32 l1_msg_0_v.16b, l1_msg_0_v.16b + rev32 l1_msg_1_v.16b, l1_msg_1_v.16b + add l1_tmp_0_v.4s, l1_msg_0_v.4s, key_0_v.4s + rev32 l1_msg_2_v.16b, l1_msg_2_v.16b + add l1_tmp_1_v.4s, l1_msg_1_v.4s, key_0_v.4s + rev32 l1_msg_3_v.16b, l1_msg_3_v.16b + + /* rounds 0-3 */ + sha1h l0_e1_s, l0_abcd_s + sha1c l0_abcd_q, l0_e0_s, l0_tmp_0_v.4s + add l0_tmp_0_v.4s, l0_msg_2_v.4s, key_0_v.4s + sha1su0 l0_msg_0_v.4s, l0_msg_1_v.4s, l0_msg_2_v.4s + + sha1h l1_e1_s, l1_abcd_s + sha1c l1_abcd_q, l1_e0_s, l1_tmp_0_v.4s + add l1_tmp_0_v.4s, l1_msg_2_v.4s, key_0_v.4s + sha1su0 l1_msg_0_v.4s, l1_msg_1_v.4s, l1_msg_2_v.4s + + sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_0 /* rounds 4-7 */ + sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_0 + sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 12-15 */ + sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_1 + sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 20-23 */ + sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_1 + 
sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 + sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2 + sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_2 /* rounds 36-39 */ + sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_2 + sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_2 + sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2 + sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_3 /* rounds 52-55 */ + sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_3 + sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_3 + sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_3 + + /* rounds 68-71 */ + sha1h l0_e0_s, l0_abcd_s + sha1p l0_abcd_q, l0_e1_s, l0_tmp_1_v.4s + add l0_tmp_1_v.4s, l0_msg_3_v.4s, key_3_v.4s + sha1su1 l0_msg_0_v.4s, l0_msg_3_v.4s + + sha1h l1_e0_s, l1_abcd_s + sha1p l1_abcd_q, l1_e1_s, l1_tmp_1_v.4s + add l1_tmp_1_v.4s, l1_msg_3_v.4s, key_3_v.4s + sha1su1 l1_msg_0_v.4s, l1_msg_3_v.4s + + /* rounds 72-75 */ + sha1h l0_e1_s, l0_abcd_s + sha1p l0_abcd_q, l0_e0_s, l0_tmp_0_v.4s + + sha1h l1_e1_s, l1_abcd_s + sha1p l1_abcd_q, l1_e0_s, l1_tmp_0_v.4s + + /* rounds 76-79 */ + sha1h l0_e0_s, l0_abcd_s + sha1p l0_abcd_q, l0_e1_s, l0_tmp_1_v.4s + + sha1h l1_e0_s, l1_abcd_s + sha1p l1_abcd_q, l1_e1_s, l1_tmp_1_v.4s + + + /* add the state saved at the top of this iteration back in, per lane */ + + add l0_abcd_v.4s, l0_abcd_v.4s, l0_abcd_saved_v.4s + add l0_e0_v.2s, l0_e0_v.2s, l0_e0_saved_v.2s + add l1_abcd_v.4s, l1_abcd_v.4s, l1_abcd_saved_v.4s + add l1_e0_v.2s, l1_e0_v.2s, l1_e0_saved_v.2s + + + + + bgt start_loop + + str l0_abcd_q, [l0_job, 64] + str l0_e0_s, [l0_job, 80] + + + str l1_abcd_q, [l1_job, 64] + str l1_e0_s, [l1_job, 80] + + //pop d8,d9 from stack + ldp d8, d9, [sp], 256 + ret + + .size sha1_mb_ce_x2, .-sha1_mb_ce_x2 + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +/* four SHA-1 round constants, each repeated four times; loaded into key_0..key_3 by the ld1 at function entry */ +KEY: + .word 0x5a827999 + .word 0x5a827999 + 
.word 0x5a827999 + .word 0x5a827999 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x6ed9eba1 + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0x8f1bbcdc + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 + .word 0xca62c1d6 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c new file mode 100644 index 000000000..ad91d64ac --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c @@ -0,0 +1,265 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX +#elif (__GNUC__ >= 5) +# pragma GCC target("avx") +#endif + +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include <intrin.h> +# define inline __inline +#endif + +static inline void hash_init_digest(SHA1_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len); +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx); + +void sha1_ctx_mgr_init_avx(SHA1_HASH_CTX_MGR * mgr) +{ + sha1_mb_mgr_init_avx(&mgr->mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. 
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx(SHA1_HASH_CTX_MGR * mgr) +{ + SHA1_HASH_CTX *ctx; + + while (1) { + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. 
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA1_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 + + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif 
+ + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha1_ctx_mgr_init_avx_slver_02020142; +struct slver sha1_ctx_mgr_init_avx_slver = { 0x0142, 0x02, 0x02 }; + +struct slver sha1_ctx_mgr_submit_avx_slver_02020143; +struct slver sha1_ctx_mgr_submit_avx_slver = { 0x0143, 0x02, 0x02 }; + +struct slver sha1_ctx_mgr_flush_avx_slver_02020144; +struct slver sha1_ctx_mgr_flush_avx_slver = { 0x0144, 0x02, 0x02 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c new file mode 100644 index 000000000..85977d4c2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c @@ -0,0 +1,264 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sha1_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include <intrin.h> +# define inline __inline +#endif + +static inline void hash_init_digest(SHA1_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len); +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx); + +void sha1_ctx_mgr_init_avx2(SHA1_HASH_CTX_MGR * mgr) +{ + sha1_mb_mgr_init_avx2(&mgr->mgr); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx2(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. 
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx2(SHA1_HASH_CTX_MGR * mgr) +{ + SHA1_HASH_CTX *ctx; + + while (1) { + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx2(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. 
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA1_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA1_WORD_T * digest) +{ + static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] = + { SHA1_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 + + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; 
+#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha1_ctx_mgr_init_avx2_slver_04020145; +struct slver sha1_ctx_mgr_init_avx2_slver = { 0x0145, 0x02, 0x04 }; + +struct slver sha1_ctx_mgr_submit_avx2_slver_04020146; +struct slver sha1_ctx_mgr_submit_avx2_slver = { 0x0146, 0x02, 0x04 }; + +struct slver sha1_ctx_mgr_flush_avx2_slver_04020147; +struct slver sha1_ctx_mgr_flush_avx2_slver = { 0x0147, 0x02, 0x04 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c new file mode 100644 index 000000000..90e087163 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c @@ -0,0 +1,271 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+/*
+ * Initialise a context manager by initialising the lane manager embedded
+ * in it (mgr->mgr) with the AVX512 initialiser.
+ */
+void sha1_ctx_mgr_init_avx512(SHA1_HASH_CTX_MGR * mgr)
+{
+	sha1_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+/*
+ * Submit `len` bytes at `buffer` for the hash tracked by `ctx`.
+ *
+ * On a rejected request the reason is stored in ctx->error and `ctx` itself
+ * is returned without touching any other state.  Otherwise the request is
+ * recorded in `ctx`, at most one buffered block is handed to the lane
+ * manager here, and the result of sha1_ctx_mgr_resubmit() is returned
+ * (a context, or NULL when the lane manager handed nothing back).
+ */
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx512(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+					  const void *buffer, uint32_t len,
+					  HASH_CTX_FLAG flags)
+{
+	// Any bit outside HASH_ENTIRE is not a valid request flag.
+	if ((flags & (~HASH_ENTIRE)) != 0) {
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	// A context that is still in flight cannot accept more data.
+	if ((ctx->status & HASH_CTX_STS_PROCESSING) != 0) {
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	// A finished context can only be restarted (FIRST), never extended.
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Fresh hash: empty carry-over buffer, zero byte count, initial digest.
+		ctx->partial_block_buffer_length = 0;
+		ctx->total_length = 0;
+		hash_init_digest(ctx->job.result_digest);
+	}
+
+	// Past this point the request is accepted.
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Remember where the caller's data lives and account for its length.
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+	ctx->total_length += len;
+
+	// Mark the context busy, keeping note of whether this is the final chunk.
+	if (flags & HASH_LAST)
+		ctx->status = (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST);
+	else
+		ctx->status = HASH_CTX_STS_PROCESSING;
+
+	// Route data through the carry-over buffer when it already holds bytes,
+	// or when the caller supplied less than one full block.
+	if (ctx->partial_block_buffer_length != 0 || len < SHA1_BLOCK_SIZE) {
+		// Space left before the carry-over buffer holds one whole block.
+		uint32_t room = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		uint32_t n_copy = (len < room) ? len : room;
+
+		if (n_copy != 0) {
+			memcpy_varlen(&ctx->partial_block_buffer
+				      [ctx->partial_block_buffer_length], buffer, n_copy);
+
+			ctx->partial_block_buffer_length += n_copy;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + n_copy);
+			ctx->incoming_buffer_length = len - n_copy;
+		}
+		// Never more than one block may be buffered at this point.
+		assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+		// A full carry-over block is submitted as a one-block job.
+		if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;
+
+			ctx =
+			    (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+		}
+	}
+
+	return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+/*
+ * Flush the lane manager until it yields a context that may be returned
+ * to the caller.  Returns NULL once the lane manager's flush returns NULL.
+ */
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx512(SHA1_HASH_CTX_MGR * mgr)
+{
+	SHA1_HASH_CTX *done;
+
+	for (;;) {
+		done = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx512(&mgr->mgr);
+
+		// Nothing came back from the flush: report that to the caller.
+		if (done == NULL)
+			return NULL;
+
+		// The flushed job may still have data or padding pending; resubmit
+		// takes care of that and returns the context only when it is ready.
+		done = sha1_ctx_mgr_resubmit(mgr, done);
+		if (done != NULL)
+			return done;
+
+		// Resubmit returned NULL: flush again.
+	}
+}
+
+/*
+ * Drive a context returned by the lane manager towards a state in which it
+ * can be handed back to the user: COMPLETE, or IDLE waiting for more input.
+ * While work remains, the next job is submitted and the loop continues with
+ * whatever context the lane manager returns (possibly NULL, which ends it).
+ */
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+	while (ctx != NULL) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			// Drop every other status bit, PROCESSING included.
+			ctx->status = HASH_CTX_STS_COMPLETE;
+			return ctx;
+		}
+
+		// With an empty carry-over buffer, hash straight from the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *src = ctx->incoming_buffer;
+			uint32_t total = ctx->incoming_buffer_length;
+			// Bytes beyond the last whole block are kept for later.
+			uint32_t tail = total & (SHA1_BLOCK_SIZE - 1);
+			uint32_t whole_bytes = total - tail;
+			uint32_t n_blocks;
+
+			if (tail) {
+				memcpy_fixedlen(ctx->partial_block_buffer,
+						((const char *)src + whole_bytes), tail);
+				ctx->partial_block_buffer_length = tail;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			// Only whole blocks are left to hash from the user's buffer.
+			assert((whole_bytes % SHA1_BLOCK_SIZE) == 0);
+
+			n_blocks = whole_bytes >> SHA1_LOG2_BLOCK_SIZE;
+
+			if (n_blocks) {
+				ctx->job.buffer = (uint8_t *) src;
+				ctx->job.len = n_blocks;
+				ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr,
+										  &ctx->job);
+				continue;
+			}
+		}
+
+		// No more user data to hash: either pad and finish, or wait for input.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *pad = ctx->partial_block_buffer;
+			uint32_t n_pad_blocks = hash_pad(pad, ctx->total_length);
+
+			ctx->status =
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = pad;
+			ctx->job.len = (uint32_t) n_pad_blocks;
+			ctx =
+			    (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+			continue;
+		}
+
+		// ctx is known to be non-NULL inside the loop.
+		ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+/* Copy the SHA1_INITIAL_DIGEST words into `digest`. */
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+	static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+	    { SHA1_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+/*
+ * Append the padding to the data already held in `padblock`: a 0x80 byte,
+ * zero fill, and the message length in bits stored big-endian in the last
+ * 8 bytes of the final block.  `total_len` is the message length in bytes.
+ * Returns the number of blocks in `padblock` that must now be hashed.
+ */
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	// Offset of the first free byte inside the buffered block.
+	uint32_t end = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+	// Zero bytes needed so that marker + zeros + length field ends on a block boundary.
+	const uint64_t zero_fill =
+	    (SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1));
+
+	memclr_fixedlen(&padblock[end], SHA1_BLOCK_SIZE);
+	padblock[end] = 0x80;
+
+	// Advance to the end of the first or second block, whichever the padding needs.
+	end += zero_fill + 1 + SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) & padblock[end - 16]) = 0;
+#endif
+
+	*((uint64_t *) & padblock[end - 8]) = to_be64((uint64_t) total_len << 3);
+
+	return end >> SHA1_LOG2_BLOCK_SIZE;
+}
+
+/*
+ * Version records.  Each entry point gets a tentative definition whose name
+ * carries the encoded version, plus an initialised {snum, ver, core} record.
+ */
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx512_slver_0600014a;
+struct slver sha1_ctx_mgr_init_avx512_slver = { 0x014a, 0x00, 0x06 };
+
+struct slver sha1_ctx_mgr_submit_avx512_slver_0600014b;
+struct slver sha1_ctx_mgr_submit_avx512_slver = { 0x014b, 0x00, 0x06 };
+
+struct slver sha1_ctx_mgr_flush_avx512_slver_0600014c;
+struct slver sha1_ctx_mgr_flush_avx512_slver = { 0x014c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git
a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c
new file mode 100644
index 000000000..2013f829a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c
@@ -0,0 +1,281 @@
+/**********************************************************************
+  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+/**
+ * The sha1_ctx_*_avx512_ni functions target Cannon Lake.
+ * SHA-NI is still slower than the multi-buffer code when all lanes are
+ * occupied, so sha1_ctx_mgr_init_avx512_ni and sha1_ctx_mgr_submit_avx512_ni
+ * use the same lane-manager routines as their avx512 counterparts.
+ * Only sha1_ctx_mgr_flush_avx512_ni differs: it calls
+ * sha1_mb_mgr_flush_avx512_ni, which switches to SHA-NI when fewer lanes
+ * than a threshold are in use.
+ */
+#if defined(HAVE_AS_KNOWS_AVX512) && defined(HAVE_AS_KNOWS_SHANI)
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+/*
+ * Initialise a context manager by initialising the lane manager embedded
+ * in it (mgr->mgr); the plain AVX512 initialiser is used.
+ */
+void sha1_ctx_mgr_init_avx512_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+	sha1_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+/*
+ * Submit `len` bytes at `buffer` for the hash tracked by `ctx`.
+ *
+ * On a rejected request the reason is stored in ctx->error and `ctx` itself
+ * is returned without touching any other state.  Otherwise the request is
+ * recorded in `ctx`, at most one buffered block is handed to the lane
+ * manager here, and the result of sha1_ctx_mgr_resubmit() is returned
+ * (a context, or NULL when the lane manager handed nothing back).
+ */
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx512_ni(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+					     const void *buffer, uint32_t len,
+					     HASH_CTX_FLAG flags)
+{
+	// Any bit outside HASH_ENTIRE is not a valid request flag.
+	if ((flags & (~HASH_ENTIRE)) != 0) {
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	// A context that is still in flight cannot accept more data.
+	if ((ctx->status & HASH_CTX_STS_PROCESSING) != 0) {
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	// A finished context can only be restarted (FIRST), never extended.
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Fresh hash: empty carry-over buffer, zero byte count, initial digest.
+		ctx->partial_block_buffer_length = 0;
+		ctx->total_length = 0;
+		hash_init_digest(ctx->job.result_digest);
+	}
+
+	// Past this point the request is accepted.
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Remember where the caller's data lives and account for its length.
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+	ctx->total_length += len;
+
+	// Mark the context busy, keeping note of whether this is the final chunk.
+	if (flags & HASH_LAST)
+		ctx->status = (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST);
+	else
+		ctx->status = HASH_CTX_STS_PROCESSING;
+
+	// Route data through the carry-over buffer when it already holds bytes,
+	// or when the caller supplied less than one full block.
+	if (ctx->partial_block_buffer_length != 0 || len < SHA1_BLOCK_SIZE) {
+		// Space left before the carry-over buffer holds one whole block.
+		uint32_t room = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		uint32_t n_copy = (len < room) ? len : room;
+
+		if (n_copy != 0) {
+			memcpy_varlen(&ctx->partial_block_buffer
+				      [ctx->partial_block_buffer_length], buffer, n_copy);
+
+			ctx->partial_block_buffer_length += n_copy;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + n_copy);
+			ctx->incoming_buffer_length = len - n_copy;
+		}
+		// Never more than one block may be buffered at this point.
+		assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+		// A full carry-over block is submitted as a one-block job.
+		if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;
+
+			ctx =
+			    (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+		}
+	}
+
+	return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+/*
+ * Flush the lane manager (through the SHA-NI aware flush routine) until it
+ * yields a context that may be returned to the caller.  Returns NULL once
+ * the lane manager's flush returns NULL.
+ */
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx512_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+	SHA1_HASH_CTX *done;
+
+	for (;;) {
+		done = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx512_ni(&mgr->mgr);
+
+		// Nothing came back from the flush: report that to the caller.
+		if (done == NULL)
+			return NULL;
+
+		// The flushed job may still have data or padding pending; resubmit
+		// takes care of that and returns the context only when it is ready.
+		done = sha1_ctx_mgr_resubmit(mgr, done);
+		if (done != NULL)
+			return done;
+
+		// Resubmit returned NULL: flush again.
+	}
+}
+
+/*
+ * Drive a context returned by the lane manager towards a state in which it
+ * can be handed back to the user: COMPLETE, or IDLE waiting for more input.
+ * While work remains, the next job is submitted and the loop continues with
+ * whatever context the lane manager returns (possibly NULL, which ends it).
+ */
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+	while (ctx != NULL) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			// Drop every other status bit, PROCESSING included.
+			ctx->status = HASH_CTX_STS_COMPLETE;
+			return ctx;
+		}
+
+		// With an empty carry-over buffer, hash straight from the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *src = ctx->incoming_buffer;
+			uint32_t total = ctx->incoming_buffer_length;
+			// Bytes beyond the last whole block are kept for later.
+			uint32_t tail = total & (SHA1_BLOCK_SIZE - 1);
+			uint32_t whole_bytes = total - tail;
+			uint32_t n_blocks;
+
+			if (tail) {
+				memcpy_fixedlen(ctx->partial_block_buffer,
+						((const char *)src + whole_bytes), tail);
+				ctx->partial_block_buffer_length = tail;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			// Only whole blocks are left to hash from the user's buffer.
+			assert((whole_bytes % SHA1_BLOCK_SIZE) == 0);
+
+			n_blocks = whole_bytes >> SHA1_LOG2_BLOCK_SIZE;
+
+			if (n_blocks) {
+				ctx->job.buffer = (uint8_t *) src;
+				ctx->job.len = n_blocks;
+				ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr,
+										  &ctx->job);
+				continue;
+			}
+		}
+
+		// No more user data to hash: either pad and finish, or wait for input.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *pad = ctx->partial_block_buffer;
+			uint32_t n_pad_blocks = hash_pad(pad, ctx->total_length);
+
+			ctx->status =
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = pad;
+			ctx->job.len = (uint32_t) n_pad_blocks;
+			ctx =
+			    (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+			continue;
+		}
+
+		// ctx is known to be non-NULL inside the loop.
+		ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+/* Copy the SHA1_INITIAL_DIGEST words into `digest`. */
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+	static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+	    { SHA1_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+/*
+ * Append the padding to the data already held in `padblock`: a 0x80 byte,
+ * zero fill, and the message length in bits stored big-endian in the last
+ * 8 bytes of the final block.  `total_len` is the message length in bytes.
+ * Returns the number of blocks in `padblock` that must now be hashed.
+ */
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	// Offset of the first free byte inside the buffered block.
+	uint32_t end = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+	// Zero bytes needed so that marker + zeros + length field ends on a block boundary.
+	const uint64_t zero_fill =
+	    (SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1));
+
+	memclr_fixedlen(&padblock[end], SHA1_BLOCK_SIZE);
+	padblock[end] = 0x80;
+
+	// Advance to the end of the first or second block, whichever the padding needs.
+	end += zero_fill + 1 + SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) & padblock[end - 16]) = 0;
+#endif
+
+	*((uint64_t *) & padblock[end - 8]) = to_be64((uint64_t) total_len << 3);
+
+	return end >> SHA1_LOG2_BLOCK_SIZE;
+}
+
+/*
+ * Version records.  Each entry point gets a tentative definition whose name
+ * carries the encoded version, plus an initialised {snum, ver, core} record.
+ */
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx512_ni_slver_080002c4;
+struct slver sha1_ctx_mgr_init_avx512_ni_slver = { 0x02c4, 0x00, 0x08 };
+
+struct slver sha1_ctx_mgr_submit_avx512_ni_slver_080002c5;
+struct slver sha1_ctx_mgr_submit_avx512_ni_slver = { 0x02c5, 0x00, 0x08 };
+
+struct slver sha1_ctx_mgr_flush_avx512_ni_slver_080002c6;
+struct slver sha1_ctx_mgr_flush_avx512_ni_slver = { 0x02c6, 0x00, 0x08 };
+
+#endif // HAVE_AS_KNOWS_AVX512 and HAVE_AS_KNOWS_SHANI
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c
new file mode 100644
index 000000000..90481efd0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c
@@ -0,0 +1,325 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+// GCC >= 11: keep sha1_single() out of inter-procedural optimisation.
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+// Round functions for steps 0-19, 20-39, 40-59 and 60-79.
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (b ^ c ^ d)
+#define F3(b,c,d) ((b & c) | (d & (b | c)))
+#define F4(b,c,d) (b ^ c ^ d)
+
+// 32-bit rotate left; r must be in 1..31.
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+// The message schedule is kept as a 16-word ring buffer.
+#define W(x) w[(x) & 15]
+
+// Steps 0-15 load the (big-endian) message words; later steps extend the schedule.
+#define step00_19(i,a,b,c,d,e) \
+	if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+	else W(i) = to_be32(ww[i]); \
+	e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+	b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+	W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+	e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+	b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+	W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+	e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+	b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+	W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+	e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+	b = rol32(b,30)
+
+static void sha1_init(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t sha1_update(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void sha1_final(SHA1_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX sha1_single(const void *data, uint32_t digest[]);
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+
+/* The base implementation keeps no manager state, so there is nothing to set up. */
+void sha1_ctx_mgr_init_base(SHA1_HASH_CTX_MGR * mgr)
+{
+}
+
+/*
+ * Hash `len` bytes at `buffer` into `ctx` synchronously.
+ *
+ * `flags` selects the phase: HASH_FIRST starts a hash, HASH_UPDATE continues
+ * it, HASH_LAST finishes it and HASH_ENTIRE does all three in one call.
+ * Buffers passed with FIRST/UPDATE may have any length: bytes that do not
+ * fill a whole block are carried over in ctx->partial_block_buffer until the
+ * next call.
+ *
+ * Always returns `ctx`.  A rejected request is reported through ctx->error;
+ * an accepted one leaves ctx->error set to HASH_CTX_ERROR_NONE.
+ */
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_base(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+					const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+	uint32_t remain_len;
+
+	if (flags & (~HASH_ENTIRE)) {
+		// User should not pass anything other than FIRST, UPDATE, or LAST
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+		// Cannot submit a new entire job to a currently processing job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// Cannot update a finished job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	// The request is valid: clear any error left by an earlier rejected call,
+	// as the SIMD submit functions do.
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	switch (flags) {
+	case HASH_FIRST:
+		sha1_init(ctx, buffer, len);
+		sha1_update(ctx, buffer, len);
+		break;
+	case HASH_UPDATE:
+		sha1_update(ctx, buffer, len);
+		break;
+	case HASH_LAST:
+		remain_len = sha1_update(ctx, buffer, len);
+		sha1_final(ctx, remain_len);
+		break;
+	case HASH_ENTIRE:
+		sha1_init(ctx, buffer, len);
+		remain_len = sha1_update(ctx, buffer, len);
+		sha1_final(ctx, remain_len);
+		break;
+	default:
+		// Unreachable: other values were rejected by the mask check above.
+		break;
+	}
+
+	return ctx;
+}
+
+/* Submissions complete synchronously, so there is never a job to flush. */
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_base(SHA1_HASH_CTX_MGR * mgr)
+{
+	return NULL;
+}
+
+/*
+ * Reset `ctx` for a new hash.  `buffer` and `len` are unused; they are kept
+ * so the helper's signature stays as declared above.
+ */
+static void sha1_init(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+	// Init digest
+	hash_init_digest(ctx->job.result_digest);
+
+	// Reset byte counter
+	ctx->total_length = 0;
+
+	// Clear extra blocks
+	ctx->partial_block_buffer_length = 0;
+
+	// If we made it here, there were no errors during this call to submit
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Mark it as processing
+	ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+/*
+ * Hash every whole block available from the carried-over bytes plus
+ * `buffer`, and stash what is left (less than one block) in
+ * ctx->partial_block_buffer.
+ *
+ * ctx->total_length is advanced only for bytes that were actually hashed;
+ * sha1_final() accounts for the stashed remainder.  ctx->incoming_buffer is
+ * left pointing at the first byte of `buffer` that was not hashed.
+ *
+ * Returns the number of bytes now held in ctx->partial_block_buffer.
+ */
+static uint32_t sha1_update(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+	const uint8_t *in = (const uint8_t *)buffer;
+	uint32_t remain_len = len;
+	uint32_t *digest = ctx->job.result_digest;
+
+	// Top up a partial block left behind by an earlier FIRST/UPDATE call.
+	if (ctx->partial_block_buffer_length != 0 && remain_len != 0) {
+		uint32_t fill = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+
+		if (fill > remain_len)
+			fill = remain_len;
+
+		memcpy(&ctx->partial_block_buffer[ctx->partial_block_buffer_length], in, fill);
+		ctx->partial_block_buffer_length += fill;
+		in += fill;
+		remain_len -= fill;
+
+		if (ctx->partial_block_buffer_length == SHA1_BLOCK_SIZE) {
+			sha1_single(ctx->partial_block_buffer, digest);
+			ctx->total_length += SHA1_BLOCK_SIZE;
+			ctx->partial_block_buffer_length = 0;
+		}
+	}
+
+	// Hash whole blocks directly from the caller's buffer.
+	while (remain_len >= SHA1_BLOCK_SIZE) {
+		sha1_single(in, digest);
+		in += SHA1_BLOCK_SIZE;
+		remain_len -= SHA1_BLOCK_SIZE;
+		ctx->total_length += SHA1_BLOCK_SIZE;
+	}
+
+	// Keep the tail for the next call.  The carry-over buffer is empty here:
+	// had it stayed partially filled above, remain_len would already be 0.
+	if (remain_len != 0) {
+		memcpy(ctx->partial_block_buffer, in, remain_len);
+		ctx->partial_block_buffer_length = remain_len;
+	}
+
+	ctx->status = HASH_CTX_STS_IDLE;
+	ctx->incoming_buffer = in;
+	return ctx->partial_block_buffer_length;
+}
+
+/*
+ * Finish the hash: pad the `remain_len` bytes carried over in
+ * ctx->partial_block_buffer (0x80, zero fill, 64-bit big-endian bit count),
+ * hash the resulting one or two blocks and mark the context complete.
+ */
+static void sha1_final(SHA1_HASH_CTX * ctx, uint32_t remain_len)
+{
+	uint32_t i = remain_len;
+	uint8_t buf[2 * SHA1_BLOCK_SIZE];
+	uint32_t *digest = ctx->job.result_digest;
+	uint64_t be_bit_count;
+
+	ctx->total_length += i;
+	memcpy(buf, ctx->partial_block_buffer, i);
+	buf[i++] = 0x80;
+	memset(buf + i, 0, sizeof(buf) - i);
+
+	// A second block is needed when the length field no longer fits in the first.
+	if (i > SHA1_BLOCK_SIZE - SHA1_PADLENGTHFIELD_SIZE)
+		i = 2 * SHA1_BLOCK_SIZE;
+	else
+		i = SHA1_BLOCK_SIZE;
+
+	// Store through memcpy: buf is a byte array with no 8-byte alignment guarantee.
+	be_bit_count = to_be64((uint64_t) ctx->total_length * 8);
+	memcpy(buf + i - 8, &be_bit_count, sizeof(be_bit_count));
+
+	sha1_single(buf, digest);
+	if (i == 2 * SHA1_BLOCK_SIZE) {
+		sha1_single(buf + SHA1_BLOCK_SIZE, digest);
+	}
+
+	ctx->partial_block_buffer_length = 0;
+	ctx->status = HASH_CTX_STS_COMPLETE;
+}
+
+/*
+ * Run the 80-step SHA-1 compression over the 64 bytes at `data` and add the
+ * result into the five-word `digest`.
+ */
+static void OPT_FIX sha1_single(const void *data, uint32_t digest[])
+{
+	uint32_t a, b, c, d, e;
+	uint32_t w[16] = { 0 };
+	uint32_t block[16];
+	const uint32_t *ww = block;
+
+	// Copy the block so the word loads below are aligned, whatever the
+	// alignment of the caller's buffer.
+	memcpy(block, data, sizeof(block));
+
+	a = digest[0];
+	b = digest[1];
+	c = digest[2];
+	d = digest[3];
+	e = digest[4];
+
+	step00_19(0, a, b, c, d, e);
+	step00_19(1, e, a, b, c, d);
+	step00_19(2, d, e, a, b, c);
+	step00_19(3, c, d, e, a, b);
+	step00_19(4, b, c, d, e, a);
+	step00_19(5, a, b, c, d, e);
+	step00_19(6, e, a, b, c, d);
+	step00_19(7, d, e, a, b, c);
+	step00_19(8, c, d, e, a, b);
+	step00_19(9, b, c, d, e, a);
+	step00_19(10, a, b, c, d, e);
+	step00_19(11, e, a, b, c, d);
+	step00_19(12, d, e, a, b, c);
+	step00_19(13, c, d, e, a, b);
+	step00_19(14, b, c, d, e, a);
+	step00_19(15, a, b, c, d, e);
+	step00_19(16, e, a, b, c, d);
+	step00_19(17, d, e, a, b, c);
+	step00_19(18, c, d, e, a, b);
+	step00_19(19, b, c, d, e, a);
+
+	step20_39(20, a, b, c, d, e);
+	step20_39(21, e, a, b, c, d);
+	step20_39(22, d, e, a, b, c);
+	step20_39(23, c, d, e, a, b);
+	step20_39(24, b, c, d, e, a);
+	step20_39(25, a, b, c, d, e);
+	step20_39(26, e, a, b, c, d);
+	step20_39(27, d, e, a, b, c);
+	step20_39(28, c, d, e, a, b);
+	step20_39(29, b, c, d, e, a);
+	step20_39(30, a, b, c, d, e);
+	step20_39(31, e, a, b, c, d);
+	step20_39(32, d, e, a, b, c);
+	step20_39(33, c, d, e, a, b);
+	step20_39(34, b, c, d, e, a);
+	step20_39(35, a, b, c, d, e);
+	step20_39(36, e, a, b, c, d);
+	step20_39(37, d, e, a, b, c);
+	step20_39(38, c, d, e, a, b);
+	step20_39(39, b, c, d, e, a);
+
+	step40_59(40, a, b, c, d, e);
+	step40_59(41, e, a, b, c, d);
+	step40_59(42, d, e, a, b, c);
+	step40_59(43, c, d, e, a, b);
+	step40_59(44, b, c, d, e, a);
+	step40_59(45, a, b, c, d, e);
+	step40_59(46, e, a, b, c, d);
+	step40_59(47, d, e, a, b, c);
+	step40_59(48, c, d, e, a, b);
+	step40_59(49, b, c, d, e, a);
+	step40_59(50, a, b, c, d, e);
+	step40_59(51, e, a, b, c, d);
+	step40_59(52, d, e, a, b, c);
+	step40_59(53, c, d, e, a, b);
+	step40_59(54, b, c, d, e, a);
+	step40_59(55, a, b, c, d, e);
+	step40_59(56, e, a, b, c, d);
+	step40_59(57, d, e, a, b, c);
+	step40_59(58, c, d, e, a, b);
+	step40_59(59, b, c, d, e, a);
+
+	step60_79(60, a, b, c, d, e);
+	step60_79(61, e, a, b, c, d);
+	step60_79(62, d, e, a, b, c);
+	step60_79(63, c, d, e, a, b);
+	step60_79(64, b, c, d, e, a);
+	step60_79(65, a, b, c, d, e);
+	step60_79(66, e, a, b, c, d);
+	step60_79(67, d, e, a, b, c);
+	step60_79(68, c, d, e, a, b);
+	step60_79(69, b, c, d, e, a);
+	step60_79(70, a, b, c, d, e);
+	step60_79(71, e, a, b, c, d);
+	step60_79(72, d, e, a, b, c);
+	step60_79(73, c, d, e, a, b);
+	step60_79(74, b, c, d, e, a);
+	step60_79(75, a, b, c, d, e);
+	step60_79(76, e, a, b, c, d);
+	step60_79(77, d, e, a, b, c);
+	step60_79(78, c, d, e, a, b);
+	step60_79(79, b, c, d, e, a);
+
+	digest[0] += a;
+	digest[1] += b;
+	digest[2] += c;
+	digest[3] += d;
+	digest[4] += e;
+}
+
+/* Copy the SHA1_INITIAL_DIGEST words into `digest`. */
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+	static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+	    { SHA1_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+/*
+ * Version records.  Each entry point gets a tentative definition whose name
+ * carries the encoded version, plus an initialised {snum, ver, core} record.
+ */
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+
+struct slver sha1_ctx_mgr_init_base_slver_00000192;
+struct slver sha1_ctx_mgr_init_base_slver = { 0x0192, 0x00, 0x00 };
+
+struct slver sha1_ctx_mgr_submit_base_slver_00000193;
+struct slver sha1_ctx_mgr_submit_base_slver = { 0x0193, 0x00, 0x00 };
+
+struct slver sha1_ctx_mgr_flush_base_slver_00000194;
+struct slver sha1_ctx_mgr_flush_base_slver = { 0x0194, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c
new file mode 100644
index 000000000..32eb07f6e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+
+extern void sha1_ctx_mgr_init_base(SHA1_HASH_CTX_MGR * mgr);
+extern SHA1_HASH_CTX *sha1_ctx_mgr_submit_base(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+					       const void *buffer, uint32_t len,
+					       HASH_CTX_FLAG flags);
+extern SHA1_HASH_CTX *sha1_ctx_mgr_flush_base(SHA1_HASH_CTX_MGR * mgr);
+
+/*
+ * Generic entry point that forwards to the base implementation.
+ * (A void function must not `return` an expression in ISO C, so the call
+ * is made as a plain statement.)
+ */
+void sha1_ctx_mgr_init(SHA1_HASH_CTX_MGR * mgr)
+{
+	sha1_ctx_mgr_init_base(mgr);
+}
+
+/* Generic entry point that forwards every argument to sha1_ctx_mgr_submit_base(). */
+SHA1_HASH_CTX *sha1_ctx_mgr_submit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+				   const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+	return sha1_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
+
+/* Generic entry point that forwards to sha1_ctx_mgr_flush_base(). */
+SHA1_HASH_CTX *sha1_ctx_mgr_flush(SHA1_HASH_CTX_MGR * mgr)
+{
+	return sha1_ctx_mgr_flush_base(mgr);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c
new file mode 100644
index 000000000..db70ee015
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c
@@ -0,0 +1,251 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+/* Initialise the job manager embedded in the context manager (SSE variant). */
+void sha1_ctx_mgr_init_sse(SHA1_HASH_CTX_MGR * mgr)
+{
+	sha1_mb_mgr_init_sse(&mgr->mgr);
+}
+
+/*
+ * Submit len bytes at buffer to the hash context ctx.
+ *
+ * flags may only contain bits of HASH_ENTIRE (FIRST / UPDATE / LAST).
+ *
+ * Returns:
+ *  - ctx itself, with ctx->error set, when the request is rejected (invalid
+ *    flags, context still processing, or context already completed and
+ *    HASH_FIRST not given);
+ *  - otherwise the result of sha1_ctx_mgr_resubmit(): a context that is
+ *    either complete or idle waiting for more input (it may be a different
+ *    context than the one passed in, since the job manager decides which
+ *    job it hands back), or NULL when the job manager returned no job.
+ *
+ * NOTE(review): whole blocks are hashed straight from the caller's buffer
+ * (job.buffer points into it), so the buffer presumably has to stay valid
+ * until the context is handed back -- confirm against sha1_mb.h.
+ */
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_sse(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+				       const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		// User should not pass anything other than FIRST, UPDATE, or LAST
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		// Cannot submit to a currently processing job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// Cannot update a finished job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Init digest
+		hash_init_digest(ctx->job.result_digest);
+
+		// Reset byte counter
+		ctx->total_length = 0;
+
+		// Clear extra blocks
+		ctx->partial_block_buffer_length = 0;
+	}
+	// If we made it here, there were no errors during this call to submit
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Store buffer ptr info from user
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	// Store the user's request flags and mark this ctx as currently being processed.
+	ctx->status = (flags & HASH_LAST) ?
+	    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+	    HASH_CTX_STS_PROCESSING;
+
+	// Advance byte counter
+	ctx->total_length += len;
+
+	// If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+	// Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+	if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+		// Compute how many bytes to copy from user buffer into extra block
+		uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			// Copy and update relevant pointers and counters
+			memcpy_varlen(&ctx->partial_block_buffer
+				      [ctx->partial_block_buffer_length], buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+		// The extra block should never contain more than 1 block here
+		assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+		// If the extra block buffer contains exactly 1 block, it can be hashed.
+		if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;	// job.len counts blocks, not bytes
+
+			// The job manager hands back a job pointer (possibly for another
+			// context, possibly NULL); it is reinterpreted as its owning context.
+			ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+		}
+	}
+
+	return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+/*
+ * Drain the job manager until a context can be handed back to the caller.
+ *
+ * Returns a context that is complete or idle, or NULL once the job manager
+ * reports that no jobs remain in flight.
+ */
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_sse(SHA1_HASH_CTX_MGR * mgr)
+{
+	SHA1_HASH_CTX *ctx;
+
+	while (1) {
+		ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_sse(&mgr->mgr);
+
+		// If flush returned 0, there are no more jobs in flight.
+		if (!ctx)
+			return NULL;
+
+		// If flush returned a job, verify that it is safe to return to the user.
+		// If it is not ready, resubmit the job to finish processing.
+		ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+		// If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+		if (ctx)
+			return ctx;
+
+		// Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+	}
+}
+
+/*
+ * Advance ctx (a context just handed back by the job manager) through its
+ * remaining stages: whole blocks still in the user's buffer, then the padded
+ * final block(s) when HASH_CTX_STS_LAST is set.  Each stage is queued with
+ * sha1_mb_mgr_submit_sse(), whose return value becomes the next context to
+ * examine.
+ *
+ * Returns the first context that is complete or that needs more user input
+ * (status set to COMPLETE or IDLE respectively), or NULL when the job manager
+ * returns no job.
+ */
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+	while (ctx) {
+
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
+			return ctx;
+		}
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+
+			// Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+			uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy_varlen(ctx->partial_block_buffer,
+					      ((const char *)buffer + len), copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			// len should be a multiple of the block size now
+			assert((len % SHA1_BLOCK_SIZE) == 0);
+
+			// Set len to the number of blocks to be hashed in the user's buffer
+			len >>= SHA1_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr,
+									       &ctx->job);
+				continue;
+			}
+		}
+		// If the extra blocks are not empty, then we are either on the last block(s)
+		// or we need more user input before continuing.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+			// COMPLETE is set now so the context is returned the next time
+			// the job manager hands it back.
+			ctx->status =
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+			continue;
+		}
+
+		// Nothing more can be hashed until the user supplies more data.
+		// (ctx is known to be non-NULL here: it was tested by the loop condition.)
+		ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+/* Load the SHA-1 initial chaining values into digest (SHA1_DIGEST_NWORDS words). */
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+	static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+	    { SHA1_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+/*
+ * Append the SHA-1 padding to the partial block held in padblock.
+ *
+ * padblock  - buffer of 2 * SHA1_BLOCK_SIZE bytes whose first
+ *             (total_len % SHA1_BLOCK_SIZE) bytes are the unhashed tail.
+ * total_len - total message length in bytes.
+ *
+ * Writes the 0x80 marker, zero fill and the message length in bits into the
+ * last 8 bytes of the final block, and returns how many blocks (1 or 2) of
+ * padblock must be hashed.
+ */
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+	// Message length in bits, converted by to_be64() (presumably host to big-endian).
+	const uint64_t be_bit_len = to_be64((uint64_t) total_len << 3);
+
+	memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	// Move i to the end of either 1st or 2nd extra block depending on length
+	i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+	    SHA1_PADLENGTHFIELD_SIZE;
+
+	// Store through memset/memcpy rather than a uint64_t* cast of the byte
+	// buffer: the cast violates the aliasing rules and assumes alignment.
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+	memset(&padblock[i - 16], 0, sizeof(uint64_t));
+#endif
+
+	memcpy(&padblock[i - 8], &be_bit_len, sizeof(be_bit_len));
+
+	return i >> SHA1_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
+}
+
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_sse_slver_00020139;
+struct slver sha1_ctx_mgr_init_sse_slver = { 0x0139, 0x02, 0x00 };
+
+struct slver sha1_ctx_mgr_submit_sse_slver_00020140;
+struct slver sha1_ctx_mgr_submit_sse_slver = { 0x0140, 0x02, 0x00 };
+
+struct slver sha1_ctx_mgr_flush_sse_slver_00020141;
+struct slver sha1_ctx_mgr_flush_sse_slver = { 0x0141, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c
new file mode 100644
index 000000000..d3c7687d2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c
@@ -0,0 +1,259 @@
+/**********************************************************************
+  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+// The whole implementation is compiled only when HAVE_AS_KNOWS_SHANI is defined.
+#ifdef HAVE_AS_KNOWS_SHANI
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+/* Initialise the embedded job manager; the SSE initialiser is reused as-is. */
+void sha1_ctx_mgr_init_sse_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+	// Same as sse
+	sha1_mb_mgr_init_sse(&mgr->mgr);
+}
+
+/*
+ * Submit len bytes at buffer to the hash context ctx (SSE + SHA-NI variant).
+ *
+ * flags may only contain bits of HASH_ENTIRE (FIRST / UPDATE / LAST).
+ *
+ * Returns:
+ *  - ctx itself, with ctx->error set, when the request is rejected (invalid
+ *    flags, context still processing, or context already completed and
+ *    HASH_FIRST not given);
+ *  - otherwise the result of sha1_ctx_mgr_resubmit(): a context that is
+ *    either complete or idle waiting for more input (it may be a different
+ *    context than the one passed in, since the job manager decides which
+ *    job it hands back), or NULL when the job manager returned no job.
+ *
+ * NOTE(review): whole blocks are hashed straight from the caller's buffer
+ * (job.buffer points into it), so the buffer presumably has to stay valid
+ * until the context is handed back -- confirm against sha1_mb.h.
+ */
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_sse_ni(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+					  const void *buffer, uint32_t len,
+					  HASH_CTX_FLAG flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		// User should not pass anything other than FIRST, UPDATE, or LAST
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		// Cannot submit to a currently processing job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// Cannot update a finished job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Init digest
+		hash_init_digest(ctx->job.result_digest);
+
+		// Reset byte counter
+		ctx->total_length = 0;
+
+		// Clear extra blocks
+		ctx->partial_block_buffer_length = 0;
+	}
+	// If we made it here, there were no errors during this call to submit
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Store buffer ptr info from user
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	// Store the user's request flags and mark this ctx as currently being processed.
+	ctx->status = (flags & HASH_LAST) ?
+	    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+	    HASH_CTX_STS_PROCESSING;
+
+	// Advance byte counter
+	ctx->total_length += len;
+
+	// If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+	// Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+	if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+		// Compute how many bytes to copy from user buffer into extra block
+		uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			// Copy and update relevant pointers and counters
+			memcpy_varlen(&ctx->partial_block_buffer
+				      [ctx->partial_block_buffer_length], buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+		// The extra block should never contain more than 1 block here
+		assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+		// If the extra block buffer contains exactly 1 block, it can be hashed.
+		if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;	// job.len counts blocks, not bytes
+
+			// The job manager hands back a job pointer (possibly for another
+			// context, possibly NULL); it is reinterpreted as its owning context.
+			ctx =
+			    (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr, &ctx->job);
+		}
+	}
+
+	return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+/*
+ * Drain the job manager until a context can be handed back to the caller.
+ *
+ * Returns a context that is complete or idle, or NULL once the job manager
+ * reports that no jobs remain in flight.
+ */
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_sse_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+	SHA1_HASH_CTX *ctx;
+
+	while (1) {
+		ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_sse_ni(&mgr->mgr);
+
+		// If flush returned 0, there are no more jobs in flight.
+		if (!ctx)
+			return NULL;
+
+		// If flush returned a job, verify that it is safe to return to the user.
+		// If it is not ready, resubmit the job to finish processing.
+		ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+		// If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+		if (ctx)
+			return ctx;
+
+		// Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+	}
+}
+
+/*
+ * Advance ctx (a context just handed back by the job manager) through its
+ * remaining stages: whole blocks still in the user's buffer, then the padded
+ * final block(s) when HASH_CTX_STS_LAST is set.  Each stage is queued with
+ * sha1_mb_mgr_submit_sse_ni(), whose return value becomes the next context
+ * to examine.
+ *
+ * Returns the first context that is complete or that needs more user input
+ * (status set to COMPLETE or IDLE respectively), or NULL when the job manager
+ * returns no job.
+ */
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+	while (ctx) {
+
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
+			return ctx;
+		}
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+
+			// Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+			uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy_varlen(ctx->partial_block_buffer,
+					      ((const char *)buffer + len), copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			// len should be a multiple of the block size now
+			assert((len % SHA1_BLOCK_SIZE) == 0);
+
+			// Set len to the number of blocks to be hashed in the user's buffer
+			len >>= SHA1_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr,
+										  &ctx->job);
+				continue;
+			}
+		}
+		// If the extra blocks are not empty, then we are either on the last block(s)
+		// or we need more user input before continuing.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+			// COMPLETE is set now so the context is returned the next time
+			// the job manager hands it back.
+			ctx->status =
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx =
+			    (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr, &ctx->job);
+			continue;
+		}
+
+		// Nothing more can be hashed until the user supplies more data.
+		// (ctx is known to be non-NULL here: it was tested by the loop condition.)
+		ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+/* Load the SHA-1 initial chaining values into digest (SHA1_DIGEST_NWORDS words). */
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+	static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+	    { SHA1_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+/*
+ * Append the SHA-1 padding to the partial block held in padblock.
+ *
+ * padblock  - buffer of 2 * SHA1_BLOCK_SIZE bytes whose first
+ *             (total_len % SHA1_BLOCK_SIZE) bytes are the unhashed tail.
+ * total_len - total message length in bytes.
+ *
+ * Writes the 0x80 marker, zero fill and the message length in bits into the
+ * last 8 bytes of the final block, and returns how many blocks (1 or 2) of
+ * padblock must be hashed.
+ */
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+	uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+	// Message length in bits, converted by to_be64() (presumably host to big-endian).
+	const uint64_t be_bit_len = to_be64((uint64_t) total_len << 3);
+
+	memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	// Move i to the end of either 1st or 2nd extra block depending on length
+	i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+	    SHA1_PADLENGTHFIELD_SIZE;
+
+	// Store through memset/memcpy rather than a uint64_t* cast of the byte
+	// buffer: the cast violates the aliasing rules and assumes alignment.
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+	memset(&padblock[i - 16], 0, sizeof(uint64_t));
+#endif
+
+	memcpy(&padblock[i - 8], &be_bit_len, sizeof(be_bit_len));
+
+	return i >> SHA1_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
+}
+
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_sse_ni_slver_070002c1;
+struct slver sha1_ctx_mgr_init_sse_ni_slver = { 0x02c1, 0x00, 0x07 };
+
+struct slver sha1_ctx_mgr_submit_sse_ni_slver_070002c2;
+struct slver sha1_ctx_mgr_submit_sse_ni_slver = { 0x02c2, 0x00, 0x07 };
+
+struct slver sha1_ctx_mgr_flush_sse_ni_slver_070002c3;
+struct slver sha1_ctx_mgr_flush_sse_ni_slver = { 0x02c3, 0x00, 0x07 };
+
+#endif // HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm
new file mode 100644
index 000000000..1c9a66fd4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm
@@ -0,0 +1,67 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;; Job status codes written to the _status field of SHA1_JOB
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN		0
+%define STS_BEING_PROCESSED	1
+%define STS_COMPLETED		2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Threshold constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; If the number of lanes in use is <= the threshold, the manager calls the
+; single-buffer ("sb") routine instead of the multi-buffer kernel.
+%define SHA1_SB_THRESHOLD_SSE		1
+%define SHA1_SB_THRESHOLD_AVX		1
+%define SHA1_SB_THRESHOLD_AVX2		1
+%define SHA1_SB_THRESHOLD_AVX512	1
+%define SHA1_NI_SB_THRESHOLD_SSE	4 ; shani is faster than sse sha1_mb
+%define SHA1_NI_SB_THRESHOLD_AVX512	6
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA1_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS	; SHA1_JOB
+
+;;;	name				size	align
+FIELD	_buffer,			8,	8	; pointer to buffer
+FIELD	_len,				4,	4	; length in blocks (the C callers store a block count in job.len)
+FIELD	_result_digest,			5*4,	64	; Digest (output), 5 x 32-bit words
+FIELD	_status,			4,	4	; one of the STS_* codes above
+FIELD	_user_data,			8,	8
+END_FIELDS
+
+; Total size and alignment of SHA1_JOB as accumulated by the FIELD macros
+%assign _SHA1_JOB_size	_FIELD_OFFSET
+%assign _SHA1_JOB_align	_STRUCT_ALIGN
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c
new file mode 100644 index 000000000..4bf2e09b5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c @@ -0,0 +1,146 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN  		(1024*1024)
+#define TEST_BUFS 		(SHA1_MAX_LANES - 1)
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Fills buf[0..buffer_size) with pseudo-random bytes taken from rand()
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = (unsigned char)rand();
+}
+
+/*
+ * Print the remaining length of every lane in the job manager and return how
+ * many lanes show the same non-zero length as on the previous call.
+ *
+ * The previous lengths are kept in a function-local static array, so the
+ * result depends on the sequence of calls.
+ */
+uint8_t lens_print_and_check(SHA1_HASH_CTX_MGR * mgr)
+{
+	static int32_t last_lens[SHA1_MAX_LANES] = { 0 };
+	int32_t len;
+	uint8_t num_unchanged = 0;
+	int i;
+	for (i = 0; i < SHA1_MAX_LANES; i++) {
+		len = (int32_t) mgr->mgr.lens[i];
+		// lens[i] in mgr consists of block_count<<4 | lane_index;
+		// drop the lane index and convert blocks to bytes (64 per block)
+		len = (len >= 16) ? (len >> 4 << 6) : 0;
+		printf("\t%d", len);
+		if (last_lens[i] > 0 && last_lens[i] == len)
+			num_unchanged += 1;
+		last_lens[i] = len;
+	}
+	printf("\n");
+	return num_unchanged;
+}
+
+/*
+ * Submit TEST_BUFS buffers of increasing length, flush them one at a time
+ * while printing the per-lane lengths, then compare every digest with the
+ * reference implementation.
+ *
+ * Returns the number of mismatching digest words (0 on success), or 1 when
+ * an allocation fails.
+ */
+int main(void)
+{
+	SHA1_HASH_CTX_MGR *mgr = NULL;
+	void *mgr_mem = NULL;
+	SHA1_HASH_CTX ctxpool[TEST_BUFS];
+	uint32_t i, j, fail = 0;
+	unsigned char *bufs[TEST_BUFS] = { NULL };
+	uint32_t lens[TEST_BUFS];
+	uint8_t num_ret, num_unchanged = 0;
+	int ret;
+
+	printf("sha1_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN);
+
+	// posix_memalign() stores through a void **; hand it a real void * object
+	// instead of casting the address of a differently-typed pointer.
+	ret = posix_memalign(&mgr_mem, 16, sizeof(SHA1_HASH_CTX_MGR));
+	mgr = (SHA1_HASH_CTX_MGR *) mgr_mem;
+	if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+
+	sha1_ctx_mgr_init(mgr);
+
+	srand(TEST_SEED);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		lens[i] = TEST_LEN / SHA1_MAX_LANES * (i + 1);
+		bufs[i] = (unsigned char *)malloc(lens[i]);
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			fail = 1;
+			goto cleanup;
+		}
+		rand_buffer(bufs[i], lens[i]);
+	}
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Init ctx contexts
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Run reference test
+		sha1_ref(bufs[i], digest_ref[i], lens[i]);
+
+		// Run sb_sha1 test
+		sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+	}
+
+	printf("Changes of lens inside mgr:\n");
+	lens_print_and_check(mgr);
+	while (sha1_ctx_mgr_flush(mgr)) {
+		num_ret = lens_print_and_check(mgr);
+		num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret;
+	}
+	printf("Info of sha1_mb lens prints over\n");
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				printf("Test%u fixed size, digest%u "
+				       "fail 0x%08X <=> 0x%08X \n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       digest_ref[i][j]);
+			}
+		}
+	}
+
+	if (fail)
+		printf("Test failed function check %u\n", fail);
+	else if (num_unchanged)
+		printf("SHA-NI is used when %d or %d jobs are uncompleted\n",
+		       num_unchanged, num_unchanged + 1);
+	else
+		printf("SHA-NI is not used, or used for last job\n");
+
+      cleanup:
+	// free(NULL) is a no-op, so buffers that were never allocated are fine.
+	for (i = 0; i < TEST_BUFS; i++)
+		free(bufs[i]);
+	// NOTE(review): mgr is intentionally not released here. If posix_memalign
+	// is provided by a platform shim, plain free() may not be the matching
+	// deallocator -- confirm before adding a release call.
+
+	return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..21c81403b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm
@@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA1 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Per-lane bookkeeping: which job (if any) currently occupies the lane
+START_FIELDS	; LANE_DATA
+;;;	name		size	align
+FIELD	_job_in_lane,	8,	8	; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size	_FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Arguments block handed to the hashing kernels, sized for 16 lanes
+START_FIELDS	; SHA1_ARGS_X16
+;;;	name		size	align
+FIELD	_digest,	4*5*16,	16	; transposed digest
+FIELD	_data_ptr,	8*16,	8	; array of pointers to data
+END_FIELDS
+
+; The X4, X8 and X16 names are all assigned from the same 16-lane layout
+; above, so they share one size and one alignment.
+%assign _SHA1_ARGS_X4_size	_FIELD_OFFSET
+%assign _SHA1_ARGS_X4_align	_STRUCT_ALIGN
+%assign _SHA1_ARGS_X8_size	_FIELD_OFFSET
+%assign _SHA1_ARGS_X8_align	_STRUCT_ALIGN
+%assign _SHA1_ARGS_X16_size	_FIELD_OFFSET
+%assign _SHA1_ARGS_X16_align	_STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Job manager state; every per-lane array below has room for 16 lanes
+START_FIELDS	; MB_MGR
+;;;	name		size	align
+FIELD	_args,		_SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align
+FIELD	_lens,		4*16,	8
+FIELD	_unused_lanes,	8,	8
+FIELD	_ldata,		_LANE_DATA_size*16, _LANE_DATA_align
+FIELD   _num_lanes_inuse, 4,	4
+END_FIELDS
+
+%assign _MB_MGR_size	_FIELD_OFFSET
+%assign _MB_MGR_align	_STRUCT_ALIGN
+
+; Offsets of the args sub-fields relative to the start of MB_MGR
+_args_digest	equ	_args + _digest
+_args_data_ptr	equ	_args + _data_ptr
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..c5fd71300
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm
@@ -0,0 +1,247 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_avx
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1    rdi ; rcx
+%define arg2    rsi ; rdx
+
+; idx needs to be in a register not clobbered by sha1_mult
+%define idx     rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+; idx needs to be in a register not clobbered by sha1_mult
+%define idx     rsi
+%endif
+
+; Common definitions
+; NOTE: several names below alias the same register (e.g. unused_lanes,
+; lane_data and tmp2 are all rbx), so only one of each group is live at a time.
+%define state   arg1
+%define job     arg2
+%define len2    arg2
+
+%define unused_lanes    rbx
+%define lane_data       rbx
+%define tmp2            rbx
+
+%define job_rax         rax
+%define tmp1            rax
+%define size_offset     rax
+%define tmp             rax
+%define start_offset    rax
+
+%define tmp3            arg1
+
+%define extra_blocks    arg2
+%define p               arg2
+
+%define tmp4            r8
+%define lens0           r8
+
+%define lens1           r9
+%define lens2           r10
+%define lens3           r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+; (160 bytes of XMM save area + 16 bytes of GPR save area + 8 bytes of padding)
+_XMM_SAVE_SIZE  equ 10*16
+_GPR_SAVE_SIZE  equ 8*2
+_ALIGN_SIZE     equ 8
+
+_XMM_SAVE       equ 0
+_GPR_SAVE       equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE     equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx(SHA1_MB_JOB_MGR *state)
+; arg 1 : state (rdi for elf64, rcx otherwise -- see the arg1 definitions above)
+; Returns in rax the job that was completed, or NULL when no lane is in use.
+mk_global sha1_mb_mgr_flush_avx, function
+sha1_mb_mgr_flush_avx:
+	endbranch
+
+	sub     rsp, STACK_SPACE
+	mov     [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+	; rsi and xmm6-xmm15 are saved only for the win64 output format
+	mov     [rsp + _GPR_SAVE + 8*1], rsi
+	vmovdqa  [rsp + _XMM_SAVE + 16*0], xmm6
+	vmovdqa  [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa  [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa  [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa  [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa  [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa  [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa  [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa  [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa  [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to judge all lanes are empty
+	cmp	dword [state + _num_lanes_inuse], 0
+	jz	return_null
+
+	; find a lane with a non-null job
+	; (idx starts at 0 and is overwritten by each of lanes 1..3 that holds a job)
+	xor     idx, idx
+	cmp     qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [one]
+	cmp     qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [two]
+	cmp     qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne  idx, [three]
+
+	; copy idx to empty lanes
+	; (each empty lane gets lane idx's data pointer and length 0xFFFFFFFF,
+	;  so it can never be selected as the minimum below)
+copy_lane_data:
+	mov     tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+	cmp     qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne     APPEND(skip_,I)
+	mov     [state + _args + _data_ptr + 8*I], tmp
+	mov     dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	; (unsigned compare of the four lens words; afterwards the low 4 bits
+	;  give the lane index and the remaining bits the length)
+	mov     DWORD(lens0), [state + _lens + 0*4]
+	mov     idx, lens0
+	mov     DWORD(lens1), [state + _lens + 1*4]
+	cmp     lens1, idx
+	cmovb   idx, lens1
+	mov     DWORD(lens2), [state + _lens + 2*4]
+	cmp     lens2, idx
+	cmovb   idx, lens2
+	mov     DWORD(lens3), [state + _lens + 3*4]
+	cmp     lens3, idx
+	cmovb   idx, lens3
+	mov     len2, idx
+	and     idx, 0xF
+	and     len2, ~0xF
+	jz      len_is_0	; minimum length is zero: nothing left to hash for lane idx
+
+	; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+	cmp	dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX
+	ja	mb_processing
+
+	; lensN-len2=idx
+	shr     len2, 4
+	mov     [state + _lens + idx*4], DWORD(idx)
+	mov	r10, idx
+	or	r10, 0x1000	; avx has 4 lanes *4, r10b is idx, r10b2 is 16
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call    sha1_opt_x1
+	; state and idx are intact
+	jmp	len_is_0
+
+mb_processing:
+	; subtract the minimum from every lane, then hash len2 >> 4 on all lanes
+	sub     lens0, len2
+	sub     lens1, len2
+	sub     lens2, len2
+	sub     lens3, len2
+	shr     len2, 4
+	mov     [state + _lens + 0*4], DWORD(lens0)
+	mov     [state + _lens + 1*4], DWORD(lens1)
+	mov     [state + _lens + 2*4], DWORD(lens2)
+	mov     [state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call    sha1_mb_x4_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul    lane_data, idx, _LANE_DATA_size
+	lea     lane_data, [state + _ldata + lane_data]
+
+	mov     job_rax, [lane_data + _job_in_lane]
+	mov     qword [lane_data + _job_in_lane], 0
+	mov     dword [job_rax + _status], STS_COMPLETED
+	; return lane idx to the free list: shift the list left by 4 bits and
+	; put idx in the low nibble
+	mov     unused_lanes, [state + _unused_lanes]
+	shl     unused_lanes, 4
+	or      unused_lanes, idx
+	mov     [state + _unused_lanes], unused_lanes
+
+	sub     dword [state + _num_lanes_inuse], 1
+
+	; gather the five digest words of lane idx (consecutive words of one
+	; lane are 16 bytes apart in the transposed digest) into the job
+	vmovd    xmm0, [state + _args_digest + 4*idx + 0*16]
+	vpinsrd  xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	vpinsrd  xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	vpinsrd  xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	mov     DWORD(tmp2),  [state + _args_digest + 4*idx + 4*16]
+
+	vmovdqa  [job_rax + _result_digest + 0*16], xmm0
+	mov     [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa  xmm6, [rsp + _XMM_SAVE + 16*0]
+	vmovdqa  xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa  xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa  xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa  xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa  xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa  xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa  xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa  xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa  xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov     rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+	mov     rbx, [rsp + _GPR_SAVE + 8*0]
+	add     rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	; no lane in use: return NULL
+	xor     job_rax, job_rax
+	jmp     return
+
+section .data align=16
+
+; 64-bit constants used as cmovne memory operands when picking a lane index
+align 16
+one:    dq  1
+two:    dq  2
+three:  dq  3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..a47ae2838
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm
@@ -0,0 +1,273 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights
reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x8_avx2
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha1_mb_x8_avx2 and sha1_opt_x1
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8 (NOTE(review): presumably so the vmovdqa save slots at rsp + _XMM_SAVE are 16-byte aligned - confirm against the ABI entry alignment)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx2(SHA1_MB_JOB_MGR *state)
+; arg 1 : state (rdi on elf64, rcx otherwise); returns in rax the job that was completed, or NULL if no lane is in use
+mk_global sha1_mb_mgr_flush_avx2, function
+sha1_mb_mgr_flush_avx2:
+	endbranch
+	sub rsp, STACK_SPACE ; room for the GPR/XMM save slots laid out above
+	mov [rsp + _GPR_SAVE + 8*0], rbx ; rbx is reused below as lane_data/unused_lanes/tmp2
+	mov [rsp + _GPR_SAVE + 8*3], rbp ; rbp holds idx below
+	mov [rsp + _GPR_SAVE + 8*4], r12 ; r12-r15 are not touched in this routine; NOTE(review): presumably saved on behalf of the called kernels - confirm
+	mov [rsp + _GPR_SAVE + 8*5], r13
+	mov [rsp + _GPR_SAVE + 8*6], r14
+	mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + _GPR_SAVE + 8*1], rsi ; win64 only: rsi, rdi and xmm6-xmm15 are saved here and restored at "return"
+	mov [rsp + _GPR_SAVE + 8*2], rdi
+	vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+	vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; if no lane is in use there is nothing to flush: return NULL
+	cmp dword [state + _num_lanes_inuse], 0
+	jz return_null
+
+	; find a lane with a non-null job (defaults to lane 0; the highest occupied lane wins)
+	xor idx, idx
+	cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [one] ; cmovcc cannot take an immediate, hence the in-memory constants
+	cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [two]
+	cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [three]
+	cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [four]
+	cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [five]
+	cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [six]
+	cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [seven]
+
+	; copy lane idx's data pointer into every empty lane and set its length to 0xFFFFFFFF (the unsigned maximum)
+copy_lane_data:
+	mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args + _data_ptr + 8*I], tmp
+	mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length (unsigned) over the 8 dword lens entries; the low nibble of the minimum is taken as the lane index
+	vmovdqa xmm0, [state + _lens + 0*16]
+	vmovdqa xmm1, [state + _lens + 1*16]
+
+	vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+	vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+	vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+	vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+	vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+	vmovd DWORD(idx), xmm2
+	mov len2, idx
+	and idx, 0xF ; idx = lane index from the low nibble of the minimum
+	shr len2, 4 ; len2 = minimum without the index nibble; ZF is set when it is zero
+	jz len_is_0 ; nothing left to hash for lane idx
+
+	; compare with sha-sb threshold: if num_lanes_inuse <= threshold, use the sb function (sha1_opt_x1)
+	cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX2
+	ja mb_processing
+
+	; lensN-len2=idx: lane idx will have no length left, so only its index is stored back
+	mov [state + _lens + idx*4], DWORD(idx)
+	mov r10, idx
+	or r10, 0x2000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call sha1_opt_x1
+	; state and idx are intact
+	jmp len_is_0
+
+mb_processing:
+
+	vpand xmm2, xmm2, [rel clear_low_nibble] ; keep only the low dword of xmm2 with its index nibble cleared
+	vpshufd xmm2, xmm2, 0 ; broadcast that value to all four dwords
+
+	vpsubd xmm0, xmm0, xmm2 ; take the common minimum off every lane
+	vpsubd xmm1, xmm1, xmm2
+
+	vmovdqa [state + _lens + 0*16], xmm0
+	vmovdqa [state + _lens + 1*16], xmm1
+
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha1_mb_x8_avx2
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data] ; address of lane idx's entry in the _ldata array
+
+	mov job_rax, [lane_data + _job_in_lane] ; rax now holds the return value
+	mov qword [lane_data + _job_in_lane], 0 ; mark the lane as empty
+	mov dword [job_rax + _status], STS_COMPLETED
+	mov unused_lanes, [state + _unused_lanes]
+	shl unused_lanes, 4
+	or unused_lanes, idx ; push idx onto the nibble-packed unused-lane list
+	mov [state + _unused_lanes], unused_lanes
+
+	sub dword [state + _num_lanes_inuse], 1
+
+	vmovd xmm0, [state + _args_digest + 4*idx + 0*32] ; gather digest words 0-3 of lane idx (words are 32 bytes apart)
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+	mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*32] ; fifth digest word
+
+	vmovdqa [job_rax + _result_digest + 0*16], xmm0 ; NOTE(review): aligned store, presumably relies on _result_digest being 16-byte aligned - confirm
+	mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+	vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov rsi, [rsp + _GPR_SAVE + 8*1]
+	mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov rbx, [rsp + _GPR_SAVE + 8*0]
+	mov rbp, [rsp + _GPR_SAVE + 8*3]
+	mov r12, [rsp + _GPR_SAVE + 8*4]
+	mov r13, [rsp + _GPR_SAVE + 8*5]
+	mov r14, [rsp + _GPR_SAVE + 8*6]
+	mov r15, [rsp + _GPR_SAVE + 8*7]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax ; return value NULL
+	jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..5e3db5b9b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm
@@ -0,0 +1,271 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha1_mb_x16_avx512
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions and latter-state(unused,covered,unchanged)
+%define state arg1 ; unchanged
+%define job arg2 ; unused
+%define len2 arg2 ; unused
+
+; idx must be a register not clobbered by sha1_mb_x16_avx512
+%define idx rbp ; unchanged
+
+%define unused_lanes rbx ; covered
+%define lane_data rbx ; covered
+%define tmp2 rbx ; covered
+
+%define num_lanes_inuse r9 ; covered
+
+%define job_rax rax ; covered
+%define tmp rax ; unused
+
+; STACK_SPACE needs to be an odd multiple of 8 (NOTE(review): presumably so the vmovdqa save slots at rsp + _XMM_SAVE are 16-byte aligned - confirm against the ABI entry alignment)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx512(SHA1_MB_JOB_MGR *state)
+; arg 1 : state (rdi on elf64, rcx otherwise); returns in rax the job that was completed, or NULL if no lane is in use
+mk_global sha1_mb_mgr_flush_avx512, function
+sha1_mb_mgr_flush_avx512:
+	endbranch
+	sub rsp, STACK_SPACE ; room for the GPR/XMM save slots laid out above
+	mov [rsp + _GPR_SAVE + 8*0], rbx ; rbx is reused below as lane_data/unused_lanes/tmp2
+	mov [rsp + _GPR_SAVE + 8*3], rbp ; rbp holds idx below
+	mov [rsp + _GPR_SAVE + 8*4], r12 ; r12-r15 are not touched in this routine; NOTE(review): presumably saved on behalf of the called kernels - confirm
+	mov [rsp + _GPR_SAVE + 8*5], r13
+	mov [rsp + _GPR_SAVE + 8*6], r14
+	mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + _GPR_SAVE + 8*1], rsi ; win64 only: rsi, rdi and xmm6-xmm15 are saved here and restored at "return"
+	mov [rsp + _GPR_SAVE + 8*2], rdi
+	vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+	vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	cmp num_lanes_inuse, 0 ; if no lane is in use there is nothing to flush: return NULL
+	jz return_null
+
+	; find a lane with a non-null job (defaults to lane 0; the highest occupied lane wins)
+	xor idx, idx
+%assign I 1
+%rep 15
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [APPEND(lane_,I)] ; cmovcc cannot take an immediate, hence the in-memory lane_N constants
+%assign I (I+1)
+%endrep
+
+	; copy lane idx's data pointer into every empty lane and set its length to 0xFFFFFFFF (the unsigned maximum)
+copy_lane_data:
+	mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args + _data_ptr + 8*I], tmp
+	mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length (unsigned) over the 16 dword lens entries; the low nibble of the minimum is taken as the lane index
+	vmovdqu ymm0, [state + _lens + 0*32]
+	vmovdqu ymm1, [state + _lens + 1*32]
+
+	vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+	vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+	vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+	vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+	vmovd DWORD(idx), xmm2
+	mov len2, idx
+	and idx, 0xF ; idx = lane index from the low nibble of the minimum
+	shr len2, 4 ; len2 = minimum without the index nibble; ZF is set when it is zero
+	jz len_is_0 ; nothing left to hash for lane idx
+
+	; compare with sha-sb threshold: if num_lanes_inuse <= threshold, use the sb function (sha1_opt_x1)
+	cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX512
+	ja mb_processing
+
+	; lensN-len2=idx: lane idx will have no length left, so only its index is stored back
+	mov [state + _lens + idx*4], DWORD(idx)
+	mov r10, idx
+	or r10, 0x4000 ; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call sha1_opt_x1
+	; state and idx are intact
+	jmp len_is_0
+
+mb_processing:
+
+	vpand ymm2, ymm2, [rel clear_low_nibble] ; keep only the low dword of each 128-bit half, index nibble cleared
+	vpshufd ymm2, ymm2, 0 ; broadcast that dword within each 128-bit half
+
+	vpsubd ymm0, ymm0, ymm2 ; take the common minimum off every lane
+	vpsubd ymm1, ymm1, ymm2
+
+	vmovdqu [state + _lens + 0*32], ymm0
+	vmovdqu [state + _lens + 1*32], ymm1
+
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha1_mb_x16_avx512
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data] ; address of lane idx's entry in the _ldata array
+
+	mov job_rax, [lane_data + _job_in_lane] ; rax now holds the return value
+	mov qword [lane_data + _job_in_lane], 0 ; mark the lane as empty
+	mov dword [job_rax + _status], STS_COMPLETED
+	mov unused_lanes, [state + _unused_lanes]
+	shl unused_lanes, 4
+	or unused_lanes, idx ; push idx onto the nibble-packed unused-lane list
+	mov [state + _unused_lanes], unused_lanes
+
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	sub num_lanes_inuse, 1 ; one lane fewer in use
+	mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	vmovd xmm0, [state + _args_digest + 4*idx + 0*64] ; gather digest words 0-3 of lane idx (words are 64 bytes apart)
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+	mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*64] ; fifth digest word
+
+	vmovdqa [job_rax + _result_digest + 0*16], xmm0 ; NOTE(review): aligned store, presumably relies on _result_digest being 16-byte aligned - confirm
+	mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+	vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov rsi, [rsp + _GPR_SAVE + 8*1]
+	mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov rbx, [rsp + _GPR_SAVE + 8*0]
+	mov rbp, [rsp + _GPR_SAVE + 8*3]
+	mov r12, [rsp + _GPR_SAVE + 8*4]
+	mov r13, [rsp + _GPR_SAVE + 8*5]
+	mov r14, [rsp + _GPR_SAVE + 8*6]
+	mov r15, [rsp + _GPR_SAVE + 8*7]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax ; return value NULL
+	jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_mgr_flush_avx512
+no_sha1_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm
new file mode 100644
index 000000000..4170b6c73
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm
@@ -0,0 +1,278 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel
Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+
+extern sha1_mb_x16_avx512
+extern sha1_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions and latter-state(unused,covered,unchanged)
+%define state arg1 ; unchanged
+%define job arg2 ; unused
+%define len2 arg2 ; unused
+
+; idx must be a register not clobbered by sha1_mb_x16_avx512
+%define idx rbp ; unchanged
+
+%define unused_lanes rbx ; covered
+%define lane_data rbx ; covered
+%define tmp2 rbx ; covered
+
+%define num_lanes_inuse r9 ; covered
+
+%define job_rax rax ; covered
+%define tmp rax ; unused
+
+; STACK_SPACE needs to be an odd multiple of 8 (NOTE(review): presumably so the vmovdqa save slots at rsp + _XMM_SAVE are 16-byte aligned - confirm against the ABI entry alignment)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx512_ni(SHA1_MB_JOB_MGR *state)
+; arg 1 : state (rdi on elf64, rcx otherwise); returns in rax the job that was completed, or NULL if no lane is in use
+mk_global sha1_mb_mgr_flush_avx512_ni, function
+sha1_mb_mgr_flush_avx512_ni:
+	endbranch
+	sub rsp, STACK_SPACE ; room for the GPR/XMM save slots laid out above
+	mov [rsp + _GPR_SAVE + 8*0], rbx ; rbx is reused below as lane_data/unused_lanes/tmp2
+	mov [rsp + _GPR_SAVE + 8*3], rbp ; rbp holds idx below
+	mov [rsp + _GPR_SAVE + 8*4], r12 ; r12-r15 are not touched in this routine; NOTE(review): presumably saved on behalf of the called kernels - confirm
+	mov [rsp + _GPR_SAVE + 8*5], r13
+	mov [rsp + _GPR_SAVE + 8*6], r14
+	mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + _GPR_SAVE + 8*1], rsi ; win64 only: rsi, rdi and xmm6-xmm15 are saved here and restored at "return"
+	mov [rsp + _GPR_SAVE + 8*2], rdi
+	vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+	vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	cmp num_lanes_inuse, 0 ; if no lane is in use there is nothing to flush: return NULL
+	jz return_null
+
+	; find a lane with a non-null job (defaults to lane 0; the highest occupied lane wins)
+	xor idx, idx
+%assign I 1
+%rep 15
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [APPEND(lane_,I)] ; cmovcc cannot take an immediate, hence the in-memory lane_N constants
+%assign I (I+1)
+%endrep
+
+	; copy lane idx's data pointer into every empty lane and set its length to 0xFFFFFFFF (the unsigned maximum)
+copy_lane_data:
+	mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args + _data_ptr + 8*I], tmp
+	mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length (unsigned) over the 16 dword lens entries; the low nibble of the minimum is taken as the lane index
+	vmovdqu ymm0, [state + _lens + 0*32]
+	vmovdqu ymm1, [state + _lens + 1*32]
+
+	vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+	vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+	vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+	vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+	vmovd DWORD(idx), xmm2
+	mov len2, idx
+	and idx, 0xF ; idx = lane index from the low nibble of the minimum
+	shr len2, 4 ; len2 = minimum without the index nibble; ZF is set when it is zero
+	jz len_is_0 ; nothing left to hash for lane idx
+
+	; compare with shani-sb threshold: if num_lanes_inuse <= threshold, use the shani function (sha1_ni_x1)
+	cmp dword [state + _num_lanes_inuse], SHA1_NI_SB_THRESHOLD_AVX512
+	ja mb_processing
+
+	; lensN-len2=idx: lane idx will have no length left, so only its index is stored back
+	mov [state + _lens + idx*4], DWORD(idx)
+	mov r10, idx
+	or r10, 0x4000 ; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call sha1_ni_x1
+	; state and idx are intact
+	jmp len_is_0
+
+mb_processing:
+
+	vpand ymm2, ymm2, [rel clear_low_nibble] ; keep only the low dword of each 128-bit half, index nibble cleared
+	vpshufd ymm2, ymm2, 0 ; broadcast that dword within each 128-bit half
+
+	vpsubd ymm0, ymm0, ymm2 ; take the common minimum off every lane
+	vpsubd ymm1, ymm1, ymm2
+
+	vmovdqu [state + _lens + 0*32], ymm0
+	vmovdqu [state + _lens + 1*32], ymm1
+
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha1_mb_x16_avx512
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data] ; address of lane idx's entry in the _ldata array
+
+	mov job_rax, [lane_data + _job_in_lane] ; rax now holds the return value
+	mov qword [lane_data + _job_in_lane], 0 ; mark the lane as empty
+	mov dword [job_rax + _status], STS_COMPLETED
+	mov unused_lanes, [state + _unused_lanes]
+	shl unused_lanes, 4
+	or unused_lanes, idx ; push idx onto the nibble-packed unused-lane list
+	mov [state + _unused_lanes], unused_lanes
+
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	sub num_lanes_inuse, 1 ; one lane fewer in use
+	mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	vmovd xmm0, [state + _args_digest + 4*idx + 0*64] ; gather digest words 0-3 of lane idx (words are 64 bytes apart)
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+	mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*64] ; fifth digest word
+
+	vmovdqa [job_rax + _result_digest + 0*16], xmm0 ; NOTE(review): aligned store, presumably relies on _result_digest being 16-byte aligned - confirm
+	mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+	vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov rsi, [rsp + _GPR_SAVE + 8*1]
+	mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov rbx, [rsp + _GPR_SAVE + 8*0]
+	mov rbp, [rsp + _GPR_SAVE + 8*3]
+	mov r12, [rsp + _GPR_SAVE + 8*4]
+	mov r13, [rsp + _GPR_SAVE + 8*5]
+	mov r14, [rsp + _GPR_SAVE + 8*6]
+	mov r15, [rsp + _GPR_SAVE + 8*7]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax ; return value NULL
+	jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+ %else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_flush_avx512_ni
+ no_sha1_mb_mgr_flush_avx512_ni:
+ %endif
+ %endif ; HAVE_AS_KNOWS_SHANI
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_flush_avx512_ni
+ no_sha1_mb_mgr_flush_avx512_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..2a4c4b50a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm
@@ -0,0 +1,249 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_sse
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rsi
+%endif
+
+; Common definitions (several names alias one register: rbx, rax and arg2 are each reused for different values over time)
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8 (layout: 10 xmm slots at rsp+0, then 2 GPR slots, then 8 bytes of padding)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_sse(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx (win64) / rdi (elf64) : state; returns the completed job in rax, or 0 when num_lanes_inuse is 0
+mk_global sha1_mb_mgr_flush_sse, function
+sha1_mb_mgr_flush_sse:
+	endbranch
+
+	sub rsp, STACK_SPACE
+	mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + _GPR_SAVE + 8*1], rsi ; rsi is idx in the win64 register map above
+	movdqa [rsp + _XMM_SAVE + 16*0], xmm6 ; xmm6-xmm15 are callee-saved in the win64 ABI
+	movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+	movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+	movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+	movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+	movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+	movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+	movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+	movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+	movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to judge all lanes are empty
+	cmp dword [state + _num_lanes_inuse], 0
+	jz return_null
+
+	; find a lane with a non-null job (lanes 1..3 are probed in order, so the highest occupied one wins; otherwise idx stays 0)
+	xor idx, idx
+	cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [one]
+	cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [two]
+	cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [three]
+
+	; copy the data pointer of lane idx into every lane that has no job
+copy_lane_data:
+	mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args + _data_ptr + 8*I], tmp
+	mov dword [state + _lens + 4*I], 0xFFFFFFFF ; all-ones length so this empty lane does not win the min search below
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length (unsigned compare over the four 32-bit lens entries)
+	mov DWORD(lens0), [state + _lens + 0*4]
+	mov idx, lens0
+	mov DWORD(lens1), [state + _lens + 1*4]
+	cmp lens1, idx
+	cmovb idx, lens1
+	mov DWORD(lens2), [state + _lens + 2*4]
+	cmp lens2, idx
+	cmovb idx, lens2
+	mov DWORD(lens3), [state + _lens + 3*4]
+	cmp lens3, idx
+	cmovb idx, lens3
+	mov len2, idx
+	and idx, 0xF ; low nibble of the minimum entry = lane index
+	and len2, ~0xF ; remaining bits = length, still shifted left by 4
+	jz len_is_0 ; minimum length is 0: nothing left to hash for lane idx
+
+	; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+	cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_SSE
+	ja mb_processing
+
+	; lensN-len2=idx
+	shr len2, 4
+	mov [state + _lens + idx*4], DWORD(idx) ; only the lane-index nibble remains, i.e. length 0 for lane idx
+	mov r10, idx
+	or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call sha1_opt_x1
+	; state and idx are intact
+	jmp len_is_0
+
+mb_processing:
+
+	sub lens0, len2
+	sub lens1, len2
+	sub lens2, len2
+	sub lens3, len2
+	shr len2, 4
+	mov [state + _lens + 0*4], DWORD(lens0)
+	mov [state + _lens + 1*4], DWORD(lens1)
+	mov [state + _lens + 2*4], DWORD(lens2)
+	mov [state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha1_mb_x4_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx" (rbx is used as lane_data, then unused_lanes, then tmp2)
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]
+	mov qword [lane_data + _job_in_lane], 0
+	mov dword [job_rax + _status], STS_COMPLETED
+	mov unused_lanes, [state + _unused_lanes]
+	shl unused_lanes, 4
+	or unused_lanes, idx ; push idx back as the lowest nibble of unused_lanes
+	mov [state + _unused_lanes], unused_lanes
+
+	sub dword [state + _num_lanes_inuse], 1
+
+	movd xmm0, [state + _args_digest + 4*idx + 0*16] ; digest word k of lane idx is read from _args_digest + 16*k + 4*idx
+	pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+	movdqa [job_rax + _result_digest + 0*16], xmm0
+	mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+	movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+	movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+	movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+	movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+	movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+	movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+	movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+	movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+	movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+	mov rbx, [rsp + _GPR_SAVE + 8*0]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+
+section .data align=16
+
+align 16
+one: dq 1 ; 64-bit constants loaded by the cmovne instructions above (cmov has no immediate form)
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm
new file mode 100644
index 000000000..ea3cffd33
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm
@@ -0,0 +1,256 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha1_mb_x4_sse
+extern sha1_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rsi
+%endif
+
+; Common definitions (several names alias one register: rbx, rax and arg2 are each reused for different values over time)
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8 (layout: 10 xmm slots at rsp+0, then 2 GPR slots, then 8 bytes of padding)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_sse_ni(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx (win64) / rdi (elf64) : state; returns the completed job in rax, or 0 when num_lanes_inuse is 0
+mk_global sha1_mb_mgr_flush_sse_ni, function
+sha1_mb_mgr_flush_sse_ni:
+	endbranch
+
+	sub rsp, STACK_SPACE
+	mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + _GPR_SAVE + 8*1], rsi ; rsi is idx in the win64 register map above
+	movdqa [rsp + _XMM_SAVE + 16*0], xmm6 ; xmm6-xmm15 are callee-saved in the win64 ABI
+	movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+	movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+	movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+	movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+	movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+	movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+	movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+	movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+	movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to judge all lanes are empty
+	cmp dword [state + _num_lanes_inuse], 0
+	jz return_null
+
+	; find a lane with a non-null job (lanes 1..3 are probed in order, so the highest occupied one wins; otherwise idx stays 0)
+	xor idx, idx
+	cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [one]
+	cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [two]
+	cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [three]
+
+	; copy the data pointer of lane idx into every lane that has no job
+copy_lane_data:
+	mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args + _data_ptr + 8*I], tmp
+	mov dword [state + _lens + 4*I], 0xFFFFFFFF ; all-ones length so this empty lane does not win the min search below
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length (unsigned compare over the four 32-bit lens entries)
+	mov DWORD(lens0), [state + _lens + 0*4]
+	mov idx, lens0
+	mov DWORD(lens1), [state + _lens + 1*4]
+	cmp lens1, idx
+	cmovb idx, lens1
+	mov DWORD(lens2), [state + _lens + 2*4]
+	cmp lens2, idx
+	cmovb idx, lens2
+	mov DWORD(lens3), [state + _lens + 3*4]
+	cmp lens3, idx
+	cmovb idx, lens3
+	mov len2, idx
+	and idx, 0xF ; low nibble of the minimum entry = lane index
+	and len2, ~0xF ; remaining bits = length, still shifted left by 4
+	jz len_is_0 ; minimum length is 0: nothing left to hash for lane idx
+
+	; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+	cmp dword [state + _num_lanes_inuse], SHA1_NI_SB_THRESHOLD_SSE
+	ja mb_processing
+
+	; lensN-len2=idx
+	shr len2, 4
+	mov [state + _lens + idx*4], DWORD(idx) ; only the lane-index nibble remains, i.e. length 0 for lane idx
+	mov r10, idx
+	or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call sha1_ni_x1
+	; state and idx are intact
+	jmp len_is_0
+
+mb_processing:
+
+	sub lens0, len2
+	sub lens1, len2
+	sub lens2, len2
+	sub lens3, len2
+	shr len2, 4
+	mov [state + _lens + 0*4], DWORD(lens0)
+	mov [state + _lens + 1*4], DWORD(lens1)
+	mov [state + _lens + 2*4], DWORD(lens2)
+	mov [state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha1_mb_x4_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx" (rbx is used as lane_data, then unused_lanes, then tmp2)
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]
+	mov qword [lane_data + _job_in_lane], 0
+	mov dword [job_rax + _status], STS_COMPLETED
+	mov unused_lanes, [state + _unused_lanes]
+	shl unused_lanes, 4
+	or unused_lanes, idx ; push idx back as the lowest nibble of unused_lanes
+	mov [state + _unused_lanes], unused_lanes
+
+	sub dword [state + _num_lanes_inuse], 1
+
+	movd xmm0, [state + _args_digest + 4*idx + 0*16] ; digest word k of lane idx is read from _args_digest + 16*k + 4*idx
+	pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+	movdqa [job_rax + _result_digest + 0*16], xmm0
+	mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+	movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+	movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+	movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+	movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+	movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+	movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+	movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+	movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+	movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+	mov rbx, [rsp + _GPR_SAVE + 8*0]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+
+section .data align=16
+
+align 16
+one: dq 1 ; 64-bit constants loaded by the cmovne instructions above (cmov has no immediate form)
+two: dq 2
+three: dq 3
+
+%else ; HAVE_AS_KNOWS_SHANI not defined: the function above is not assembled; on win64 only a placeholder global label is emitted
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_flush_sse_ni
+ no_sha1_mb_mgr_flush_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..b6124486a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha1_mb.h" + +void sha1_mb_mgr_init_avx2(SHA1_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes = 0xF76543210; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA1_X8_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c new file mode 100644 index 000000000..033fb3c9f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha1_mb.h" + +void sha1_mb_mgr_init_avx512(SHA1_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes = 0xfedcba9876543210; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA1_MAX_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c new file mode 100644 index 000000000..811c4a9dd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha1_mb.h" + +void sha1_mb_mgr_init_sse(SHA1_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes = 0xF3210; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA1_MIN_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm new file mode 100644 index 000000000..49c018138 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm @@ -0,0 +1,246 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_avx
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be in a register not clobbered by sha1_mult
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%else
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+; idx needs to be in a register not clobbered by sha1_mult
+%define last_len rdx
+%define idx rdx
+
+%define size_offset rcx
+%define tmp2 rcx
+
+%endif
+
+; Common definitions (several names alias one register, e.g. lens0..lens3 overlay extra_blocks, tmp, lane_data and lane)
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8 (layout: 4 GPR slots for rbx, rsi, rdi, rbp at rsp+0, 10 xmm slots at rsp+8*4, 8 bytes of padding)
+%define STACK_SPACE 8*4 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_avx(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx (win64) / rdi (elf64) : state
+; arg 2 : rdx (win64) / rsi (elf64) : job; returns a completed job in rax, or 0 when the job was only queued
+mk_global sha1_mb_mgr_submit_avx, function
+sha1_mb_mgr_submit_avx:
+	endbranch
+
+	sub rsp, STACK_SPACE
+	mov [rsp + 8*0], rbx
+	mov [rsp + 8*3], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + 8*1], rsi
+	mov [rsp + 8*2], rdi
+	vmovdqa [rsp + 8*4 + 16*0], xmm6 ; xmm6-xmm15 are callee-saved in the win64 ABI
+	vmovdqa [rsp + 8*4 + 16*1], xmm7
+	vmovdqa [rsp + 8*4 + 16*2], xmm8
+	vmovdqa [rsp + 8*4 + 16*3], xmm9
+	vmovdqa [rsp + 8*4 + 16*4], xmm10
+	vmovdqa [rsp + 8*4 + 16*5], xmm11
+	vmovdqa [rsp + 8*4 + 16*6], xmm12
+	vmovdqa [rsp + 8*4 + 16*7], xmm13
+	vmovdqa [rsp + 8*4 + 16*8], xmm14
+	vmovdqa [rsp + 8*4 + 16*9], xmm15
+%endif
+
+	mov unused_lanes, [state + _unused_lanes]
+	movzx lane, BYTE(unused_lanes)
+	and lane, 0xF ; lane = low nibble of unused_lanes ...
+	shr unused_lanes, 4 ; ... which is then popped off the list
+	imul lane_data, lane, _LANE_DATA_size
+	mov dword [job + _status], STS_BEING_PROCESSED
+	lea lane_data, [state + _ldata + lane_data]
+	mov [state + _unused_lanes], unused_lanes
+	mov DWORD(len), [job + _len]
+
+	shl len, 4
+	or len, lane ; lens entry = (len << 4) | lane
+
+	mov [lane_data + _job_in_lane], job
+	mov [state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest (word k of this lane is stored at _args_digest + 16*k + 4*lane)
+	vmovdqu xmm0, [job + _result_digest + 0*16]
+	mov DWORD(tmp), [job + _result_digest + 1*16]
+	vmovd [state + _args_digest + 4*lane + 0*16], xmm0
+	vpextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+	vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+	vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+	mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+	mov p, [job + _buffer]
+	mov [state + _args_data_ptr + 8*lane], p
+
+	add dword [state + _num_lanes_inuse], 1
+	cmp unused_lanes, 0xF
+	jne return_null ; unless only 0xF is left in unused_lanes, return NULL without hashing
+
+start_loop:
+	; Find min length (unsigned compare over the four 32-bit lens entries)
+	mov DWORD(lens0), [state + _lens + 0*4]
+	mov idx, lens0
+	mov DWORD(lens1), [state + _lens + 1*4]
+	cmp lens1, idx
+	cmovb idx, lens1
+	mov DWORD(lens2), [state + _lens + 2*4]
+	cmp lens2, idx
+	cmovb idx, lens2
+	mov DWORD(lens3), [state + _lens + 3*4]
+	cmp lens3, idx
+	cmovb idx, lens3
+	mov len2, idx
+	and idx, 0xF ; low nibble of the minimum entry = lane index
+	and len2, ~0xF ; remaining bits = length, still shifted left by 4
+	jz len_is_0 ; minimum length is 0: nothing to hash for lane idx
+
+	sub lens0, len2
+	sub lens1, len2
+	sub lens2, len2
+	sub lens3, len2
+	shr len2, 4
+	mov [state + _lens + 0*4], DWORD(lens0)
+	mov [state + _lens + 1*4], DWORD(lens1)
+	mov [state + _lens + 2*4], DWORD(lens2)
+	mov [state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha1_mb_x4_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]
+	mov unused_lanes, [state + _unused_lanes]
+	mov qword [lane_data + _job_in_lane], 0
+	mov dword [job_rax + _status], STS_COMPLETED
+	shl unused_lanes, 4
+	or unused_lanes, idx ; push idx back as the lowest nibble of unused_lanes
+	mov [state + _unused_lanes], unused_lanes
+
+	sub dword [state + _num_lanes_inuse], 1
+
+	vmovd xmm0, [state + _args_digest + 4*idx + 0*16] ; digest word k of lane idx is read from _args_digest + 16*k + 4*idx
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+	vmovdqa [job_rax + _result_digest + 0*16], xmm0
+	mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa xmm6, [rsp + 8*4 + 16*0]
+	vmovdqa xmm7, [rsp + 8*4 + 16*1]
+	vmovdqa xmm8, [rsp + 8*4 + 16*2]
+	vmovdqa xmm9, [rsp + 8*4 + 16*3]
+	vmovdqa xmm10, [rsp + 8*4 + 16*4]
+	vmovdqa xmm11, [rsp + 8*4 + 16*5]
+	vmovdqa xmm12, [rsp + 8*4 + 16*6]
+	vmovdqa xmm13, [rsp + 8*4 + 16*7]
+	vmovdqa xmm14, [rsp + 8*4 + 16*8]
+	vmovdqa xmm15, [rsp + 8*4 + 16*9]
+	mov rsi, [rsp + 8*1]
+	mov rdi, [rsp + 8*2]
+%endif
+	mov rbx, [rsp + 8*0]
+	mov rbp, [rsp + 8*3]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x67452301 ; H0..H4 hold the SHA-1 initial hash values; no instruction in this file references them
+H1: dd 0xefcdab89
+H2: dd 0x98badcfe
+H3: dd 0x10325476
+H4: dd 0xc3d2e1f0
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..95b4f1715
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm
@@ -0,0 +1,250 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "memcpy.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%define extra_blocks rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%define extra_blocks rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+; idx must be a register not clobbered by sha1_x8_avx2
+%define idx r8
+%define last_len r8
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+; STACK_SPACE needs to be an odd multiple of 8 (layout: 8 GPR slots for rbx, rsi, rdi, rbp, r12-r15 at rsp+0, 10 xmm slots at rsp+8*8, 8 bytes of padding)
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_avx2(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx (win64) / rdi (elf64) : state
+; arg 2 : rdx (win64) / rsi (elf64) : job; returns a completed job in rax, or 0 when the job was only queued
+mk_global sha1_mb_mgr_submit_avx2, function
+sha1_mb_mgr_submit_avx2:
+	endbranch
+
+	sub rsp, STACK_SPACE
+	mov [rsp + 8*0], rbx
+	mov [rsp + 8*3], rbp
+	mov [rsp + 8*4], r12 ; r12-r15 are not used in this file; presumably sha1_mb_x8_avx2 clobbers them -- TODO confirm
+	mov [rsp + 8*5], r13
+	mov [rsp + 8*6], r14
+	mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + 8*1], rsi
+	mov [rsp + 8*2], rdi
+	vmovdqa [rsp + 8*8 + 16*0], xmm6 ; xmm6-xmm15 are callee-saved in the win64 ABI
+	vmovdqa [rsp + 8*8 + 16*1], xmm7
+	vmovdqa [rsp + 8*8 + 16*2], xmm8
+	vmovdqa [rsp + 8*8 + 16*3], xmm9
+	vmovdqa [rsp + 8*8 + 16*4], xmm10
+	vmovdqa [rsp + 8*8 + 16*5], xmm11
+	vmovdqa [rsp + 8*8 + 16*6], xmm12
+	vmovdqa [rsp + 8*8 + 16*7], xmm13
+	vmovdqa [rsp + 8*8 + 16*8], xmm14
+	vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+	mov unused_lanes, [state + _unused_lanes]
+	mov lane, unused_lanes
+	and lane, 0xF ; lane = low nibble of unused_lanes ...
+	shr unused_lanes, 4 ; ... which is then popped off the list
+	imul lane_data, lane, _LANE_DATA_size
+	mov dword [job + _status], STS_BEING_PROCESSED
+	lea lane_data, [state + _ldata + lane_data]
+	mov [state + _unused_lanes], unused_lanes
+	mov DWORD(len), [job + _len]
+
+	mov [lane_data + _job_in_lane], job
+
+	shl len,4
+	or len, lane ; lens entry = (len << 4) | lane
+	mov [state + _lens + 4*lane], DWORD(len)
+	; Load digest words from result_digest (word k of this lane is stored at _args_digest + 32*k + 4*lane)
+	vmovdqu xmm0, [job + _result_digest + 0*16]
+	mov DWORD(tmp), [job + _result_digest + 1*16]
+
+	vmovd [state + _args_digest + 4*lane + 0*32], xmm0
+	vpextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1
+	vpextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2
+	vpextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3
+	mov [state + _args_digest + 4*lane + 4*32], DWORD(tmp)
+
+	mov p, [job + _buffer]
+	mov [state + _args_data_ptr + 8*lane], p
+
+	add dword [state + _num_lanes_inuse], 1
+	cmp unused_lanes, 0xf
+	jne return_null ; unless only 0xF is left in unused_lanes, return NULL without hashing
+
+start_loop:
+	; Find min length (unsigned dword minimum over the eight lens entries, reduced into the low dword of xmm2)
+	vmovdqa xmm0, [state + _lens + 0*16]
+	vmovdqa xmm1, [state + _lens + 1*16]
+
+	vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+	vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+	vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+	vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+	vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+	vmovd DWORD(idx), xmm2
+	mov len2, idx
+	and idx, 0xF ; low nibble of the minimum entry = lane index
+	shr len2, 4 ; len2 = minimum length
+	jz len_is_0 ; minimum length is 0: nothing to hash for lane idx
+
+	vpand xmm2, xmm2, [rel clear_low_nibble] ; drop the lane-index nibble and everything above dword 0
+	vpshufd xmm2, xmm2, 0 ; broadcast (min length << 4) to all four dwords
+
+	vpsubd xmm0, xmm0, xmm2
+	vpsubd xmm1, xmm1, xmm2
+
+	vmovdqa [state + _lens + 0*16], xmm0
+	vmovdqa [state + _lens + 1*16], xmm1
+
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha1_mb_x8_avx2
+
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]
+	mov unused_lanes, [state + _unused_lanes]
+	mov qword [lane_data + _job_in_lane], 0
+	mov dword [job_rax + _status], STS_COMPLETED
+	shl unused_lanes, 4
+	or unused_lanes, idx ; push idx back as the lowest nibble of unused_lanes
+	mov [state + _unused_lanes], unused_lanes
+
+	sub dword [state + _num_lanes_inuse], 1
+
+	vmovd xmm0, [state + _args_digest + 4*idx + 0*32] ; digest word k of lane idx is read from _args_digest + 32*k + 4*idx
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+	mov DWORD(tmp), [state + _args_digest + 4*idx + 4*32]
+
+	vmovdqa [job_rax + _result_digest + 0*16], xmm0
+	mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa xmm6, [rsp + 8*8 + 16*0]
+	vmovdqa xmm7, [rsp + 8*8 + 16*1]
+	vmovdqa xmm8, [rsp + 8*8 + 16*2]
+	vmovdqa xmm9, [rsp + 8*8 + 16*3]
+	vmovdqa xmm10, [rsp + 8*8 + 16*4]
+	vmovdqa xmm11, [rsp + 8*8 + 16*5]
+	vmovdqa xmm12, [rsp + 8*8 + 16*6]
+	vmovdqa xmm13, [rsp + 8*8 + 16*7]
+	vmovdqa xmm14, [rsp + 8*8 + 16*8]
+	vmovdqa xmm15, [rsp + 8*8 + 16*9]
+	mov rsi, [rsp + 8*1]
+	mov rdi, [rsp + 8*2]
+%endif
+	mov rbx, [rsp + 8*0]
+	mov rbp, [rsp + 8*3]
+	mov r12, [rsp + 8*4]
+	mov r13, [rsp + 8*5]
+	mov r14, [rsp + 8*6]
+	mov r15, [rsp + 8*7]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000 ; dword 0 = 0xFFFFFFF0, dwords 1..3 = 0
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..a4f9389a1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm
@@ -0,0 +1,248 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016
Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "memcpy.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha1_mb_x16_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1    rdi ; rcx
+%define arg2    rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1    rcx
+%define arg2    rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions and latter-state(unused,covered,unchanged)
+; Several names below alias the same register (job/len2, job_rax/len,
+; tmp/num_lanes_inuse); each alias is only live in its own phase.
+%define state   arg1 ; unchanged, mb_x16's input1
+%define job     arg2 ; arg2 unused
+%define len2    arg2 ; arg2 unused, mb_x16's input2
+
+; idx must be a register not clobbered by sha1_mb_x16_avx512
+%define idx     r8 ; unchanged
+
+%define p       r11 ; unused
+
+%define unused_lanes rbx ; covered
+
+%define job_rax rax ; covered
+%define len     rax ; unused
+
+%define lane    rbp ; unused
+
+%define tmp     r9 ; covered
+%define num_lanes_inuse r9 ; covered
+
+%define lane_data r10 ; covered
+
+; STACK_SPACE needs to be an odd multiple of 8
+; Layout: 8 GPR slots (8*8), then 10 XMM slots (16*10), then 8 bytes of pad.
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* sha1_mb_mgr_submit_avx512(MB_MGR *state, JOB_SHA1 *job)
+; arg 1 : state (rcx on win64, rdi on elf64 - see arg1 above)
+; arg 2 : job   (rdx on win64, rsi on elf64 - see arg2 above)
+;
+; Places "job" in a free lane. Returns NULL (rax = 0) until all 16 lanes
+; are occupied; once they are, runs sha1_mb_x16_avx512 for the shortest
+; remaining length and returns the job that finished.
+mk_global sha1_mb_mgr_submit_avx512, function
+sha1_mb_mgr_submit_avx512:
+        endbranch
+
+        ; Prologue: spill registers into the frame. rsi/rdi and xmm6-xmm15
+        ; are only spilled for the win64 output format.
+        sub     rsp, STACK_SPACE
+        mov     [rsp + 8*0], rbx
+        mov     [rsp + 8*3], rbp
+        mov     [rsp + 8*4], r12
+        mov     [rsp + 8*5], r13
+        mov     [rsp + 8*6], r14
+        mov     [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+        mov     [rsp + 8*1], rsi
+        mov     [rsp + 8*2], rdi
+        vmovdqa [rsp + 8*8 + 16*0], xmm6
+        vmovdqa [rsp + 8*8 + 16*1], xmm7
+        vmovdqa [rsp + 8*8 + 16*2], xmm8
+        vmovdqa [rsp + 8*8 + 16*3], xmm9
+        vmovdqa [rsp + 8*8 + 16*4], xmm10
+        vmovdqa [rsp + 8*8 + 16*5], xmm11
+        vmovdqa [rsp + 8*8 + 16*6], xmm12
+        vmovdqa [rsp + 8*8 + 16*7], xmm13
+        vmovdqa [rsp + 8*8 + 16*8], xmm14
+        vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+        ; unused_lanes is a stack of 4-bit lane numbers: pop the low nibble
+        ; as the lane for this job and write the shortened list back.
+        mov     unused_lanes, [state + _unused_lanes]
+        mov     lane, unused_lanes
+        and     lane, 0xF
+        shr     unused_lanes, 4
+        imul    lane_data, lane, _LANE_DATA_size
+        mov     dword [job + _status], STS_BEING_PROCESSED
+        lea     lane_data, [state + _ldata + lane_data]
+        mov     [state + _unused_lanes], unused_lanes
+        mov     DWORD(len), [job + _len]
+
+        mov     [lane_data + _job_in_lane], job
+
+        ; Store (len << 4) | lane so that the minimum search below yields
+        ; both the shortest length and the lane that owns it.
+        shl     len,4
+        or      len, lane
+        mov     [state + _lens + 4*lane], DWORD(len)
+        ; Load digest words from result_digest
+        vmovdqu xmm0, [job + _result_digest + 0*16]
+        mov     DWORD(tmp), [job + _result_digest + 1*16]
+
+        ; Scatter the five digest words into the lane-interleaved array:
+        ; word k of this lane lives at _args_digest + 4*lane + k*64.
+        vmovd   [state + _args_digest + 4*lane + 0*64], xmm0
+        vpextrd [state + _args_digest + 4*lane + 1*64], xmm0, 1
+        vpextrd [state + _args_digest + 4*lane + 2*64], xmm0, 2
+        vpextrd [state + _args_digest + 4*lane + 3*64], xmm0, 3
+        mov     [state + _args_digest + 4*lane + 4*64], DWORD(tmp)
+
+        mov     p, [job + _buffer]
+        mov     [state + _args_data_ptr + 8*lane], p
+
+        ; Only start hashing once every one of the 16 lanes holds a job.
+        mov     DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+        add     num_lanes_inuse, 1
+        mov     [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+        cmp     num_lanes_inuse, 16
+        jne     return_null
+
+start_loop:
+        ; Find min length, ymm0 holds ahead 8, ymm1 holds rear 8
+        vmovdqu ymm0, [state + _lens + 0*32]
+        vmovdqu ymm1, [state + _lens + 1*32]
+
+        vpminud ymm2, ymm0, ymm1        ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+        vpalignr ymm3, ymm3, ymm2, 8    ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+        vpminud ymm2, ymm2, ymm3        ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+        vpalignr ymm3, ymm3, ymm2, 4    ; ymm3 has {x,x, x,H2,x,x, x,D2}
+        vpminud ymm2, ymm2, ymm3        ; ymm2 has {x,x, x,G3,x,x, x,C3}
+        vperm2i128 ymm3, ymm2, ymm2, 1  ; ymm3 has {x,x, x, x,x,x, x,C3}
+        vpminud ymm2, ymm2, ymm3        ; ymm2 has min value in low dword
+
+        vmovd   DWORD(idx), xmm2
+        mov     len2, idx
+        and     idx, 0xF        ; idx represent min length index
+        shr     len2, 4         ; size in blocks
+        jz      len_is_0
+
+        ; Subtract the minimum length (lane nibble masked off, then
+        ; broadcast to every dword) from all 16 packed lengths.
+        vpand   ymm2, ymm2, [rel clear_low_nibble]
+        vpshufd ymm2, ymm2, 0
+
+        vpsubd  ymm0, ymm0, ymm2
+        vpsubd  ymm1, ymm1, ymm2
+
+        vmovdqu [state + _lens + 0*32], ymm0
+        vmovdqu [state + _lens + 1*32], ymm1
+
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call    sha1_mb_x16_avx512
+
+        ; state and idx are intact
+
+len_is_0:
+        ; process completed job "idx"
+        imul    lane_data, idx, _LANE_DATA_size
+        lea     lane_data, [state + _ldata + lane_data]
+
+        ; Detach the job from its lane, mark it completed and push the
+        ; lane number back onto the unused_lanes nibble stack.
+        mov     job_rax, [lane_data + _job_in_lane]
+        mov     unused_lanes, [state + _unused_lanes]
+        mov     qword [lane_data + _job_in_lane], 0
+        mov     dword [job_rax + _status], STS_COMPLETED
+        shl     unused_lanes, 4
+        or      unused_lanes, idx
+        mov     [state + _unused_lanes], unused_lanes
+
+        mov     DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+        sub     num_lanes_inuse, 1
+        mov     [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+        ; Gather the five digest words of lane idx back into the job.
+        vmovd   xmm0, [state + _args_digest + 4*idx + 0*64]
+        vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+        vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+        vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+        mov     DWORD(tmp), [state + _args_digest + 4*idx + 4*64]
+
+        ; NOTE(review): aligned store, whereas the load in the submit path
+        ; above uses vmovdqu - assumes result_digest is 16-byte aligned; confirm.
+        vmovdqa [job_rax + _result_digest + 0*16], xmm0
+        mov     [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+        ; Epilogue: reload everything the prologue spilled. rax holds the
+        ; completed job, or 0 when arriving from return_null.
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqa xmm6, [rsp + 8*8 + 16*0]
+        vmovdqa xmm7, [rsp + 8*8 + 16*1]
+        vmovdqa xmm8, [rsp + 8*8 + 16*2]
+        vmovdqa xmm9, [rsp + 8*8 + 16*3]
+        vmovdqa xmm10, [rsp + 8*8 + 16*4]
+        vmovdqa xmm11, [rsp + 8*8 + 16*5]
+        vmovdqa xmm12, [rsp + 8*8 + 16*6]
+        vmovdqa xmm13, [rsp + 8*8 + 16*7]
+        vmovdqa xmm14, [rsp + 8*8 + 16*8]
+        vmovdqa xmm15, [rsp + 8*8 + 16*9]
+        mov     rsi, [rsp + 8*1]
+        mov     rdi, [rsp + 8*2]
+%endif
+        mov     rbx, [rsp + 8*0]
+        mov     rbp, [rsp + 8*3]
+        mov     r12, [rsp + 8*4]
+        mov     r13, [rsp + 8*5]
+        mov     r14, [rsp + 8*6]
+        mov     r15, [rsp + 8*7]
+        add     rsp, STACK_SPACE
+
+        ret
+
+return_null:
+        ; Not all lanes are occupied yet: nothing completed.
+        xor     job_rax, job_rax
+        jmp     return
+
+
+section .data align=32
+
+align 32
+; Mask applied to the minimum before it is broadcast: clears the 4-bit
+; lane index in the low dword and zeroes every other dword.
+clear_low_nibble:
+        dq 0x00000000FFFFFFF0, 0x0000000000000000
+        dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_mgr_submit_avx512
+no_sha1_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..9989a9a1d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm
@@ -0,0 +1,246 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+;   * Redistributions of source code must retain the above copyright
+;     notice, this list of conditions and the following disclaimer.
+;   * Redistributions in binary form must reproduce the above copyright
+;     notice, this list of conditions and the following disclaimer in
+;     the documentation and/or other materials provided with the
+;     distribution.
+;   * Neither the name of Intel Corporation nor the names of its
+;     contributors may be used to endorse or promote products derived
+;     from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_sse
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len        rsi
+%define idx             rsi
+
+%define size_offset     rdi
+%define tmp2            rdi
+
+%else
+; LINUX register definitions
+%define arg1    rdi
+%define arg2    rsi
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len        rdx
+%define idx             rdx
+
+%define size_offset     rcx
+%define tmp2            rcx
+
+%endif
+
+; Common definitions
+; Several names alias one register (lane/tmp3/lens3, tmp/lens1,
+; lane_data/lens2, ...); each alias is only live in its own phase.
+%define state   arg1
+%define job     arg2
+%define len2    arg2
+%define p2      arg2
+
+%define p               r11
+%define start_offset    r11
+
+%define unused_lanes    rbx
+
+%define job_rax         rax
+%define len             rax
+
+%define lane            rbp
+%define tmp3            rbp
+%define lens3           rbp
+
+%define extra_blocks    r8
+%define lens0           r8
+
+%define tmp             r9
+%define lens1           r9
+
+%define lane_data       r10
+%define lens2           r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+; Layout: 4 GPR slots (8*4), then 10 XMM slots (16*10), then 8 bytes of pad.
+%define STACK_SPACE     8*4 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_sse(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : state (rcx on win64, rdi otherwise - see arg1 above)
+; arg 2 : job   (rdx on win64, rsi otherwise - see arg2 above)
+;
+; Places "job" in a free lane. Returns NULL (rax = 0) while free lanes
+; remain; once the lane list is exhausted, runs sha1_mb_x4_sse for the
+; shortest remaining length and returns the job that finished.
+mk_global sha1_mb_mgr_submit_sse, function
+sha1_mb_mgr_submit_sse:
+        endbranch
+
+        ; Prologue: spill rbx/rbp always; rsi/rdi and xmm6-xmm15 only for
+        ; the win64 output format.
+        sub     rsp, STACK_SPACE
+        mov     [rsp + 8*0], rbx
+        mov     [rsp + 8*3], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+        mov     [rsp + 8*1], rsi
+        mov     [rsp + 8*2], rdi
+        movdqa  [rsp + 8*4 + 16*0], xmm6
+        movdqa  [rsp + 8*4 + 16*1], xmm7
+        movdqa  [rsp + 8*4 + 16*2], xmm8
+        movdqa  [rsp + 8*4 + 16*3], xmm9
+        movdqa  [rsp + 8*4 + 16*4], xmm10
+        movdqa  [rsp + 8*4 + 16*5], xmm11
+        movdqa  [rsp + 8*4 + 16*6], xmm12
+        movdqa  [rsp + 8*4 + 16*7], xmm13
+        movdqa  [rsp + 8*4 + 16*8], xmm14
+        movdqa  [rsp + 8*4 + 16*9], xmm15
+%endif
+
+        ; unused_lanes is a stack of 4-bit lane numbers: pop the low nibble
+        ; as the lane for this job and write the shortened list back.
+        mov     unused_lanes, [state + _unused_lanes]
+        movzx   lane, BYTE(unused_lanes)
+        and     lane, 0xF
+        shr     unused_lanes, 4
+        imul    lane_data, lane, _LANE_DATA_size
+        mov     dword [job + _status], STS_BEING_PROCESSED
+        lea     lane_data, [state + _ldata + lane_data]
+        mov     [state + _unused_lanes], unused_lanes
+        mov     DWORD(len), [job + _len]
+
+        ; Store (len << 4) | lane so that the minimum search below yields
+        ; both the shortest length and the lane that owns it.
+        shl     len, 4
+        or      len, lane
+
+        mov     [lane_data + _job_in_lane], job
+        mov     [state + _lens + 4*lane], DWORD(len)
+
+        ; Load digest words from result_digest
+        ; (movdqa: assumes result_digest is 16-byte aligned - TODO confirm)
+        movdqa  xmm0, [job + _result_digest + 0*16]
+        mov     DWORD(tmp), [job + _result_digest + 1*16]
+        ; Scatter into the lane-interleaved array: word k of this lane
+        ; lives at _args_digest + 4*lane + k*16.
+        movd    [state + _args_digest + 4*lane + 0*16], xmm0
+        pextrd  [state + _args_digest + 4*lane + 1*16], xmm0, 1
+        pextrd  [state + _args_digest + 4*lane + 2*16], xmm0, 2
+        pextrd  [state + _args_digest + 4*lane + 3*16], xmm0, 3
+        mov     [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+        mov     p, [job + _buffer]
+        mov     [state + _args_data_ptr + 8*lane], p
+
+        add     dword [state + _num_lanes_inuse], 1
+        ; Keep collecting jobs until only 0xF is left in the lane list
+        ; (presumably the end-of-list marker, i.e. all 4 lanes are taken).
+        cmp     unused_lanes, 0xF
+        jne     return_null
+
+start_loop:
+        ; Find min length
+        ; Unsigned minimum of the four packed (len << 4 | lane) values.
+        mov     DWORD(lens0), [state + _lens + 0*4]
+        mov     idx, lens0
+        mov     DWORD(lens1), [state + _lens + 1*4]
+        cmp     lens1, idx
+        cmovb   idx, lens1
+        mov     DWORD(lens2), [state + _lens + 2*4]
+        cmp     lens2, idx
+        cmovb   idx, lens2
+        mov     DWORD(lens3), [state + _lens + 3*4]
+        cmp     lens3, idx
+        cmovb   idx, lens3
+        ; Split the minimum: low nibble -> lane index, rest -> length.
+        mov     len2, idx
+        and     idx, 0xF
+        and     len2, ~0xF
+        jz      len_is_0
+
+        ; Charge the minimum length to every lane, then turn len2 into the
+        ; plain length that is handed to sha1_mb_x4_sse.
+        sub     lens0, len2
+        sub     lens1, len2
+        sub     lens2, len2
+        sub     lens3, len2
+        shr     len2, 4
+        mov     [state + _lens + 0*4], DWORD(lens0)
+        mov     [state + _lens + 1*4], DWORD(lens1)
+        mov     [state + _lens + 2*4], DWORD(lens2)
+        mov     [state + _lens + 3*4], DWORD(lens3)
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call    sha1_mb_x4_sse
+        ; state and idx are intact
+
+len_is_0:
+        ; process completed job "idx"
+        imul    lane_data, idx, _LANE_DATA_size
+        lea     lane_data, [state + _ldata + lane_data]
+
+        ; Detach the job from its lane, mark it completed and push the
+        ; lane number back onto the unused_lanes nibble stack.
+        mov     job_rax, [lane_data + _job_in_lane]
+        mov     unused_lanes, [state + _unused_lanes]
+        mov     qword [lane_data + _job_in_lane], 0
+        mov     dword [job_rax + _status], STS_COMPLETED
+        shl     unused_lanes, 4
+        or      unused_lanes, idx
+        mov     [state + _unused_lanes], unused_lanes
+
+        sub     dword [state + _num_lanes_inuse], 1
+
+        ; Gather the five digest words of lane idx back into the job.
+        movd    xmm0, [state + _args_digest + 4*idx + 0*16]
+        pinsrd  xmm0, [state + _args_digest + 4*idx + 1*16], 1
+        pinsrd  xmm0, [state + _args_digest + 4*idx + 2*16], 2
+        pinsrd  xmm0, [state + _args_digest + 4*idx + 3*16], 3
+        mov     DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+        movdqa  [job_rax + _result_digest + 0*16], xmm0
+        mov     [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+        ; Epilogue: reload everything the prologue spilled. rax holds the
+        ; completed job, or 0 when arriving from return_null.
+
+%ifidn __OUTPUT_FORMAT__, win64
+        movdqa  xmm6, [rsp + 8*4 + 16*0]
+        movdqa  xmm7, [rsp + 8*4 + 16*1]
+        movdqa  xmm8, [rsp + 8*4 + 16*2]
+        movdqa  xmm9, [rsp + 8*4 + 16*3]
+        movdqa  xmm10, [rsp + 8*4 + 16*4]
+        movdqa  xmm11, [rsp + 8*4 + 16*5]
+        movdqa  xmm12, [rsp + 8*4 + 16*6]
+        movdqa  xmm13, [rsp + 8*4 + 16*7]
+        movdqa  xmm14, [rsp + 8*4 + 16*8]
+        movdqa  xmm15, [rsp + 8*4 + 16*9]
+        mov     rsi, [rsp + 8*1]
+        mov     rdi, [rsp + 8*2]
+%endif
+        mov     rbx, [rsp + 8*0]
+        mov     rbp, [rsp + 8*3]
+        add     rsp, STACK_SPACE
+
+        ret
+
+return_null:
+        ; Free lanes remain: nothing completed.
+        xor     job_rax, job_rax
+        jmp     return
+
+
+section .data align=16
+
+align 16
+; SHA-1 initial hash values. Not referenced by the code in this file.
+H0:     dd  0x67452301
+H1:     dd  0xefcdab89
+H2:     dd  0x98badcfe
+H3:     dd  0x10325476
+H4:     dd  0xc3d2e1f0
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm
new file mode 100644
index 000000000..979324de4
--- /dev/null
+++
b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm @@ -0,0 +1,290 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha1_mb_x4_sse
+extern sha1_ni_x2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1    rcx
+%define arg2    rdx
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len        rsi
+%define idx             rsi
+
+%define size_offset     rdi
+%define tmp2            rdi
+
+%else
+; LINUX register definitions
+%define arg1    rdi
+%define arg2    rsi
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len        rdx
+%define idx             rdx
+
+%define size_offset     rcx
+%define tmp2            rcx
+
+%endif
+
+; Common definitions
+; Several names alias one register (lane/tmp3/lens3, tmp/lens1,
+; lane_data/lens2, ...); each alias is only live in its own phase.
+%define state   arg1
+%define job     arg2
+%define len2    arg2
+%define p2      arg2
+
+%define p               r11
+%define start_offset    r11
+
+%define unused_lanes    rbx
+
+%define job_rax         rax
+%define len             rax
+
+%define lane            rbp
+%define tmp3            rbp
+%define lens3           rbp
+
+%define extra_blocks    r8
+%define lens0           r8
+
+%define tmp             r9
+%define lens1           r9
+
+%define lane_data       r10
+%define lens2           r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+; Layout: 6 GPR slots (8*6: rbx, rsi, rdi, rbp, r12, r13), then 10 XMM
+; slots (16*10), then 8 bytes of pad. The XMM area therefore starts at
+; rsp + 8*6, which is 16-byte aligned after the sub below.
+%define STACK_SPACE     8*6 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_sse_ni(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : state (rcx on win64, rdi otherwise - see arg1 above)
+; arg 2 : job   (rdx on win64, rsi otherwise - see arg2 above)
+;
+; Places "job" in a free lane. Returns NULL (rax = 0) until enough lanes
+; are occupied (two for the SHA-NI path, four for the plain SSE path);
+; then hashes for the shortest remaining length and returns the job that
+; finished.
+mk_global sha1_mb_mgr_submit_sse_ni, function
+sha1_mb_mgr_submit_sse_ni:
+        endbranch
+
+        sub     rsp, STACK_SPACE
+        mov     [rsp + 8*0], rbx
+        mov     [rsp + 8*3], rbp
+        mov     [rsp + 8*4], r12
+        mov     [rsp + 8*5], r13
+%ifidn __OUTPUT_FORMAT__, win64
+        mov     [rsp + 8*1], rsi
+        mov     [rsp + 8*2], rdi
+        ; The XMM save area must begin after all six GPR slots. Starting it
+        ; at 8*4 would make the xmm6 slot overlap the r12/r13 slots, so the
+        ; epilogue would reload r12/r13 with xmm6's contents.
+        movdqa  [rsp + 8*6 + 16*0], xmm6
+        movdqa  [rsp + 8*6 + 16*1], xmm7
+        movdqa  [rsp + 8*6 + 16*2], xmm8
+        movdqa  [rsp + 8*6 + 16*3], xmm9
+        movdqa  [rsp + 8*6 + 16*4], xmm10
+        movdqa  [rsp + 8*6 + 16*5], xmm11
+        movdqa  [rsp + 8*6 + 16*6], xmm12
+        movdqa  [rsp + 8*6 + 16*7], xmm13
+        movdqa  [rsp + 8*6 + 16*8], xmm14
+        movdqa  [rsp + 8*6 + 16*9], xmm15
+%endif
+
+        ; unused_lanes is a stack of 4-bit lane numbers: pop the low nibble
+        ; as the lane for this job and write the shortened list back.
+        mov     unused_lanes, [state + _unused_lanes]
+        movzx   lane, BYTE(unused_lanes)
+        and     lane, 0xF
+        shr     unused_lanes, 4
+        imul    lane_data, lane, _LANE_DATA_size
+        mov     dword [job + _status], STS_BEING_PROCESSED
+        lea     lane_data, [state + _ldata + lane_data]
+        mov     [state + _unused_lanes], unused_lanes
+        mov     DWORD(len), [job + _len]
+
+        ; Store (len << 4) | lane so that the minimum search below yields
+        ; both the shortest length and the lane that owns it.
+        shl     len, 4
+        or      len, lane
+
+        mov     [lane_data + _job_in_lane], job
+        mov     [state + _lens + 4*lane], DWORD(len)
+
+        ; Load digest words from result_digest
+        movdqa  xmm0, [job + _result_digest + 0*16]
+        mov     DWORD(tmp), [job + _result_digest + 1*16]
+        ; Scatter into the lane-interleaved array: word k of this lane
+        ; lives at _args_digest + 4*lane + k*16.
+        movd    [state + _args_digest + 4*lane + 0*16], xmm0
+        pextrd  [state + _args_digest + 4*lane + 1*16], xmm0, 1
+        pextrd  [state + _args_digest + 4*lane + 2*16], xmm0, 2
+        pextrd  [state + _args_digest + 4*lane + 3*16], xmm0, 3
+        mov     [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+        mov     p, [job + _buffer]
+        mov     [state + _args_data_ptr + 8*lane], p
+
+        add     dword [state + _num_lanes_inuse], 1
+
+        ; compare with shani-sb threshold, if num_lanes_sse <= threshold, using shani func
+        %if SHA1_NI_SB_THRESHOLD_SSE >= 4       ; there are 4 lanes in sse mb
+        ; This wait condition belongs to the SHA-NI path only. If it also
+        ; guarded the %else path, the "cmp unused_lanes, 0xF" there could
+        ; never fall through.
+        cmp     unused_lanes, 0xF32     ; we will process two jobs at the same time
+        jne     return_null             ; wait for another sha_ni job
+
+        ; shani glue code
+        ; Minimum of the two occupied lanes (0 and 1) only.
+        mov     DWORD(lens0), [state + _lens + 0*4]
+        mov     idx, lens0
+        mov     DWORD(lens1), [state + _lens + 1*4]
+        cmp     lens1, idx
+        cmovb   idx, lens1
+        mov     len2, idx
+        and     idx, 0xF
+        and     len2, ~0xF
+        jz      len_is_0
+        ; lensN-len2=idx
+        sub     lens0, len2
+        sub     lens1, len2
+
+        shr     len2, 4
+        mov     [state + _lens + 0*4], DWORD(lens0)
+        mov     [state + _lens + 1*4], DWORD(lens1)
+        mov     r10, idx
+        or      r10, 0x1000     ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2, idx and nlane in r10
+        call    sha1_ni_x2
+        ; state and idx are intact
+
+        %else
+        ; original mb code
+        ; Keep collecting jobs until only 0xF is left in the lane list.
+        cmp     unused_lanes, 0xF
+        jne     return_null
+
+start_loop:
+        ; Find min length
+        mov     DWORD(lens0), [state + _lens + 0*4]
+        mov     idx, lens0
+        mov     DWORD(lens1), [state + _lens + 1*4]
+        cmp     lens1, idx
+        cmovb   idx, lens1
+        mov     DWORD(lens2), [state + _lens + 2*4]
+        cmp     lens2, idx
+        cmovb   idx, lens2
+        mov     DWORD(lens3), [state + _lens + 3*4]
+        cmp     lens3, idx
+        cmovb   idx, lens3
+        mov     len2, idx
+        and     idx, 0xF
+        and     len2, ~0xF
+        jz      len_is_0
+
+        sub     lens0, len2
+        sub     lens1, len2
+        sub     lens2, len2
+        sub     lens3, len2
+        shr     len2, 4
+        mov     [state + _lens + 0*4], DWORD(lens0)
+        mov     [state + _lens + 1*4], DWORD(lens1)
+        mov     [state + _lens + 2*4], DWORD(lens2)
+        mov     [state + _lens + 3*4], DWORD(lens3)
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call    sha1_mb_x4_sse
+        ; state and idx are intact
+        %endif
+
+len_is_0:
+        ; process completed job "idx"
+        imul    lane_data, idx, _LANE_DATA_size
+        lea     lane_data, [state + _ldata + lane_data]
+
+        ; Detach the job from its lane, mark it completed and push the
+        ; lane number back onto the unused_lanes nibble stack.
+        mov     job_rax, [lane_data + _job_in_lane]
+        mov     unused_lanes, [state + _unused_lanes]
+        mov     qword [lane_data + _job_in_lane], 0
+        mov     dword [job_rax + _status], STS_COMPLETED
+        shl     unused_lanes, 4
+        or      unused_lanes, idx
+        mov     [state + _unused_lanes], unused_lanes
+
+        sub     dword [state + _num_lanes_inuse], 1
+
+        ; Gather the five digest words of lane idx back into the job.
+        movd    xmm0, [state + _args_digest + 4*idx + 0*16]
+        pinsrd  xmm0, [state + _args_digest + 4*idx + 1*16], 1
+        pinsrd  xmm0, [state + _args_digest + 4*idx + 2*16], 2
+        pinsrd  xmm0, [state + _args_digest + 4*idx + 3*16], 3
+        mov     DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+        movdqa  [job_rax + _result_digest + 0*16], xmm0
+        mov     [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+        ; Epilogue: mirror of the prologue (XMM area at rsp + 8*6). rax
+        ; holds the completed job, or 0 when arriving from return_null.
+
+%ifidn __OUTPUT_FORMAT__, win64
+        movdqa  xmm6, [rsp + 8*6 + 16*0]
+        movdqa  xmm7, [rsp + 8*6 + 16*1]
+        movdqa  xmm8, [rsp + 8*6 + 16*2]
+        movdqa  xmm9, [rsp + 8*6 + 16*3]
+        movdqa  xmm10, [rsp + 8*6 + 16*4]
+        movdqa  xmm11, [rsp + 8*6 + 16*5]
+        movdqa  xmm12, [rsp + 8*6 + 16*6]
+        movdqa  xmm13, [rsp + 8*6 + 16*7]
+        movdqa  xmm14, [rsp + 8*6 + 16*8]
+        movdqa  xmm15, [rsp + 8*6 + 16*9]
+        mov     rsi, [rsp + 8*1]
+        mov     rdi, [rsp + 8*2]
+%endif
+        mov     rbx, [rsp + 8*0]
+        mov     rbp, [rsp + 8*3]
+        mov     r12, [rsp + 8*4]
+        mov     r13, [rsp + 8*5]
+        add     rsp, STACK_SPACE
+
+        ret
+
+return_null:
+        ; Not enough lanes are occupied yet: nothing completed.
+        xor     job_rax, job_rax
+        jmp     return
+
+section .data align=16
+
+align 16
+; SHA-1 initial hash values. Not referenced by the code in this file.
+H0:     dd  0x67452301
+H1:     dd  0xefcdab89
+H2:     dd  0x98badcfe
+H3:     dd  0x10325476
+H4:     dd  0xc3d2e1f0
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_submit_sse_ni
+ no_sha1_mb_mgr_submit_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c
new file mode 100644
index 000000000..3925a6f4b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c
@@ -0,0 +1,159 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <openssl/sha.h> +#include "sha1_mb.h" +#include "endian_helper.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 200 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS]; + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, fail = 0; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + int ret; + + printf("multibinary_sha1 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN); + + srand(TEST_SEED); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha1_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data 
= (void *)((uint64_t) i); + + // SSL test + SHA1(bufs[i], TEST_LEN, digest_ssl[i]); + + // sb_sha1 test + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sha1_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_be32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sha1_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Random buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run SSL test + SHA1(bufs[i], lens[i], digest_ssl[i]); + + // Run sb_sha1 test + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha1_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_be32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha1_ssl rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c new file mode 100644 index 000000000..4eeeaba0a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c @@ -0,0 +1,202 @@ 
+/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include "sha1_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS]; + +// Compare against reference function +extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len); + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[TEST_BUFS]; + uint32_t i, j, fail = 0; + unsigned char *bufs[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + uint8_t *tmp_buf; + int ret; + + printf("multibinary_sha1 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha1_ctx_mgr_init(mgr); + + srand(TEST_SEED); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contexts + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sha1_ref(bufs[i], digest_ref[i], TEST_LEN); + + // Run sb_sha1 test + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sha1_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d " + "fail 0x%08X <=> 0x%08X \n", + i, j, 
ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sha1_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Use buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run reference test + sha1_ref(bufs[i], digest_ref[i], lens[i]); + + // Run sha1_mb test + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha1_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail " + "0x%08X <=> 0x%08X\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + // Test at the end of buffer + jobs = rand() % TEST_BUFS; + tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs); + if (!tmp_buf) { + printf("malloc failed, end test aborted.\n"); + return 1; + } + + rand_buffer(tmp_buf, jobs); + + sha1_ctx_mgr_init(mgr); + + // Extend to the end of allocated buffer to construct jobs + for (i = 0; i < jobs; i++) { + bufs[i] = (uint8_t *) & tmp_buf[i]; + lens[i] = jobs - i; + + // Reference test + sha1_ref(bufs[i], digest_ref[i], lens[i]); + + // sb_sha1 test + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha1_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("End test failed at offset %d - result: 0x%08X" + ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + putchar('.'); + + 
if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha1 rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c new file mode 100644 index 000000000..aaa52a0ff --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c @@ -0,0 +1,297 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include "sha1_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define UPDATE_SIZE 13*SHA1_BLOCK_SIZE +#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA1_BLOCK_SIZE)) + +#ifdef DEBUG +# define debug_char(x) putchar(x) +#else +# define debug_char(x) do {} while (0) +#endif + +/* Reference digest global to reduce stack usage */ +static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS]; + +extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len); + +// Generates pseudo-random data + +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, fail = 0; + int len_done, len_rem, len_rand; + unsigned char *bufs[TEST_BUFS]; + unsigned char *buf_ptr[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int joblen, jobs, t; + int ret; + + printf("multibinary_sha1_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + srand(TEST_SEED); + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + 
} + + sha1_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocte and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + buf_ptr[i] = bufs[i]; + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sha1_ref(bufs[i], digest_ref[i], TEST_LEN); + } + + // Run sb_sha1 tests + for (i = 0; i < TEST_BUFS;) { + len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_done == 0) + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_FIRST); + else if (len_rem <= UPDATE_SIZE) + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + // Add jobs while available or finished + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + } + + // Start flushing finished jobs, end on last flushed + ctx = sha1_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sha1_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + + len_done = (int)((unsigned long)buf_ptr[i] + - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_rem <= UPDATE_SIZE) + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + if (ctx == NULL) + ctx = sha1_ctx_mgr_flush(mgr); + } + + // Check digests + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != 
digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d fail %8X <=> %8X", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + for (i = 0; i < jobs; i++) { + joblen = rand() % (TEST_LEN); + rand_buffer(bufs[i], joblen); + lens[i] = joblen; + buf_ptr[i] = bufs[i]; + sha1_ref(bufs[i], digest_ref[i], lens[i]); + } + + sha1_ctx_mgr_init(mgr); + + // Run sha1_sb jobs + i = 0; + while (i < jobs) { + // Submit a new job + len_rand = SHA1_BLOCK_SIZE + + SHA1_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS); + + if (lens[i] > len_rand) + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_FIRST); + else + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], lens[i], HASH_ENTIRE); + + // Returned ctx could be: + // - null context (we are just getting started and lanes aren't full yet), or + // - finished already (an ENTIRE we submitted or a previous LAST is returned), or + // - an unfinished ctx, we will resubmit + + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } else { + // unfinished ctx returned, choose another random update length and submit either + // UPDATE or LAST depending on the amount of buffer remaining + while ((ctx != NULL) && !(hash_ctx_complete(ctx))) { + j = (unsigned long)(ctx->user_data); // Get index of the returned ctx + buf_ptr[j] = bufs[j] + ctx->total_length; + len_rand = (rand() % SHA1_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + len_rem = lens[j] - ctx->total_length; + + if (len_rem <= len_rand) // submit the rest of the job as LAST + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rem, HASH_LAST); + else // submit the random update length as UPDATE + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rand, + HASH_UPDATE); + } // Either continue submitting any contexts returned here as 
UPDATE/LAST, or + // go back to submitting new jobs using the index i. + + i++; + } + } + + // Start flushing finished jobs, end on last flushed + ctx = sha1_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sha1_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer + len_rem = lens[i] - ctx->total_length; + len_rand = (rand() % SHA1_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + debug_char('+'); + if (len_rem <= len_rand) + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_UPDATE); + + if (ctx == NULL) + ctx = sha1_ctx_mgr_flush(mgr); + } + + // Check result digest + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail %8X <=> %8X\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha1_update rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c new file mode 100644 index 000000000..6261bbf44 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c @@ -0,0 +1,233 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "sha1_mb.h" +#include "endian_helper.h" + +typedef uint32_t DigestSHA1[SHA1_DIGEST_NWORDS]; + +#define MSGS 7 +#define NUM_JOBS 1000 + +#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS +static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"; +static DigestSHA1 expResultDigest1 = + { 0x84983E44, 0x1C3BD26E, 0xBAAE4AA1, 0xF95129E5, 0xE54670F1 }; + +static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO"; +static DigestSHA1 expResultDigest2 = + { 0xB7C66452, 0x0FD122B3, 0x55D539F2, 0xA35E6FAA, 0xC2A5A11D }; + +static uint8_t msg3[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<"; +static DigestSHA1 expResultDigest3 = + { 0x127729B6, 0xA8B2F8A0, 0xA4DDC819, 0x08E1D8B3, 0x67CEEA55 }; + +static uint8_t msg4[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR"; +static DigestSHA1 expResultDigest4 = + { 0xFDDE2D00, 0xABD5B7A3, 0x699DE6F2, 0x3FF1D1AC, 0x3B872AC2 }; + +static uint8_t msg5[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?"; +static DigestSHA1 expResultDigest5 = + { 0xE7FCA85C, 0xA4AB3740, 0x6A180B32, 0x0B8D362C, 0x622A96E6 }; + +static uint8_t msg6[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU"; +static DigestSHA1 
expResultDigest6 = + { 0x505B0686, 0xE1ACDF42, 0xB3588B5A, 0xB043D52C, 0x6D8C7444 }; + +static uint8_t msg7[] = ""; +static DigestSHA1 expResultDigest7 = + { 0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709 }; + +static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 }; + +static uint32_t *expResultDigest[MSGS] = { + expResultDigest1, expResultDigest2, expResultDigest3, + expResultDigest4, expResultDigest5, expResultDigest6, + expResultDigest7 +}; + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL; + uint32_t i, j, k, t, checked = 0; + uint32_t *good; + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha1_ctx_mgr_init(mgr); + + // Init contexts before first use + for (i = 0; i < MSGS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + for (i = 0; i < MSGS; i++) { + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], msgs[i], + strlen((char *)msgs[i]), HASH_ENTIRE); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." 
+ " Error code: %d", ctx->error); + return -1; + } + + } + } + + while (1) { + ctx = sha1_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + // do larger test in pseudo-random order + + // Init contexts before first use + for (i = 0; i < NUM_JOBS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + checked = 0; + for (i = 0; i < NUM_JOBS; i++) { + j = PSEUDO_RANDOM_NUM(i); + ctx = sha1_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE); + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. Error code: %d", ctx->error); + return -1; + } + + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + } + } + while (1) { + ctx = sha1_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." 
+ " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + if (checked != NUM_JOBS) { + printf("only tested %d rather than %d\n", checked, NUM_JOBS); + return -1; + } + + printf(" multibinary_sha1 test: Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c new file mode 100644 index 000000000..bd8e5e527 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c @@ -0,0 +1,128 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <openssl/sha.h> +#include "sha1_mb.h" +#include "test.h" + +// Set number of outstanding jobs +#define TEST_BUFS 32 + +#ifdef CACHED_TEST +// Loop many times over same data +# define TEST_LEN 4*1024 +# define TEST_LOOPS 10000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS]; + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail"); + return -1; + } + sha1_ctx_mgr_init(mgr); + + // Start OpenSSL tests + perf_start(&start); + 
for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + SHA1(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("sha1_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + + while (sha1_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_sha1" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA1_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_be32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + + printf("Multi-buffer sha1 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha1_ossl_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c new file mode 100644 index 000000000..0b4438d53 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c @@ -0,0 +1,132 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <openssl/sha.h>
#include "sha1_mb.h"
#include "test.h"

// Set number of outstanding jobs
#define TEST_BUFS SHA1_MAX_LANES

#ifdef CACHED_TEST
// Loop many times over same data
# define TEST_LEN     (4*1024)
# define TEST_LOOPS   10000
# define TEST_TYPE_STR "_warm"
#else
// Uncached test.  Pull from large mem base.
# define GT_L3_CACHE  (32*1024*1024)	/* some number > last level cache */
# define TEST_LEN     (GT_L3_CACHE / TEST_BUFS)
# define TEST_LOOPS   100
# define TEST_TYPE_STR "_cold"
#endif

/* Total bytes hashed per implementation; widened because the cold
 * configuration does not fit in a 32-bit int. */
#define TEST_MEM ((long long)TEST_LEN * TEST_BUFS * TEST_LOOPS)

/* Reference digest global to reduce stack usage */
static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS];

/*
 * Lane-shortage benchmark: time OpenSSL SHA1() once, then time the
 * multi-buffer manager while submitting TEST_BUFS, TEST_BUFS-1, ..., 1 jobs
 * per round, cross-checking the digests after each lane count.
 * Returns 1 / -1 on allocation failure, otherwise the number of mismatching
 * digest words (0 on success).
 */
int main(void)
{
	SHA1_HASH_CTX_MGR *mgr = NULL;
	SHA1_HASH_CTX ctxpool[TEST_BUFS];
	unsigned char *bufs[TEST_BUFS] = { NULL };	/* NULL so cleanup can free unconditionally */
	uint32_t i, j, t, fail = 0;
	uint32_t nlanes;
	struct perf start, stop;
	int rc;
	int ret;

	for (i = 0; i < TEST_BUFS; i++) {
		bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
		if (bufs[i] == NULL) {
			printf("calloc failed test aborted\n");
			rc = 1;
			goto cleanup;
		}
		// Init ctx contents
		hash_ctx_init(&ctxpool[i]);
		ctxpool[i].user_data = (void *)((uint64_t) i);
	}

	ret = posix_memalign((void **)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
	if (ret) {
		printf("alloc error: Fail\n");
		mgr = NULL;	/* contents are unspecified after a failed call */
		rc = -1;
		goto cleanup;
	}
	sha1_ctx_mgr_init(mgr);

	// Start OpenSSL tests
	perf_start(&start);
	for (t = 0; t < TEST_LOOPS; t++) {
		for (i = 0; i < TEST_BUFS; i++)
			SHA1(bufs[i], TEST_LEN, digest_ssl[i]);
	}
	perf_stop(&stop);

	printf("sha1_openssl" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)TEST_LEN * i * t);

	// Start mb shortage tests
	for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) {
		perf_start(&start);
		for (t = 0; t < TEST_LOOPS; t++) {
			for (i = 0; i < nlanes; i++)
				sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN,
						    HASH_ENTIRE);

			while (sha1_ctx_mgr_flush(mgr)) ;
		}
		perf_stop(&stop);

		printf("multibinary_sha1" TEST_TYPE_STR " with %u lanes: ", nlanes);
		// i == nlanes and t == TEST_LOOPS here: bytes hashed in this round
		perf_print(stop, start, (long long)TEST_LEN * i * t);

		for (i = 0; i < nlanes; i++) {
			for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
				uint32_t ssl_word;

				// memcpy instead of a uint8_t* -> uint32_t* cast: the
				// byte array carries no alignment guarantee
				memcpy(&ssl_word, &digest_ssl[i][4 * j], sizeof(ssl_word));
				ssl_word = to_be32(ssl_word);

				if (ctxpool[i].job.result_digest[j] != ssl_word) {
					fail++;
					printf("Test%u, digest%u fail %08X <=> %08X\n",
					       i, j, ctxpool[i].job.result_digest[j],
					       ssl_word);
				}
			}
		}
	}

	printf("Multi-buffer sha1 test complete %d buffers of %d B with "
	       "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);

	if (fail)
		printf("Test failed function check %u\n", fail);
	else
		printf(" multibinary_sha1_ossl_perf: Pass\n");

	rc = fail;

cleanup:
	for (i = 0; i < TEST_BUFS; i++)
		free(bufs[i]);
	free(mgr);

	return rc;
}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm new file mode 100644 index 000000000..d64ffe2bd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm @@ -0,0 +1,563 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +[bits 64] +default rel +section .text + +;; code to compute oct SHA1 using AVX-512 +;; outer calling routine takes care of save and restore of XMM registers + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31 +;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp r8 +;; +;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp r8 +;; +;; clobbers zmm0-31 + +%define APPEND(a,b) a %+ b + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg1 rcx ; arg0 preserved + %define arg2 rdx ; arg1 + %define reg3 r8 ; arg2 preserved + %define reg4 r9 ; arg3 + %define var1 rdi + %define var2 rsi + %define local_func_decl(func_name) global func_name + %else + %define arg1 rdi ; arg0 + %define arg2 rsi ; arg1 + %define var1 rdx ; arg2 + %define var2 rcx ; arg3 + %define local_func_decl(func_name) mk_global func_name, function, internal +%endif + +%define state arg1 +%define num_blks arg2 + +%define IN (state + _data_ptr) +%define DIGEST state +%define SIZE num_blks + +%define IDX var1 + +%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define E zmm4 +%define KT zmm5 +%define AA zmm6 +%define BB zmm7 +%define CC zmm8 +%define DD zmm9 +%define EE zmm10 +%define TMP0 zmm11 
+%define TMP1 zmm12 +%define TMP2 zmm13 + +%define W0 zmm16 +%define W1 zmm17 +%define W2 zmm18 +%define W3 zmm19 +%define W4 zmm20 +%define W5 zmm21 +%define W6 zmm22 +%define W7 zmm23 +%define W8 zmm24 +%define W9 zmm25 +%define W10 zmm26 +%define W11 zmm27 +%define W12 zmm28 +%define W13 zmm29 +%define W14 zmm30 +%define W15 zmm31 + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 rax + +%macro TRANSPOSE16 18 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define %%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%t0 %17 +%define %%t1 %18 + +; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0} +; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0} +; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0} +; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0} +; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0} +; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0} +; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0} +; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0} +; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0} + +; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {p2 o2 n2 m2 
l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} +; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} +; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} +; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} +; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} +; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} +; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} +; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} +; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + + + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2} + + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0} + + ; use r2 in place of t0 + vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 
g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2} + + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0} + + ; use r6 in place of t0 + vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0} + vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2} + vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0} + vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2} + + vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1} + vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2} + vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3} + vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0} + + ; use r10 in place of t0 + vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0} + vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2} + vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0} + vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2} + + vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1} + vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2} + vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 o15 n15 m15 
p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3} + vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0} + +;; At this point, the registers that contain interesting data are: +;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12 +;; Can use t1 and r14 as scratch registers + + vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0} + vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4} + + vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1} + vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5} + + vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6} + + vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3} + vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7} + + vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0} + vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4} + + vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1} + vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5} + + 
vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2} + vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6} + + vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3} + vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7} + +;; At this point r8 and r12 can be used as scratch registers + + vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} + vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} + + vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} + vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} + + vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} + vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + + vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} + + vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} + vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} + + vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} + vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + + vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} + vshuff64x2 %%r3, 
%%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} + + vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} + vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} + + vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + +%endmacro + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro PROCESS_LOOP 2 +%define %%WT %1 +%define %%F_IMMED %2 + + ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt + ; E=D, D=C, C=ROTL_30(B), B=A, A=T + + ; Ft + ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D) + ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D + ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D) + + vmovdqa32 TMP1, B ; Copy B + vpaddd E, E, %%WT ; E = E + Wt + vpternlogd TMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D) + vpaddd E, E, KT ; E = E + Wt + Kt + vprold TMP0, A, 5 ; TMP0 = ROTL_5(A) + vpaddd E, E, TMP1 ; E = Ft(B,C,D) + E + Kt + Wt + vprold B, B, 30 ; B = ROTL_30(B) + vpaddd E, E, TMP0 ; E = T + + ROTATE_ARGS +%endmacro + +%macro MSG_SCHED_ROUND_16_79 4 +%define %%WT %1 +%define %%WTp2 %2 +%define %%WTp8 %3 +%define %%WTp13 %4 + ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16) + ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt) + vpternlogd %%WT, %%WTp2, %%WTp8, 0x96 + vpxord %%WT, %%WT, %%WTp13 + vprold %%WT, %%WT, 1 +%endmacro + +; Note this is reading in a block of data for one lane +; When all 16 are read, the data must be transposed to build msg schedule +%macro MSG_SCHED_ROUND_00_15 2 +%define %%WT %1 +%define %%OFFSET %2 + mov inp0, [IN + (%%OFFSET*8)] + vmovups %%WT, [inp0+IDX] +%endmacro + +align 64 + +; void sha1_mb_x16_avx512(SHA1_MB_ARGS_X16, uint32_t size) +; arg 1 : pointer to input data +; arg 2 : size (in blocks) ;; assumed to be >= 1 +local_func_decl(sha1_mb_x16_avx512) +sha1_mb_x16_avx512: + endbranch + + ;; Initialize digests 
+ vmovups A, [DIGEST + 0*64] + vmovups B, [DIGEST + 1*64] + vmovups C, [DIGEST + 2*64] + vmovups D, [DIGEST + 3*64] + vmovups E, [DIGEST + 4*64] + + xor IDX, IDX + + ;; transpose input onto stack + mov inp0, [IN + 0*8] + mov inp1, [IN + 1*8] + mov inp2, [IN + 2*8] + mov inp3, [IN + 3*8] + mov inp4, [IN + 4*8] + mov inp5, [IN + 5*8] + mov inp6, [IN + 6*8] + mov inp7, [IN + 7*8] + + vmovups W0,[inp0+IDX] + vmovups W1,[inp1+IDX] + vmovups W2,[inp2+IDX] + vmovups W3,[inp3+IDX] + vmovups W4,[inp4+IDX] + vmovups W5,[inp5+IDX] + vmovups W6,[inp6+IDX] + vmovups W7,[inp7+IDX] + + mov inp0, [IN + 8*8] + mov inp1, [IN + 9*8] + mov inp2, [IN +10*8] + mov inp3, [IN +11*8] + mov inp4, [IN +12*8] + mov inp5, [IN +13*8] + mov inp6, [IN +14*8] + mov inp7, [IN +15*8] + + vmovups W8, [inp0+IDX] + vmovups W9, [inp1+IDX] + vmovups W10,[inp2+IDX] + vmovups W11,[inp3+IDX] + vmovups W12,[inp4+IDX] + vmovups W13,[inp5+IDX] + vmovups W14,[inp6+IDX] + vmovups W15,[inp7+IDX] + +lloop: + vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK] + + add IDX, 64 + + TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 + +%assign I 0 +%rep 16 + vpshufb APPEND(W,I), APPEND(W,I), TMP2 +%assign I (I+1) +%endrep + + ; Save digests for later addition + vmovdqa32 AA, A + vmovdqa32 BB, B + vmovdqa32 CC, C + vmovdqa32 DD, D + vmovdqa32 EE, E + + vmovdqa32 KT, [K00_19] +%assign I 0xCA +%assign J 0 +%assign K 2 +%assign L 8 +%assign M 13 +%assign N 0 +%rep 64 + PROCESS_LOOP APPEND(W,J), I + MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) + %if N = 19 + vmovdqa32 KT, [K20_39] + %assign I 0x96 + %elif N = 39 + vmovdqa32 KT, [K40_59] + %assign I 0xE8 + %elif N = 59 + vmovdqa32 KT, [K60_79] + %assign I 0x96 + %endif +%assign J ((J+1)% 16) +%assign K ((K+1)% 16) +%assign L ((L+1)% 16) +%assign M ((M+1)% 16) +%assign N (N+1) +%endrep + + ; Check if this is the last block + sub SIZE, 1 + je lastLoop + +%assign I 0x96 +%assign J 0 +%rep 16 + PROCESS_LOOP 
APPEND(W,J), I + MSG_SCHED_ROUND_00_15 APPEND(W,J), J +%assign J (J+1) +%endrep + + ; Add old digest (feed-forward); W0..W15 were reloaded with the next block during the rounds above + vpaddd A,A,AA + vpaddd B,B,BB + vpaddd C,C,CC + vpaddd D,D,DD + vpaddd E,E,EE + + jmp lloop + +lastLoop: +; Need to reset argument rotation values to Round 64 values: this label is jumped to right after round 63, but the 16 PROCESS_LOOP expansions assembled above have already rotated the A..E names 16 more times (net one step, since the names cycle with period 5), so undo one ROTATE_ARGS step here +%xdefine TMP_ A +%xdefine A B +%xdefine B C +%xdefine C D +%xdefine D E +%xdefine E TMP_ + + ; Process last 16 rounds (no loads: there is no further block to fetch) +%assign I 0x96 +%assign J 0 +%rep 16 + PROCESS_LOOP APPEND(W,J), I +%assign J (J+1) +%endrep + + ; Add old digest + vpaddd A,A,AA + vpaddd B,B,BB + vpaddd C,C,CC + vpaddd D,D,DD + vpaddd E,E,EE + + ;; advance each of the 16 stored data pointers by IDX, the number of bytes consumed +%assign I 0 +%rep 8 + mov inp0, [IN + (2*I)*8] + mov inp1, [IN + (2*I +1)*8] + add inp0, IDX + add inp1, IDX + mov [IN + (2*I)*8], inp0 + mov [IN + (2*I+1)*8], inp1 +%assign I (I+1) +%endrep + + ; Write out digest + ; Do we need to untranspose digests??? + vmovups [DIGEST + 0*64], A + vmovups [DIGEST + 1*64], B + vmovups [DIGEST + 2*64], C + vmovups [DIGEST + 3*64], D + vmovups [DIGEST + 4*64], E + + ret + +section .data +align 64 +K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + 
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000 + dq 0x0000000000000001 + dq 0x0000000000000008 + dq 0x0000000000000009 + dq 0x0000000000000004 + dq 0x0000000000000005 + dq 0x000000000000000C + dq 0x000000000000000D + +PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002 + dq 0x0000000000000003 + dq 0x000000000000000A + dq 0x000000000000000B + dq 0x0000000000000006 + dq 0x0000000000000007 + dq 0x000000000000000E + dq 0x000000000000000F + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha1_mb_x16_avx512 +no_sha1_mb_x16_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm new file mode 100644 index 000000000..eb67309da --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm @@ -0,0 +1,416 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute quad SHA1 using AVX +;; derived from ...\sha1_multiple\sha1_quad4.asm +;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t1} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro +;; +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = ((B & C) | ((~ B) & D) ) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define 
%%regD %4 +%define %%regT %5 + vpand %%regF, %%regB,%%regC + vpandn %%regT, %%regB,%%regD + vpor %%regF, %%regT,%%regF +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF,%%regD,%%regC + vpxor %%regF,%%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpor %%regF,%%regB,%%regC + vpand %%regT,%%regB,%%regC + vpand %%regF,%%regF,%%regD + vpor %%regF,%%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsrld %%tmp, %%reg, (32-(%%imm)) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PROLD_nd reg, imm, tmp, src +%macro PROLD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsrld %%tmp, %%src, (32-(%%imm)) + vpslld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + vpaddd %%regE, %%regE,%%immCNT + vpaddd %%regE, %%regE,[rsp + (%%memW * 16)] + PROLD_nd %%regT,5, %%regF,%%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE, %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 
+%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16] + vpxor W16, W16, W14 + vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 16] + vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 16] + + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE,%%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; FRAMESZ plus pushes must be an odd multiple of 8 +%define XMM_SAVE ((15-15)*16 + 1*8) +%define FRAMESZ 16*16 + XMM_SAVE +%define _XMM FRAMESZ - XMM_SAVE + +%define VMOVPS vmovups + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%define IDX rax + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 +%define F xmm5 ; tmp +%define G xmm6 ; tmp + +%define TMP G +%define FUN F +%define K xmm7 + +%define AA xmm8 +%define BB xmm9 +%define CC xmm10 +%define DD xmm11 +%define EE xmm12 + +%define T0 xmm6 +%define T1 xmm7 +%define T2 xmm8 +%define T3 xmm9 +%define T4 xmm10 +%define T5 xmm11 + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%define W14 xmm13 +%define W15 xmm14 +%define W16 xmm15 + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + +%define DIGEST_SIZE (4*5*4) + +;%ifdef LINUX +%ifidn __OUTPUT_FORMAT__, elf64 + %define ARG1 rdi + %define ARG2 rsi +%else + ; Windows + %define ARG1 rcx + %define ARG2 rdx +%endif + +align 32 + 
+; void sha1_mb_x4_avx(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks); +; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used) +; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1 +; +; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15 +; +mk_global sha1_mb_x4_avx, function, internal +sha1_mb_x4_avx: + endbranch + + sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8 + + ;; Initialize digests + vmovdqa A, [ARG1 + 0*16] + vmovdqa B, [ARG1 + 1*16] + vmovdqa C, [ARG1 + 2*16] + vmovdqa D, [ARG1 + 3*16] + vmovdqa E, [ARG1 + 4*16] + + ;; load input pointers + mov inp0,[ARG1 + _data_ptr + 0*8] + mov inp1,[ARG1 + _data_ptr + 1*8] + mov inp2,[ARG1 + _data_ptr + 2*8] + mov inp3,[ARG1 + _data_ptr + 3*8] + + xor IDX, IDX +lloop: + vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK] +%assign I 0 +%rep 4 + VMOVPS T2,[inp0+IDX] + VMOVPS T1,[inp1+IDX] + VMOVPS T4,[inp2+IDX] + VMOVPS T3,[inp3+IDX] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vpshufb T0, T0, F + vmovdqa [rsp+(I*4+0)*16],T0 + vpshufb T1, T1, F + vmovdqa [rsp+(I*4+1)*16],T1 + vpshufb T2, T2, F + vmovdqa [rsp+(I*4+2)*16],T2 + vpshufb T3, T3, F + vmovdqa [rsp+(I*4+3)*16],T3 + add IDX, 4*4 +%assign I (I+1) +%endrep + + ; save old digests + vmovdqa AA, A + vmovdqa BB, B + vmovdqa CC, C + vmovdqa DD, D + vmovdqa EE, E + +;; +;; perform 0-79 steps +;; + vmovdqa K, [K00_19] +;; do rounds 0...15 +%assign I 0 +%rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 16...19 + vmovdqa W16, [rsp + ((16 - 16) & 15) * 16] + vmovdqa W15, [rsp + ((16 - 15) & 15) * 16] +%rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 20...39 + vmovdqa K, [K20_39] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 40...59 + vmovdqa K, [K40_59] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 
60...79 + vmovdqa K, [K60_79] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 + ROTATE_ARGS +%assign I (I+1) +%endrep + + vpaddd A,A,AA + vpaddd B,B,BB + vpaddd C,C,CC + vpaddd D,D,DD + vpaddd E,E,EE + + sub ARG2, 1 + jne lloop + + ; write out digests + vmovdqa [ARG1 + 0*16], A + vmovdqa [ARG1 + 1*16], B + vmovdqa [ARG1 + 2*16], C + vmovdqa [ARG1 + 3*16], D + vmovdqa [ARG1 + 4*16], E + + ; update input pointers + add inp0, IDX + mov [ARG1 + _data_ptr + 0*8], inp0 + add inp1, IDX + mov [ARG1 + _data_ptr + 1*8], inp1 + add inp2, IDX + mov [ARG1 + _data_ptr + 2*8], inp2 + add inp3, IDX + mov [ARG1 + _data_ptr + 3*8], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, FRAMESZ + + ret + + +section .data align=16 + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm new file mode 100644 index 000000000..5677dce73 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm @@ -0,0 +1,413 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha1_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute quad SHA1 using SSE +;; derived from ...\sha1_multiple\sha1_quad4.asm +;; variation of sha1_mult2.asm + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t1} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0} + shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0} + shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + shufps %%r2, 
%%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0} + shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2} + shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro +;; +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regC + pxor %%regF,%%regD + pand %%regF,%%regB + pxor %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regD + pxor %%regF,%%regC + pxor %%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regB + movdqa %%regT,%%regB + por %%regF,%%regC + pand %%regT,%%regC + pand %%regF,%%regD + por %%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + pslld %%reg, %%imm + psrld %%tmp, (32-%%imm) + por %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + paddd %%regE,%%immCNT + paddd %%regE,[rsp + (%%memW * 16)] + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC 
%%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + paddd %%regE,%%immCNT + movdqa W14, [rsp + ((%%memW - 14) & 15) * 16] + pxor W16, W14 + pxor W16, [rsp + ((%%memW - 8) & 15) * 16] + pxor W16, [rsp + ((%%memW - 3) & 15) * 16] + movdqa %%regF, W16 + pslld W16, 1 + psrld %%regF, (32-1) + por %%regF, W16 + ROTATE_W + + movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF + paddd %%regE,%%regF + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; FRAMESZ plus pushes must be an odd multiple of 8 +%define XMM_SAVE ((15-15)*16 + 1*8) +%define FRAMESZ 16*16 + XMM_SAVE +%define _XMM FRAMESZ - XMM_SAVE + +%define MOVPS movups + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%define IDX rax + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 +%define F xmm5 ; tmp +%define G xmm6 ; tmp + +%define TMP G +%define FUN F +%define K xmm7 + +%define AA xmm8 +%define BB xmm9 +%define CC xmm10 +%define DD xmm11 +%define EE xmm12 + +%define T0 xmm6 +%define T1 xmm7 +%define T2 xmm8 +%define T3 xmm9 +%define T4 xmm10 +%define T5 xmm11 + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%define W14 xmm13 +%define W15 xmm14 +%define W16 xmm15 + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 
W14 +%xdefine W14 TMP_ +%endm + +%define DIGEST_SIZE (4*5*4) + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define ARG1 rdi + %define ARG2 rsi +%else + ; Windows + %define ARG1 rcx + %define ARG2 rdx +%endif + +align 32 + +; void sha1_mb_x4_sse(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks); +; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used) +; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1 +; +; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15 +; +mk_global sha1_mb_x4_sse, function, internal +sha1_mb_x4_sse: + endbranch + + sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8 + + ;; Initialize digests + movdqa A, [ARG1 + 0*16] + movdqa B, [ARG1 + 1*16] + movdqa C, [ARG1 + 2*16] + movdqa D, [ARG1 + 3*16] + movdqa E, [ARG1 + 4*16] + + ;; load input pointers + mov inp0,[ARG1 + _data_ptr + 0*8] + mov inp1,[ARG1 + _data_ptr + 1*8] + mov inp2,[ARG1 + _data_ptr + 2*8] + mov inp3,[ARG1 + _data_ptr + 3*8] + + xor IDX, IDX +lloop: + movdqa F, [PSHUFFLE_BYTE_FLIP_MASK] +%assign I 0 +%rep 4 + MOVPS T2,[inp0+IDX] + MOVPS T1,[inp1+IDX] + MOVPS T4,[inp2+IDX] + MOVPS T3,[inp3+IDX] + TRANSPOSE T2, T1, T4, T3, T0, T5 + pshufb T0, F + movdqa [rsp+(I*4+0)*16],T0 + pshufb T1, F + movdqa [rsp+(I*4+1)*16],T1 + pshufb T2, F + movdqa [rsp+(I*4+2)*16],T2 + pshufb T3, F + movdqa [rsp+(I*4+3)*16],T3 + add IDX, 4*4 +%assign I (I+1) +%endrep + + ; save old digests + movdqa AA, A + movdqa BB, B + movdqa CC, C + movdqa DD, D + movdqa EE, E + +;; +;; perform 0-79 steps +;; + movdqa K, [K00_19] +;; do rounds 0...15 +%assign I 0 +%rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 16...19 + movdqa W16, [rsp + ((16 - 16) & 15) * 16] + movdqa W15, [rsp + ((16 - 15) & 15) * 16] +%rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 20...39 + movdqa K, [K20_39] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 + ROTATE_ARGS +%assign I 
(I+1) +%endrep + +;; do rounds 40...59 + movdqa K, [K40_59] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 60...79 + movdqa K, [K60_79] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 + ROTATE_ARGS +%assign I (I+1) +%endrep + + paddd A,AA + paddd B,BB + paddd C,CC + paddd D,DD + paddd E,EE + + sub ARG2, 1 + jne lloop + + ; write out digests + movdqa [ARG1 + 0*16], A + movdqa [ARG1 + 1*16], B + movdqa [ARG1 + 2*16], C + movdqa [ARG1 + 3*16], D + movdqa [ARG1 + 4*16], E + + ; update input pointers + add inp0, IDX + mov [ARG1 + _data_ptr + 0*8], inp0 + add inp1, IDX + mov [ARG1 + _data_ptr + 1*8], inp1 + add inp2, IDX + mov [ARG1 + _data_ptr + 2*8], inp2 + add inp3, IDX + mov [ARG1 + _data_ptr + 3*8], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, FRAMESZ + + ret + + +section .data align=16 + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm new file mode 100644 index 000000000..edcba6d3f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm @@ -0,0 +1,518 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "sha1_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

[bits 64]
default rel
section .text

;; code to compute eight SHA1 digests at once using AVX2: every ymm register
;; holds the same 32-bit state/message word for eight independent streams
;; outer calling routine takes care of save and restore of XMM registers

;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
;; Windows clobbers:  rax rbx     rdx rsi rdi        r9 r10 r11 r12 r13 r14 r15
;; Windows preserves:         rcx             rbp r8
;;
;; Linux clobbers:    rax rbx rcx rdx rsi            r9 r10 r11 r12 r13 r14 r15
;; Linux preserves:                       rdi rbp r8
;;
;; clobbers ymm0-15
;;
;; NOTE(review): r12-r15 are used as input pointers below and are not saved
;; by this routine; saving them is left to the caller, as stated above.


; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
; "transpose" data in {r0...r7} using temps {t0...t1}
; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {a7 a6 a5 a4   a3 a2 a1 a0}
; r1 = {b7 b6 b5 b4   b3 b2 b1 b0}
; r2 = {c7 c6 c5 c4   c3 c2 c1 c0}
; r3 = {d7 d6 d5 d4   d3 d2 d1 d0}
; r4 = {e7 e6 e5 e4   e3 e2 e1 e0}
; r5 = {f7 f6 f5 f4   f3 f2 f1 f0}
; r6 = {g7 g6 g5 g4   g3 g2 g1 g0}
; r7 = {h7 h6 h5 h4   h3 h2 h1 h0}
;
; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {h0 g0 f0 e0   d0 c0 b0 a0}
; r1 = {h1 g1 f1 e1   d1 c1 b1 a1}
; r2 = {h2 g2 f2 e2   d2 c2 b2 a2}
; r3 = {h3 g3 f3 e3   d3 c3 b3 a3}
; r4 = {h4 g4 f4 e4   d4 c4 b4 a4}
; r5 = {h5 g5 f5 e5   d5 c5 b5 a5}
; r6 = {h6 g6 f6 e6   d6 c6 b6 a6}
; r7 = {h7 g7 f7 e7   d7 c7 b7 a7}
;
; Done in two stages: vshufps builds 4x4 transposes inside each 128-bit
; half, then vperm2f128 pairs up the matching halves of the top (a..d) and
; bottom (e..h) groups.
;
%macro TRANSPOSE8 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10
	; process top half (r0..r3) {a...d}
	vshufps	%%t0, %%r0, %%r1, 0x44	; t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
	vshufps	%%r0, %%r0, %%r1, 0xEE	; r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
	vshufps	%%t1, %%r2, %%r3, 0x44	; t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
	vshufps	%%r2, %%r2, %%r3, 0xEE	; r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
	vshufps	%%r3, %%t0, %%t1, 0xDD	; r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
	vshufps	%%r1, %%r0, %%r2, 0x88	; r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
	vshufps	%%r0, %%r0, %%r2, 0xDD	; r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
	vshufps	%%t0, %%t0, %%t1, 0x88	; t0 = {d4 c4 b4 a4   d0 c0 b0 a0}

	; use r2 in place of t0
	; process bottom half (r4..r7) {e...h}
	vshufps	%%r2, %%r4, %%r5, 0x44	; r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
	vshufps	%%r4, %%r4, %%r5, 0xEE	; r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
	vshufps	%%t1, %%r6, %%r7, 0x44	; t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
	vshufps	%%r6, %%r6, %%r7, 0xEE	; r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
	vshufps	%%r7, %%r2, %%t1, 0xDD	; r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
	vshufps	%%r5, %%r4, %%r6, 0x88	; r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
	vshufps	%%r4, %%r4, %%r6, 0xDD	; r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
	vshufps	%%t1, %%r2, %%t1, 0x88	; t1 = {h4 g4 f4 e4   h0 g0 f0 e0}

	vperm2f128	%%r6, %%r5, %%r1, 0x13	; h6...a6
	vperm2f128	%%r2, %%r5, %%r1, 0x02	; h2...a2
	vperm2f128	%%r5, %%r7, %%r3, 0x13	; h5...a5
	vperm2f128	%%r1, %%r7, %%r3, 0x02	; h1...a1
	vperm2f128	%%r7, %%r4, %%r0, 0x13	; h7...a7
	vperm2f128	%%r3, %%r4, %%r0, 0x02	; h3...a3
	vperm2f128	%%r4, %%t1, %%t0, 0x13	; h4...a4
	vperm2f128	%%r0, %%t1, %%t0, 0x02	; h0...a0
%endmacro

;;
;; Magic functions defined in FIPS 180-1
;;
;; All four take (regF, regB, regC, regD, regT): the result is left in regF,
;; regB/regC/regD are read only, regT is a scratch register.
;;
;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; F0 = ((B & C) | ((~B) & D))
%macro MAGIC_F0 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
	vpand	%%regF, %%regB,%%regC
	vpandn	%%regT, %%regB,%%regD
	vpor	%%regF, %%regT,%%regF
%endmacro

;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
; regT is accepted for a uniform signature but is not touched here.
%macro MAGIC_F1 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
	vpxor	%%regF,%%regD,%%regC
	vpxor	%%regF,%%regF,%%regB
%endmacro



;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
; computed as ((B | C) & D) | (B & C)
%macro MAGIC_F2 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
	vpor	%%regF,%%regB,%%regC
	vpand	%%regT,%%regB,%%regC
	vpand	%%regF,%%regF,%%regD
	vpor	%%regF,%%regF,%%regT
%endmacro

;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
; same function as MAGIC_F1 (B ^ C ^ D)
%macro MAGIC_F3 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
	MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
%endmacro

; PROLD reg, imm, tmp
; rotate every dword of reg left by imm bits, in place; tmp is scratch
%macro PROLD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
	vpsrld	%%tmp, %%reg, (32-%%imm)
	vpslld	%%reg, %%reg, %%imm
	vpor	%%reg, %%reg, %%tmp
%endmacro

; PROLD_nd reg, imm, tmp, src
; non-destructive variant: reg = every dword of src rotated left by imm bits;
; src is left unchanged, tmp is scratch
%macro PROLD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
	vpsrld	%%tmp, %%src, (32-%%imm)
	vpslld	%%reg, %%src, %%imm
	vpor	%%reg, %%reg, %%tmp
%endmacro

; SHA1_STEP_00_15 regA, regB, regC, regD, regE, regT, regF, memW, immCNT, MAGIC
; One SHA1 step for rounds whose message word is already on the stack:
;   regE += immCNT + W[memW] + rol(regA, 5) + MAGIC(regB, regC, regD)
;   regB  = rol(regB, 30)
; memW   : index of the 32-byte message slot at [rsp + memW*32]
; immCNT : register holding the round constant
; MAGIC  : name of the MAGIC_Fx macro to apply
; regT and regF are scratch. The caller renames A..E afterwards (ROTATE_ARGS).
%macro SHA1_STEP_00_15 10
%define %%regA %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regE %5
%define %%regT %6
%define %%regF %7
%define %%memW %8
%define %%immCNT %9
%define %%MAGIC %10
	vpaddd	%%regE, %%regE,%%immCNT
	vpaddd	%%regE, %%regE,[rsp + (%%memW * 32)]
	PROLD_nd	%%regT,5, %%regF,%%regA
	vpaddd	%%regE, %%regE,%%regT
	%%MAGIC	%%regF,%%regB,%%regC,%%regD,%%regT	;; FUN  = MAGIC_Fi(B,C,D)
	PROLD	%%regB,30, %%regT
	vpaddd	%%regE, %%regE,%%regF
%endmacro

; SHA1_STEP_16_79 regA, regB, regC, regD, regE, regT, regF, memW, immCNT, MAGIC
; Same step as above, but first extends the message schedule:
;   W[memW] = rol(W[memW-16] ^ W[memW-14] ^ W[memW-8] ^ W[memW-3], 1)
; On entry the symbols W16 and W15 name the registers holding W[memW-16] and
; W[memW-15]; W[memW-14] is loaded here and the other two terms are read from
; the 16-entry circular buffer on the stack. ROTATE_W then renames the three
; W registers for the next step, and the new word is written back to slot
; (memW & 15).
%macro SHA1_STEP_16_79 10
%define %%regA %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regE %5
%define %%regT %6
%define %%regF %7
%define %%memW %8
%define %%immCNT %9
%define %%MAGIC %10
	vpaddd	%%regE, %%regE,%%immCNT

	vmovdqu	W14, [rsp + ((%%memW - 14) & 15) * 32]
	vpxor	W16, W16, W14
	vpxor	W16, W16, [rsp + ((%%memW -  8) & 15) * 32]
	vpxor	W16, W16, [rsp + ((%%memW -  3) & 15) * 32]

	; regF = rol(W16, 1) = the new message word
	vpsrld	%%regF, W16, (32-1)
	vpslld	W16, W16, 1
	vpor	%%regF, %%regF, W16
	ROTATE_W

	vmovdqu	[rsp + ((%%memW - 0) & 15) * 32],%%regF
	vpaddd	%%regE, %%regE,%%regF

	PROLD_nd	%%regT,5, %%regF, %%regA
	vpaddd	%%regE, %%regE,%%regT
	%%MAGIC	%%regF,%%regB,%%regC,%%regD,%%regT	;; FUN  = MAGIC_Fi(B,C,D)
	PROLD	%%regB,30, %%regT
	vpaddd	%%regE,%%regE,%%regF
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; FRAMESZ plus pushes must be an odd multiple of 8
;; NOTE(review): the routine below saves rsp and then forces 32-byte
;; alignment with "and rsp, ~0x1F", so the alignment of the scratch area does
;; not actually depend on this rule.
;; FRAMESZ = 16 message slots of 32 bytes; YMM_SAVE evaluates to 0.
%define YMM_SAVE (15-15)*32
%define FRAMESZ	32*16 + 0*8 + YMM_SAVE
%define _YMM  FRAMESZ - YMM_SAVE

%define VMOVPS	vmovups

; IDX       : running byte offset into all eight input buffers
; inp0-inp7 : per-lane input pointers
; RSP_SAVE  : holds the caller's rsp while the stack is realigned
%define IDX  rax
%define inp0 r9
%define inp1 r10
%define inp2 r11
%define inp3 r12
%define inp4 r13
%define inp5 r14
%define inp6 r15

%ifidn __OUTPUT_FORMAT__, elf64
 %define inp7 rcx
 %define arg1 rdi
 %define arg2 rsi
 %define RSP_SAVE rdx
%else
 %define inp7 rdi
 %define arg1 rcx
 %define arg2 rdx
 %define RSP_SAVE rsi
%endif


; Register map. The middle column is the name used while loading and
; transposing the input, the right column the name used during the rounds;
; both columns alias the same physical register.
; ymm0	A
; ymm1	B
; ymm2	C
; ymm3	D
; ymm4	E
; ymm5		F	AA
; ymm6		T0	BB
; ymm7		T1	CC
; ymm8		T2	DD
; ymm9		T3	EE
; ymm10		T4	TMP
; ymm11		T5	FUN
; ymm12		T6	K
; ymm13		T7	W14
; ymm14		T8	W15
; ymm15		T9	W16

%define A	ymm0
%define B	ymm1
%define C	ymm2
%define D	ymm3
%define E	ymm4

%define F	ymm5
%define T0	ymm6
%define T1	ymm7
%define T2	ymm8
%define T3	ymm9
%define T4	ymm10
%define T5	ymm11
%define T6	ymm12
%define T7	ymm13
%define T8	ymm14
%define T9	ymm15

%define AA	ymm5
%define BB	ymm6
%define CC	ymm7
%define DD	ymm8
%define EE	ymm9
%define TMP	ymm10
%define FUN	ymm11
%define K	ymm12
%define W14	ymm13
%define W15	ymm14
%define W16	ymm15


; Rename the five state symbols after a step (new A = old E, B = old A, ...)
; instead of moving data between registers. After a multiple of five
; invocations the names map back to the original registers.
%macro ROTATE_ARGS 0
%xdefine TMP_ E
%xdefine E D
%xdefine D C
%xdefine C B
%xdefine B A
%xdefine A TMP_
%endm

; Rename the three message-schedule symbols after a SHA1_STEP_16_79:
; W16 <- W15, W15 <- W14, W14 <- the register that was W16.
%macro ROTATE_W 0
%xdefine TMP_ W16
%xdefine W16 W15
%xdefine W15 W14
%xdefine W14 TMP_
%endm

%define DIGEST_SIZE	(8*5*4)	; 8 streams x 5 32bit words per digest x 4 bytes per word

align 32

; void sha1_mb_x8_avx2(SHA1_MB_ARGS_X8 *args, uint32_t size)
; arg 1 : arg1 : pointer to the args structure: digest row k (one dword per
;                lane) at [arg1 + k*32], k = 0..4, and the eight per-lane
;                input pointers at [arg1 + _data_ptr + n*8]
; arg 2 : arg2 : size (in blocks) ;; assumed to be >= 1
;                (the block loop is bottom-tested, 0 would wrap around)
; On return the digest rows are updated and every input pointer has been
; advanced by the number of bytes consumed.
mk_global sha1_mb_x8_avx2, function, internal
sha1_mb_x8_avx2:
	endbranch

	; RSP_SAVE is restored by the matching pop in the postamble
	push	RSP_SAVE

	; save rsp
	mov	RSP_SAVE, rsp
	sub	rsp, FRAMESZ	;; scratch area; alignment is established just below

	; align rsp to 32 Bytes
	and	rsp, ~0x1F

	;; Initialize digests
	vmovdqu	A, [arg1 + 0*32]
	vmovdqu	B, [arg1 + 1*32]
	vmovdqu	C, [arg1 + 2*32]
	vmovdqu	D, [arg1 + 3*32]
	vmovdqu	E, [arg1 + 4*32]

	;; transpose input onto stack
	mov	inp0,[arg1+_data_ptr+0*8]
	mov	inp1,[arg1+_data_ptr+1*8]
	mov	inp2,[arg1+_data_ptr+2*8]
	mov	inp3,[arg1+_data_ptr+3*8]
	mov	inp4,[arg1+_data_ptr+4*8]
	mov	inp5,[arg1+_data_ptr+5*8]
	mov	inp6,[arg1+_data_ptr+6*8]
	mov	inp7,[arg1+_data_ptr+7*8]

	xor	IDX, IDX
lloop:
	; F shares ymm5 with AA, which is overwritten further down, so the
	; byte-swap mask has to be reloaded for every block.
	vmovdqu	F, [PSHUFFLE_BYTE_FLIP_MASK]

	; Two passes of 32 bytes per lane = one 64-byte block per lane.
	; Each pass transposes 8 lanes x 8 dwords, byte-swaps every dword and
	; stores message words I*8 .. I*8+7 on the stack.
%assign I 0
%rep 2
	VMOVPS	T0,[inp0+IDX]
	VMOVPS	T1,[inp1+IDX]
	VMOVPS	T2,[inp2+IDX]
	VMOVPS	T3,[inp3+IDX]
	VMOVPS	T4,[inp4+IDX]
	VMOVPS	T5,[inp5+IDX]
	VMOVPS	T6,[inp6+IDX]
	VMOVPS	T7,[inp7+IDX]
	TRANSPOSE8	T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
	vpshufb	T0, T0, F
	vmovdqu	[rsp+(I*8+0)*32],T0
	vpshufb	T1, T1, F
	vmovdqu	[rsp+(I*8+1)*32],T1
	vpshufb	T2, T2, F
	vmovdqu	[rsp+(I*8+2)*32],T2
	vpshufb	T3, T3, F
	vmovdqu	[rsp+(I*8+3)*32],T3
	vpshufb	T4, T4, F
	vmovdqu	[rsp+(I*8+4)*32],T4
	vpshufb	T5, T5, F
	vmovdqu	[rsp+(I*8+5)*32],T5
	vpshufb	T6, T6, F
	vmovdqu	[rsp+(I*8+6)*32],T6
	vpshufb	T7, T7, F
	vmovdqu	[rsp+(I*8+7)*32],T7
	add	IDX, 32
%assign I (I+1)
%endrep


	; save old digests
	vmovdqu	AA, A
	vmovdqu	BB, B
	vmovdqu	CC, C
	vmovdqu	DD, D
	vmovdqu	EE, E

;;
;; perform 0-79 steps
;;
	vmovdqu	K, [K00_19]
;; do rounds 0...15
%assign I 0
%rep 16
	SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 16...19
;; prime W16/W15 with W[0]/W[1] for the first schedule extension
	vmovdqu	W16, [rsp + ((16 - 16) & 15) * 32]
	vmovdqu	W15, [rsp + ((16 - 15) & 15) * 32]
%rep 4
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 20...39
	vmovdqu	K, [K20_39]
%rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
	ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 40...59
	vmovdqu	K, [K40_59]
%rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
	ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 60...79
	vmovdqu	K, [K60_79]
%rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
	ROTATE_ARGS
%assign I (I+1)
%endrep

	; feed-forward: 80 renames later A..E name their original registers
	; again, so the saved state can be added directly
	vpaddd	A,A,AA
	vpaddd	B,B,BB
	vpaddd	C,C,CC
	vpaddd	D,D,DD
	vpaddd	E,E,EE

	sub	arg2, 1
	jne	lloop

	; write out digests
	vmovdqu	[arg1 + 0*32], A
	vmovdqu	[arg1 + 1*32], B
	vmovdqu	[arg1 + 2*32], C
	vmovdqu	[arg1 + 3*32], D
	vmovdqu	[arg1 + 4*32], E

	;; update input pointers
	;; (IDX holds the total number of bytes consumed per lane)
	add	inp0, IDX
	add	inp1, IDX
	add	inp2, IDX
	add	inp3, IDX
	add	inp4, IDX
	add	inp5, IDX
	add	inp6, IDX
	add	inp7, IDX
	mov	[arg1+_data_ptr+0*8], inp0
	mov	[arg1+_data_ptr+1*8], inp1
	mov	[arg1+_data_ptr+2*8], inp2
	mov	[arg1+_data_ptr+3*8], inp3
	mov	[arg1+_data_ptr+4*8], inp4
	mov	[arg1+_data_ptr+5*8], inp5
	mov	[arg1+_data_ptr+6*8], inp6
	mov	[arg1+_data_ptr+7*8], inp7

	;;;;;;;;;;;;;;;;
	;; Postamble

	; drop the realigned scratch area and restore the saved register
	mov	rsp, RSP_SAVE

	pop	RSP_SAVE
	ret



section .data align=32

align 32
; round constants, replicated into all eight 32-bit lanes; the label names
; the rounds each constant is used for
K00_19:			dq 0x5A8279995A827999, 0x5A8279995A827999
			dq 0x5A8279995A827999, 0x5A8279995A827999
K20_39:			dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
			dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59:			dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
			dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79:			dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
			dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6

; vpshufb control that reverses the byte order inside each 32-bit word
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
			 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c new file mode 100644 index 000000000..e778c5d98 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c @@ -0,0 +1,112 @@ 
+/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "sha1_mb.h" +#include "test.h" + +// Test messages +#define TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" +uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"; +uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO"; +uint8_t msg3[] = TST_STR TST_STR "0123456789:;<"; +uint8_t msg4[] = TST_STR TST_STR TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR"; +uint8_t msg5[] = TST_STR TST_STR TST_STR TST_STR TST_STR "0123456789:;<=>?"; +uint8_t msg6[] = + TST_STR TST_STR TST_STR TST_STR TST_STR TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU"; +uint8_t msg7[] = ""; + +// Expected digests +uint32_t dgst1[] = { 0x84983E44, 0x1C3BD26E, 0xBAAE4AA1, 0xF95129E5, 0xE54670F1 }; +uint32_t dgst2[] = { 0xB7C66452, 0x0FD122B3, 0x55D539F2, 0xA35E6FAA, 0xC2A5A11D }; +uint32_t dgst3[] = { 0x127729B6, 0xA8B2F8A0, 0xA4DDC819, 0x08E1D8B3, 0x67CEEA55 }; +uint32_t dgst4[] = { 0xFDDE2D00, 0xABD5B7A3, 0x699DE6F2, 0x3FF1D1AC, 0x3B872AC2 }; +uint32_t dgst5[] = { 0xE7FCA85C, 0xA4AB3740, 0x6A180B32, 0x0B8D362C, 0x622A96E6 }; +uint32_t dgst6[] = { 0x505B0686, 0xE1ACDF42, 0xB3588B5A, 0xB043D52C, 0x6D8C7444 }; +uint32_t dgst7[] = { 0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709 }; + +uint8_t *msgs[] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 }; +uint32_t *expected_digest[] = { dgst1, dgst2, dgst3, dgst4, dgst5, dgst6, dgst7 }; + +int check_job(uint32_t * ref, uint32_t * good, int words) +{ + int i; + for (i = 0; i < words; i++) + if (good[i] != ref[i]) + return 1; + + return 0; +} + +#define MAX_MSGS 7 + +int main(void) +{ + SHA1_HASH_CTX_MGR *mgr = NULL; + SHA1_HASH_CTX ctxpool[MAX_MSGS]; + SHA1_HASH_CTX *p_job; + int i, checked = 0, failed = 0; + int n = sizeof(msgs) / sizeof(msgs[0]); + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) 
{ + printf("posix_memalign failed test aborted\n"); + return 1; + } + // Initialize multi-buffer manager + sha1_ctx_mgr_init(mgr); + + for (i = 0; i < n; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)expected_digest[i]; + + p_job = sha1_ctx_mgr_submit(mgr, &ctxpool[i], msgs[i], + strlen((char *)msgs[i]), HASH_ENTIRE); + + if (p_job) { // If we have finished a job, process it + checked++; + failed += + check_job(p_job->job.result_digest, p_job->user_data, + SHA1_DIGEST_NWORDS); + } + } + + // Finish remaining jobs + while (NULL != (p_job = sha1_ctx_mgr_flush(mgr))) { + checked++; + failed += + check_job(p_job->job.result_digest, p_job->user_data, SHA1_DIGEST_NWORDS); + } + + printf("Example multi-buffer sha1 completed=%d, failed=%d\n", checked, failed); + return failed; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm new file mode 100644 index 000000000..c205f2389 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm @@ -0,0 +1,131 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Run-time dispatch stubs for the three public SHA1 multi-buffer entry
; points (sha1_ctx_mgr_init / _submit / _flush). Each public symbol is bound
; to one of the architecture-specific implementations declared below; which
; candidates are offered depends on what the assembler supports
; (HAVE_AS_KNOWS_AVX512, HAVE_AS_KNOWS_SHANI).

; WRT_OPT: symbol-reference suffix for ELF builds, empty otherwise.
; NOTE(review): not referenced in this file; presumably consumed by the
; macros in multibinary.asm -- confirm there.
%ifidn __OUTPUT_FORMAT__, elf64
%define WRT_OPT         wrt ..plt
%else
%define WRT_OPT
%endif

%include "reg_sizes.asm"
%include "multibinary.asm"
default rel
[bits 64]

; declare the L3 ctx level symbols (these will then call the appropriate
; L2 symbols)
extern sha1_ctx_mgr_init_sse
extern sha1_ctx_mgr_submit_sse
extern sha1_ctx_mgr_flush_sse

extern sha1_ctx_mgr_init_avx
extern sha1_ctx_mgr_submit_avx
extern sha1_ctx_mgr_flush_avx

extern sha1_ctx_mgr_init_avx2
extern sha1_ctx_mgr_submit_avx2
extern sha1_ctx_mgr_flush_avx2

extern sha1_ctx_mgr_init_base
extern sha1_ctx_mgr_submit_base
extern sha1_ctx_mgr_flush_base

; AVX-512 variants exist only when the assembler can encode AVX-512
%ifdef HAVE_AS_KNOWS_AVX512
 extern sha1_ctx_mgr_init_avx512
 extern sha1_ctx_mgr_submit_avx512
 extern sha1_ctx_mgr_flush_avx512
%endif

; SHA-NI variants exist only when the assembler can encode the SHA extensions
%ifdef HAVE_AS_KNOWS_SHANI
 extern sha1_ctx_mgr_init_sse_ni
 extern sha1_ctx_mgr_submit_sse_ni
 extern sha1_ctx_mgr_flush_sse_ni
%endif

; the combined AVX-512 + SHA-NI variants need both capabilities
%ifdef HAVE_AS_KNOWS_AVX512
 %ifdef HAVE_AS_KNOWS_SHANI
  extern sha1_ctx_mgr_init_avx512_ni
  extern sha1_ctx_mgr_submit_avx512_ni
  extern sha1_ctx_mgr_flush_avx512_ni
 %endif
%endif

;;; *_mbinit are the initial values of *_dispatched; the latter is updated on
;;; the first call. Therefore, *_dispatch_init is only executed on first call.

; Initialise symbols
mbin_interface sha1_ctx_mgr_init
mbin_interface sha1_ctx_mgr_submit
mbin_interface sha1_ctx_mgr_flush

; Pick the dispatch macro matching the assembler capabilities. In every
; invocation the first argument is the public symbol and the remaining
; arguments are the candidate implementations; the selection rules live in
; the mbin_dispatch_* macros (multibinary.asm, not shown here).
%ifdef HAVE_AS_KNOWS_AVX512
 ; Reuse mbin_dispatch_init6's extension through replacing base by sse version
 %ifdef HAVE_AS_KNOWS_SHANI
  ; candidates: base, sse, avx, avx2, avx512, sse_ni, avx512_ni
  mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_init, sha1_ctx_mgr_init_base, \
	sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, \
	sha1_ctx_mgr_init_avx512, sha1_ctx_mgr_init_sse_ni, sha1_ctx_mgr_init_avx512_ni
  mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_base, \
	sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, \
	sha1_ctx_mgr_submit_avx512, sha1_ctx_mgr_submit_sse_ni, sha1_ctx_mgr_submit_avx512_ni
  mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_base, \
	sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, \
	sha1_ctx_mgr_flush_avx512, sha1_ctx_mgr_flush_sse_ni, sha1_ctx_mgr_flush_avx512_ni
 %else
  ; candidates: base, sse, avx, avx2, avx512
  mbin_dispatch_init6 sha1_ctx_mgr_init, sha1_ctx_mgr_init_base, \
	sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, \
	sha1_ctx_mgr_init_avx512
  mbin_dispatch_init6 sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_base, \
	sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, \
	sha1_ctx_mgr_submit_avx512
  mbin_dispatch_init6 sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_base, \
	sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, \
	sha1_ctx_mgr_flush_avx512
 %endif
%else
 %ifdef HAVE_AS_KNOWS_SHANI
  ; candidates: sse, avx, avx2, sse_ni (no base variant in this configuration)
  mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_init, sha1_ctx_mgr_init_sse, \
	sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, sha1_ctx_mgr_init_sse_ni
  mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_sse, \
	sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, sha1_ctx_mgr_submit_sse_ni
  mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_sse, \
	sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, sha1_ctx_mgr_flush_sse_ni
 %else
  ; candidates: sse, avx, avx2
  mbin_dispatch_init sha1_ctx_mgr_init, sha1_ctx_mgr_init_sse, \
	sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2
  mbin_dispatch_init sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_sse, \
	sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2
  mbin_dispatch_init sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_sse, \
	sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2
 %endif
%endif

;;;       func				core, ver, snum
slversion sha1_ctx_mgr_init,		00,   04,  0148
slversion sha1_ctx_mgr_submit,		00,   04,  0149
slversion sha1_ctx_mgr_flush,		00,   04,  0150
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm new file mode 100644 index 000000000..86d09e303 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm @@ -0,0 +1,318 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "sha1_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

; The routine is only assembled when the assembler understands the SHA
; extensions; see the %else branch at the end of the file.
%ifdef HAVE_AS_KNOWS_SHANI

[bits 64]
default rel
section .text

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifidn __OUTPUT_FORMAT__, elf64
 ; Linux
 %define arg0  rdi
 %define arg1  rsi
%else
 ; Windows
 %define arg0   rcx
 %define arg1   rdx
%endif

;; FRAMESZ plus pushes must be an odd multiple of 8
;; NOTE(review): the routine keeps the caller's rsp in RSPSAVE and aligns rsp
;; itself ("and rsp, ~0xF"), so alignment does not depend on this rule.
;; The frame holds two 16-byte slots: the saved E and the saved ABCD.
%define FRAMESZ	32	; space for ABCDE
%define RSPSAVE	rax

%define ABCD		xmm0
; two E registers, used alternately ("ping-pong") from one group of four
; rounds to the next
%define E0		xmm1
%define E1		xmm2
%define MSG0		xmm3
%define MSG1		xmm4
%define MSG2		xmm5
%define MSG3		xmm6
%define SHUF_MASK	xmm7

; argument indices start from 0 here, while mgr_flush/submit count from 1
%define MGR	arg0
%define NBLK	arg1
%define NLANX4	r10	; consistent with caller
%define IDX	r8	; local variable -- consistent with caller
%define DPTR	r11	; local variable -- input buffer pointer
%define TMP	r9	; local variable -- assistant to address digest
;%define TMP2	r8	; local variable -- assistant to address digest
align 32

; void sha1_ni_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
; arg 0 : MGR : pointer to args; only the lane selected by IDX is read and
;               updated
; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 (the routine returns
;               without touching the state if it is 0)
; invisible arg 2 : IDX : hash on which lane
; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
;		(sse/avx is 4, avx2 is 8, avx512 is 16)
;
; The two invisible arguments arrive packed in r10: the low 8 bits carry the
; lane index and the remaining bits (r10 >> 8) the row stride. Digest word k
; of the lane lives at [MGR + 4*IDX + k*NLANX4]; the code uses NLANX4 as a
; byte distance between consecutive digest words.
;
; Clobbers registers: rax, r9~r11, xmm0-xmm7
; The code below also writes r8 (IDX is unpacked into it), r10 (shifted in
; place) and arg1 (NBLK is turned into an end-of-data pointer).
;
mk_global sha1_ni_x1, function, internal
sha1_ni_x1:
	endbranch
	mov	RSPSAVE, rsp
	sub     rsp, FRAMESZ
	and	rsp, ~0xF	; Align 16Bytes downward

	shl	NBLK, 6 	; transform blk amount into bytes
	jz	backto_mgr	; nothing to hash

	; detach idx from nlanx4
	mov	IDX, NLANX4
	shr	NLANX4, 8
	and	IDX, 0xff

	lea	TMP, [MGR + 4*IDX]
	;; Initialize digest
	;; A..D are gathered into ABCD with A in the highest dword; E goes into
	;; the highest dword of E0
	pinsrd	ABCD, [TMP + 0*NLANX4], 3
	pinsrd	ABCD, [TMP + 1*NLANX4], 2
	pinsrd	ABCD, [TMP + 2*NLANX4], 1
	lea	TMP, [TMP + 2*NLANX4]	; MGR + 4*IDX + 2*NLANX4
	pinsrd	ABCD, [TMP + 1*NLANX4], 0
	pinsrd	E0, [TMP + 2*NLANX4], 3
	; pinsrd only wrote dword 3 of E0; clear the other three dwords
	pand	E0, [IDX3_WORD_MASK]

	movdqa	SHUF_MASK, [PSHUFFLE_SHANI_MASK]

	;; Load input pointers
	mov	DPTR, [MGR + _data_ptr + IDX*8]
	;; nblk is used to indicate data end
	add	NBLK, DPTR

lloop:
	; Save hash values for addition after rounds
	movdqa	[rsp + 0*16], E0
	movdqa	[rsp + 1*16], ABCD

	; Each group below performs four rounds with sha1rnds4. Its immediate
	; (0..3) changes every 20 rounds, in step with the round comments.
	; sha1nexte prepares the E input of the next group, sha1msg1/pxor/
	; sha1msg2 extend the message schedule four words at a time.

	; do rounds 0-3
	movdqu	MSG0, [DPTR + 0*16]
	pshufb	MSG0, SHUF_MASK
	paddd	E0, MSG0
	movdqa	E1, ABCD
	sha1rnds4	ABCD, E0, 0

	; do rounds 4-7
	movdqu	MSG1, [DPTR + 1*16]
	pshufb	MSG1, SHUF_MASK
	sha1nexte	E1, MSG1
	movdqa	E0, ABCD
	sha1rnds4	ABCD, E1, 0
	sha1msg1	MSG0, MSG1

	; do rounds 8-11
	movdqu	MSG2, [DPTR + 2*16]
	pshufb	MSG2, SHUF_MASK
	sha1nexte	E0, MSG2
	movdqa	E1, ABCD
	sha1rnds4	ABCD, E0, 0
	sha1msg1	MSG1, MSG2
	pxor	MSG0, MSG2

	; do rounds 12-15
	movdqu	MSG3, [DPTR + 3*16]
	pshufb	MSG3, SHUF_MASK
	sha1nexte	E1, MSG3
	movdqa	E0, ABCD
	sha1msg2	MSG0, MSG3
	sha1rnds4	ABCD, E1, 0
	sha1msg1	MSG2, MSG3
	pxor	MSG1, MSG3

	; do rounds 16-19
	sha1nexte	E0, MSG0
	movdqa	E1, ABCD
	sha1msg2	MSG1, MSG0
	sha1rnds4	ABCD, E0, 0
	sha1msg1	MSG3, MSG0
	pxor	MSG2, MSG0

	; do rounds 20-23
	sha1nexte	E1, MSG1
	movdqa	E0, ABCD
	sha1msg2	MSG2, MSG1
	sha1rnds4	ABCD, E1, 1
	sha1msg1	MSG0, MSG1
	pxor	MSG3, MSG1

	; do rounds 24-27
	sha1nexte	E0, MSG2
	movdqa	E1, ABCD
	sha1msg2	MSG3, MSG2
	sha1rnds4	ABCD, E0, 1
	sha1msg1	MSG1, MSG2
	pxor	MSG0, MSG2

	; do rounds 28-31
	sha1nexte	E1, MSG3
	movdqa	E0, ABCD
	sha1msg2	MSG0, MSG3
	sha1rnds4	ABCD, E1, 1
	sha1msg1	MSG2, MSG3
	pxor	MSG1, MSG3

	; do rounds 32-35
	sha1nexte	E0, MSG0
	movdqa	E1, ABCD
	sha1msg2	MSG1, MSG0
	sha1rnds4	ABCD, E0, 1
	sha1msg1	MSG3, MSG0
	pxor	MSG2, MSG0

	; do rounds 36-39
	sha1nexte	E1, MSG1
	movdqa	E0, ABCD
	sha1msg2	MSG2, MSG1
	sha1rnds4	ABCD, E1, 1
	sha1msg1	MSG0, MSG1
	pxor	MSG3, MSG1

	; do rounds 40-43
	sha1nexte	E0, MSG2
	movdqa	E1, ABCD
	sha1msg2	MSG3, MSG2
	sha1rnds4	ABCD, E0, 2
	sha1msg1	MSG1, MSG2
	pxor	MSG0, MSG2

	; do rounds 44-47
	sha1nexte	E1, MSG3
	movdqa	E0, ABCD
	sha1msg2	MSG0, MSG3
	sha1rnds4	ABCD, E1, 2
	sha1msg1	MSG2, MSG3
	pxor	MSG1, MSG3

	; do rounds 48-51
	sha1nexte	E0, MSG0
	movdqa	E1, ABCD
	sha1msg2	MSG1, MSG0
	sha1rnds4	ABCD, E0, 2
	sha1msg1	MSG3, MSG0
	pxor	MSG2, MSG0

	; do rounds 52-55
	sha1nexte	E1, MSG1
	movdqa	E0, ABCD
	sha1msg2	MSG2, MSG1
	sha1rnds4	ABCD, E1, 2
	sha1msg1	MSG0, MSG1
	pxor	MSG3, MSG1

	; do rounds 56-59
	sha1nexte	E0, MSG2
	movdqa	E1, ABCD
	sha1msg2	MSG3, MSG2
	sha1rnds4	ABCD, E0, 2
	sha1msg1	MSG1, MSG2
	pxor	MSG0, MSG2

	; do rounds 60-63
	sha1nexte	E1, MSG3
	movdqa	E0, ABCD
	sha1msg2	MSG0, MSG3
	sha1rnds4	ABCD, E1, 3
	sha1msg1	MSG2, MSG3
	pxor	MSG1, MSG3

	; do rounds 64-67
	sha1nexte	E0, MSG0
	movdqa	E1, ABCD
	sha1msg2	MSG1, MSG0
	sha1rnds4	ABCD, E0, 3
	sha1msg1	MSG3, MSG0
	pxor	MSG2, MSG0

	; do rounds 68-71
	; (no further sha1msg1 from here on: no later message words are needed)
	sha1nexte	E1, MSG1
	movdqa	E0, ABCD
	sha1msg2	MSG2, MSG1
	sha1rnds4	ABCD, E1, 3
	pxor	MSG3, MSG1

	; do rounds 72-75
	sha1nexte	E0, MSG2
	movdqa	E1, ABCD
	sha1msg2	MSG3, MSG2
	sha1rnds4	ABCD, E0, 3

	; do rounds 76-79
	sha1nexte	E1, MSG3
	movdqa	E0, ABCD
	sha1rnds4	ABCD, E1, 3

	; Add current hash values with previously saved
	sha1nexte	E0, [rsp + 0*16]
	paddd	ABCD, [rsp + 1*16]

	; Increment data pointer and loop if more to process
	add	DPTR, 64
	cmp	DPTR, NBLK
	jne	lloop

	; write out digests
	; (same addressing and dword order as the load above)
	lea	TMP, [MGR + 4*IDX]
	pextrd	[TMP + 0*NLANX4], ABCD, 3
	pextrd	[TMP + 1*NLANX4], ABCD, 2
	pextrd	[TMP + 2*NLANX4], ABCD, 1
	lea	TMP, [TMP + 2*NLANX4]	; MGR + 4*IDX + 2*NLANX4
	pextrd	[TMP + 1*NLANX4], ABCD, 0
	pextrd	[TMP + 2*NLANX4], E0, 3

	; update input pointers
	mov	[MGR + _data_ptr + IDX*8], DPTR

backto_mgr:
	;;;;;;;;;;;;;;;;
	;; Postamble

	mov	rsp, RSPSAVE

	ret


section .data align=16
; pshufb control that reverses all 16 bytes of a message chunk
PSHUFFLE_SHANI_MASK:	dq 0x08090a0b0c0d0e0f, 0x0001020304050607
; keeps only the highest dword (index 3) of an xmm register
IDX3_WORD_MASK:		dq 0x0000000000000000, 0xFFFFFFFF00000000

%else
; Assembler without SHA-NI support: on win64 only a placeholder symbol is
; emitted.
; NOTE(review): presumably so that the object file is not empty -- confirm.
%ifidn __OUTPUT_FORMAT__, win64
global no_sha1_ni_x1
no_sha1_ni_x1:
%endif
%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm new file mode 100644 index 000000000..7b0ddb74e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm @@ -0,0 +1,484 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; sha1_ni_x2: SHA-1 block function that hashes two lanes at once using the
; SHA-NI instructions (sha1rnds4 / sha1nexte / sha1msg1 / sha1msg2).
; The two lanes are processed with two independent register sets
; (plain names and names with a "b" suffix) whose instruction streams are
; interleaved round group by round group.

%include "sha1_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

%ifdef HAVE_AS_KNOWS_SHANI

[bits 64]
default rel
section .text

%ifidn __OUTPUT_FORMAT__, elf64
	; Linux
	%define arg0	rdi
	%define arg1	rsi
%else
	; Windows
	%define arg0	rcx
	%define arg1	rdx
%endif

;; Frame holds four 16-byte save slots (E0, ABCD, E0b, ABCDb), see lloop.
;; rsp is explicitly re-aligned to 16 bytes after the frame is reserved,
;; and the original rsp is kept in RSPSAVE for the epilogue.
%define FRAMESZ	64	; space for ABCDE
%define RSPSAVE	rax

; First lane working set
%define ABCD	xmm0
; two E registers because they ping-pong between round groups
%define E0	xmm1
%define E1	xmm2
%define MSG0	xmm3
%define MSG1	xmm4
%define MSG2	xmm5
%define MSG3	xmm6

; Second lane working set
%define ABCDb	xmm7
%define E0b	xmm8	; two E registers because they ping-pong
%define E1b	xmm9
%define MSG0b	xmm10
%define MSG1b	xmm11
%define MSG2b	xmm12
%define MSG3b	xmm13

%define SHUF_MASK	xmm14

; argument indices here start from 0, while mgr_flush/submit count from 1
%define MGR	arg0

%define NBLK	arg1
%define NLANX4	r10	; consistent with caller
%define IDX	r8	; extracted from r10 below; not referenced again in this routine
%define DPTR	r11	; local variable -- input buffer pointer (first lane)
%define DPTRb	r12	; local variable -- input buffer pointer (second lane)
%define TMP	r9	; local variable -- digest address helper (first lane)
%define TMPb	r13	; local variable -- digest address helper (second lane)
align 32

; void sha1_ni_x2(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
; arg 0 : MGR : pointer to args
; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1; 0 returns immediately
; invisible arg (r10) : low 8 bits are copied to IDX, the remaining bits
;                       (r10 >> 8) become NLANX4
; NLANX4 : used as the byte stride between consecutive digest words of a lane
;          (max lanes*4 for this arch: sse/avx is 4, avx2 is 8, avx512 is 16)
;
; This routine always works on the digest words at _args_digest + 0 and
; _args_digest + 4*1, and on the data pointers at _data_ptr + 0 and
; _data_ptr + 8*1. Both lanes consume the same number of blocks; only DPTR
; is compared against the end address.
;
; Clobbers registers: rax, r8~r13, xmm0-xmm14 (NBLK is also overwritten)
;
mk_global sha1_ni_x2, function, internal
sha1_ni_x2:
	endbranch
	mov	RSPSAVE, rsp
	sub	rsp, FRAMESZ
	and	rsp, ~0xF	; Align 16Bytes downward

	shl	NBLK, 6		; transform blk amount into bytes
	jz	backto_mgr	; nothing to hash

	; detach idx from nlanx4
	mov	IDX, NLANX4
	shr	NLANX4, 8
	and	IDX, 0xff

	lea	TMP, [MGR + _args_digest ];
	lea	TMPb,[MGR + _args_digest + 4*1];

	;; Initialize digest
	;; Digest words are NLANX4 bytes apart; words 0..3 go to dwords 3..0 of ABCD
	pinsrd	ABCD, [TMP + 0*NLANX4], 3
	pinsrd	ABCD, [TMP + 1*NLANX4], 2
	pinsrd	ABCD, [TMP + 2*NLANX4], 1
	lea	TMP, [TMP + 2*NLANX4]	; MGR + _args_digest + 2*NLANX4
	pinsrd	ABCD, [TMP + 1*NLANX4], 0
	pinsrd	E0, [TMP + 2*NLANX4], 3
	pand	E0, [IDX3_WORD_MASK]	; keep only the top dword (word 4 of the digest)

	pinsrd	ABCDb, [TMPb + 0*NLANX4], 3
	pinsrd	ABCDb, [TMPb + 1*NLANX4], 2
	pinsrd	ABCDb, [TMPb + 2*NLANX4], 1
	lea	TMPb, [TMPb + 2*NLANX4]	; MGR + _args_digest + 4*1 + 2*NLANX4
	pinsrd	ABCDb, [TMPb + 1*NLANX4], 0
	pinsrd	E0b, [TMPb + 2*NLANX4], 3
	pand	E0b, [IDX3_WORD_MASK]

	movdqa	SHUF_MASK, [PSHUFFLE_SHANI_MASK]

	;; Load input pointers
	mov	DPTR, [MGR + _data_ptr ]
	mov	DPTRb,[MGR + _data_ptr + 8*1]
	;; nblk is used to indicate data end (end address of the first lane's input)
	add	NBLK, DPTR

lloop:
	; save the incoming state of both lanes; added back after round 79
	movdqa	[rsp + 0*16], E0
	movdqa	[rsp + 1*16], ABCD

	movdqa	[rsp + 2*16], E0b
	movdqa	[rsp + 3*16], ABCDb

	; NOTE(review): the sha1rnds4 immediate (0..3) presumably selects the
	; round function/constant for rounds 0-19 / 20-39 / 40-59 / 60-79 --
	; confirm against the Intel SHA extensions reference.

	; do rounds 0-3
	movdqu	MSG0, [DPTR + 0*16]
	pshufb	MSG0, SHUF_MASK		; reverse the 16 bytes of the loaded block chunk
	paddd	E0, MSG0
	movdqa	E1, ABCD
	sha1rnds4	ABCD, E0, 0

	movdqu	MSG0b, [DPTRb + 0*16]
	pshufb	MSG0b, SHUF_MASK
	paddd	E0b, MSG0b
	movdqa	E1b, ABCDb
	sha1rnds4	ABCDb, E0b, 0

	; do rounds 4-7
	movdqu	MSG1, [DPTR + 1*16]
	pshufb	MSG1, SHUF_MASK
	sha1nexte	E1, MSG1
	movdqa	E0, ABCD
	sha1rnds4	ABCD, E1, 0
	sha1msg1	MSG0, MSG1

	movdqu	MSG1b, [DPTRb + 1*16]
	pshufb	MSG1b, SHUF_MASK
	sha1nexte	E1b, MSG1b
	movdqa	E0b, ABCDb
	sha1rnds4	ABCDb, E1b, 0
	sha1msg1	MSG0b, MSG1b

	; do rounds 8-11
	movdqu	MSG2, [DPTR + 2*16]
	pshufb	MSG2, SHUF_MASK
	sha1nexte	E0, MSG2
	movdqa	E1, ABCD
	sha1rnds4	ABCD, E0, 0
	sha1msg1	MSG1, MSG2
	pxor	MSG0, MSG2

	movdqu	MSG2b, [DPTRb + 2*16]
	pshufb	MSG2b, SHUF_MASK
	sha1nexte	E0b, MSG2b
	movdqa	E1b, ABCDb
	sha1rnds4	ABCDb, E0b, 0
	sha1msg1	MSG1b, MSG2b
	pxor	MSG0b, MSG2b

	; do rounds 12-15
	movdqu	MSG3, [DPTR + 3*16]
	pshufb	MSG3, SHUF_MASK
	sha1nexte	E1, MSG3
	movdqa	E0, ABCD
	sha1msg2	MSG0, MSG3
	sha1rnds4	ABCD, E1, 0
	sha1msg1	MSG2, MSG3
	pxor	MSG1, MSG3

	movdqu	MSG3b, [DPTRb + 3*16]
	pshufb	MSG3b, SHUF_MASK
	sha1nexte	E1b, MSG3b
	movdqa	E0b, ABCDb
	sha1msg2	MSG0b, MSG3b
	sha1rnds4	ABCDb, E1b, 0
	sha1msg1	MSG2b, MSG3b
	pxor	MSG1b, MSG3b

	; do rounds 16-19
	sha1nexte	E0, MSG0
	movdqa	E1, ABCD
	sha1msg2	MSG1, MSG0
	sha1rnds4	ABCD, E0, 0
	sha1msg1	MSG3, MSG0
	pxor	MSG2, MSG0

	sha1nexte	E0b, MSG0b
	movdqa	E1b, ABCDb
	sha1msg2	MSG1b, MSG0b
	sha1rnds4	ABCDb, E0b, 0
	sha1msg1	MSG3b, MSG0b
	pxor	MSG2b, MSG0b

	; do rounds 20-23
	sha1nexte	E1, MSG1
	movdqa	E0, ABCD
	sha1msg2	MSG2, MSG1
	sha1rnds4	ABCD, E1, 1
	sha1msg1	MSG0, MSG1
	pxor	MSG3, MSG1

	sha1nexte	E1b, MSG1b
	movdqa	E0b, ABCDb
	sha1msg2	MSG2b, MSG1b
	sha1rnds4	ABCDb, E1b, 1
	sha1msg1	MSG0b, MSG1b
	pxor	MSG3b, MSG1b

	; do rounds 24-27
	sha1nexte	E0, MSG2
	movdqa	E1, ABCD
	sha1msg2	MSG3, MSG2
	sha1rnds4	ABCD, E0, 1
	sha1msg1	MSG1, MSG2
	pxor	MSG0, MSG2

	sha1nexte	E0b, MSG2b
	movdqa	E1b, ABCDb
	sha1msg2	MSG3b, MSG2b
	sha1rnds4	ABCDb, E0b, 1
	sha1msg1	MSG1b, MSG2b
	pxor	MSG0b, MSG2b

	; do rounds 28-31
	sha1nexte	E1, MSG3
	movdqa	E0, ABCD
	sha1msg2	MSG0, MSG3
	sha1rnds4	ABCD, E1, 1
	sha1msg1	MSG2, MSG3
	pxor	MSG1, MSG3

	sha1nexte	E1b, MSG3b
	movdqa	E0b, ABCDb
	sha1msg2	MSG0b, MSG3b
	sha1rnds4	ABCDb, E1b, 1
	sha1msg1	MSG2b, MSG3b
	pxor	MSG1b, MSG3b

	; do rounds 32-35
	sha1nexte	E0, MSG0
	movdqa	E1, ABCD
	sha1msg2	MSG1, MSG0
	sha1rnds4	ABCD, E0, 1
	sha1msg1	MSG3, MSG0
	pxor	MSG2, MSG0

	sha1nexte	E0b, MSG0b
	movdqa	E1b, ABCDb
	sha1msg2	MSG1b, MSG0b
	sha1rnds4	ABCDb, E0b, 1
	sha1msg1	MSG3b, MSG0b
	pxor	MSG2b, MSG0b

	; do rounds 36-39
	sha1nexte	E1, MSG1
	movdqa	E0, ABCD
	sha1msg2	MSG2, MSG1
	sha1rnds4	ABCD, E1, 1
	sha1msg1	MSG0, MSG1
	pxor	MSG3, MSG1

	sha1nexte	E1b, MSG1b
	movdqa	E0b, ABCDb
	sha1msg2	MSG2b, MSG1b
	sha1rnds4	ABCDb, E1b, 1
	sha1msg1	MSG0b, MSG1b
	pxor	MSG3b, MSG1b

	; do rounds 40-43
	sha1nexte	E0, MSG2
	movdqa	E1, ABCD
	sha1msg2	MSG3, MSG2
	sha1rnds4	ABCD, E0, 2
	sha1msg1	MSG1, MSG2
	pxor	MSG0, MSG2

	sha1nexte	E0b, MSG2b
	movdqa	E1b, ABCDb
	sha1msg2	MSG3b, MSG2b
	sha1rnds4	ABCDb, E0b, 2
	sha1msg1	MSG1b, MSG2b
	pxor	MSG0b, MSG2b

	; do rounds 44-47
	sha1nexte	E1, MSG3
	movdqa	E0, ABCD
	sha1msg2	MSG0, MSG3
	sha1rnds4	ABCD, E1, 2
	sha1msg1	MSG2, MSG3
	pxor	MSG1, MSG3

	sha1nexte	E1b, MSG3b
	movdqa	E0b, ABCDb
	sha1msg2	MSG0b, MSG3b
	sha1rnds4	ABCDb, E1b, 2
	sha1msg1	MSG2b, MSG3b
	pxor	MSG1b, MSG3b

	; do rounds 48-51
	sha1nexte	E0, MSG0
	movdqa	E1, ABCD
	sha1msg2	MSG1, MSG0
	sha1rnds4	ABCD, E0, 2
	sha1msg1	MSG3, MSG0
	pxor	MSG2, MSG0
	sha1nexte	E0b, MSG0b
	movdqa	E1b, ABCDb
	sha1msg2	MSG1b, MSG0b
	sha1rnds4	ABCDb, E0b, 2
	sha1msg1	MSG3b, MSG0b
	pxor	MSG2b, MSG0b

	; do rounds 52-55
	sha1nexte	E1, MSG1
	movdqa	E0, ABCD
	sha1msg2	MSG2, MSG1
	sha1rnds4	ABCD, E1, 2
	sha1msg1	MSG0, MSG1
	pxor	MSG3, MSG1
	sha1nexte	E1b, MSG1b
	movdqa	E0b, ABCDb
	sha1msg2	MSG2b, MSG1b
	sha1rnds4	ABCDb, E1b, 2
	sha1msg1	MSG0b, MSG1b
	pxor	MSG3b, MSG1b

	; do rounds 56-59
	sha1nexte	E0, MSG2
	movdqa	E1, ABCD
	sha1msg2	MSG3, MSG2
	sha1rnds4	ABCD, E0, 2
	sha1msg1	MSG1, MSG2
	pxor	MSG0, MSG2

	sha1nexte	E0b, MSG2b
	movdqa	E1b, ABCDb
	sha1msg2	MSG3b, MSG2b
	sha1rnds4	ABCDb, E0b, 2
	sha1msg1	MSG1b, MSG2b
	pxor	MSG0b, MSG2b

	; do rounds 60-63
	sha1nexte	E1, MSG3
	movdqa	E0, ABCD
	sha1msg2	MSG0, MSG3
	sha1rnds4	ABCD, E1, 3
	sha1msg1	MSG2, MSG3
	pxor	MSG1, MSG3

	sha1nexte	E1b, MSG3b
	movdqa	E0b, ABCDb
	sha1msg2	MSG0b, MSG3b
	sha1rnds4	ABCDb, E1b, 3
	sha1msg1	MSG2b, MSG3b
	pxor	MSG1b, MSG3b

	; do rounds 64-67
	sha1nexte	E0, MSG0
	movdqa	E1, ABCD
	sha1msg2	MSG1, MSG0
	sha1rnds4	ABCD, E0, 3
	sha1msg1	MSG3, MSG0
	pxor	MSG2, MSG0

	sha1nexte	E0b, MSG0b
	movdqa	E1b, ABCDb
	sha1msg2	MSG1b, MSG0b
	sha1rnds4	ABCDb, E0b, 3
	sha1msg1	MSG3b, MSG0b
	pxor	MSG2b, MSG0b

	; do rounds 68-71
	sha1nexte	E1, MSG1
	movdqa	E0, ABCD
	sha1msg2	MSG2, MSG1
	sha1rnds4	ABCD, E1, 3
	pxor	MSG3, MSG1

	sha1nexte	E1b, MSG1b
	movdqa	E0b, ABCDb
	sha1msg2	MSG2b, MSG1b
	sha1rnds4	ABCDb, E1b, 3
	pxor	MSG3b, MSG1b

	; do rounds 72-75
	sha1nexte	E0, MSG2
	movdqa	E1, ABCD
	sha1msg2	MSG3, MSG2
	sha1rnds4	ABCD, E0, 3

	sha1nexte	E0b, MSG2b
	movdqa	E1b, ABCDb
	sha1msg2	MSG3b, MSG2b
	sha1rnds4	ABCDb, E0b, 3

	; do rounds 76-79
	sha1nexte	E1, MSG3
	movdqa	E0, ABCD
	sha1rnds4	ABCD, E1, 3

	sha1nexte	E1b, MSG3b
	movdqa	E0b, ABCDb
	sha1rnds4	ABCDb, E1b, 3

	; Add current hash values with previously saved
	sha1nexte	E0, [rsp + 0*16]
	paddd	ABCD, [rsp + 1*16]

	sha1nexte	E0b, [rsp + 2*16]
	paddd	ABCDb, [rsp + 3*16]

	; Increment data pointers and loop if more to process
	; (only the first lane's pointer is checked against the end address)
	add	DPTR, 64
	add	DPTRb, 64
	cmp	DPTR, NBLK
	jne	lloop

	; write out digests (same word layout as the load above)
	lea	TMP, [MGR + _args_digest]
	pextrd	[TMP + 0*NLANX4], ABCD, 3
	pextrd	[TMP + 1*NLANX4], ABCD, 2
	pextrd	[TMP + 2*NLANX4], ABCD, 1
	lea	TMP, [TMP + 2*NLANX4]	; MGR + _args_digest + 2*NLANX4
	pextrd	[TMP + 1*NLANX4], ABCD, 0
	pextrd	[TMP + 2*NLANX4], E0, 3

	lea	TMPb, [MGR +_args_digest + 4*1]
	pextrd	[TMPb + 0*NLANX4], ABCDb, 3
	pextrd	[TMPb + 1*NLANX4], ABCDb, 2
	pextrd	[TMPb + 2*NLANX4], ABCDb, 1
	lea	TMPb, [TMPb + 2*NLANX4]	; MGR + _args_digest + 4*1 + 2*NLANX4
	pextrd	[TMPb + 1*NLANX4], ABCDb, 0
	pextrd	[TMPb + 2*NLANX4], E0b, 3

	; update input pointers
	mov	[MGR + _data_ptr], DPTR
	mov	[MGR + _data_ptr + 8*1], DPTRb

backto_mgr:
;;;;;;;;;;;;;;;;
;; Postamble

	mov	rsp, RSPSAVE	; undo the frame reservation and alignment

	ret

section .data align=16
; pshufb control that reverses all 16 bytes of a vector
PSHUFFLE_SHANI_MASK:	dq 0x08090a0b0c0d0e0f, 0x0001020304050607
; keeps only the most significant dword of a vector
IDX3_WORD_MASK:		dq 0x0000000000000000, 0xFFFFFFFF00000000

%else
; assembler without SHA-NI support: on win64 emit a placeholder symbol instead
%ifidn __OUTPUT_FORMAT__, win64
global no_sha1_ni_x2
no_sha1_ni_x2:
%endif
%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm new file mode 100644 index
000000000..aeb00a008 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm @@ -0,0 +1,485 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; sha1_opt_x1: single-lane SHA-1 block function. The round computation runs
; on general purpose registers while the message schedule (w[i] + K) is
; pre-computed W_PRECALC_AHEAD rounds ahead with SSE instructions and parked
; in a small circular buffer at the bottom of the stack frame.

%include "sha1_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

[bits 64]
default rel
section .text

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifidn __OUTPUT_FORMAT__, elf64
	; Linux
	%define arg0	rdi
	%define arg1	rsi
%else
	; Windows
	%define arg0	rcx
	%define arg1	rdx
%endif

;; STACK_SPACE plus pushes must be an odd multiple of 8 so that rsp is
;; 16-byte aligned after the frame is reserved (movdqa is used on WK slots)
_GPR_SAVE_SIZE	equ 8*9		;rbx, rdx, rbp, (rdi, rsi), r12~r15
_WK_SAVE_SIZE	equ 16*4

_WK_SAVE	equ 0
_GPR_SAVE	equ _WK_SAVE + _WK_SAVE_SIZE
STACK_SPACE	equ _GPR_SAVE + _GPR_SAVE_SIZE

; argument indices here start from 0, while mgr_flush/submit count from 1
%define MGR	arg0
%define NBLK	arg1
%define NLANX4	r10	; consistent with caller
; rax~rdx, rsi, rdi, rbp are used for RR
%define N_MGR	r8	; copy of MGR; the MGR register itself is reused by the rounds
%define IDX	r9	; local variable -- lane index, low byte of r10 on entry
%define K_BASE	r11
%define BUFFER_PTR	r12
%define BUFFER_END	r13
%define TMP	r14	; local variable -- assistant to address digest

%xdefine W_TMP	xmm0
%xdefine W_TMP2	xmm9

%xdefine W0	xmm1
%xdefine W4	xmm2
%xdefine W8	xmm3
%xdefine W12	xmm4
%xdefine W16	xmm5
%xdefine W20	xmm6
%xdefine W24	xmm7
%xdefine W28	xmm8

%xdefine XMM_SHUFB_BSWAP	xmm10

;; we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a
;; circular buffer at the bottom of the frame
%xdefine WK(t)	(rsp + (t & 15)*4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Constants

%xdefine K1	0x5a827999
%xdefine K2	0x6ed9eba1
%xdefine K3	0x8f1bbcdc
%xdefine K4	0xca62c1d6

%xdefine W_PRECALC_AHEAD	16
%xdefine W_NO_TAIL_PRECALC	0

; Rounds macros

; Bind the five state words and two temporaries to 32-bit registers.
%macro REGALLOC 0
	%xdefine A	ecx
	%xdefine B	esi
	%xdefine C	edi
	%xdefine D	ebp
	%xdefine E	edx

	%xdefine T1	eax
	%xdefine T2	ebx
%endmacro

; T1 = ((%2 ^ %3) & %1) ^ %3
%macro F1 3
	mov	T1,%2
	xor	T1,%3
	and	T1,%1
	xor	T1,%3
%endmacro

; T1 = %1 ^ %2 ^ %3
%macro F2 3
	mov	T1,%3
	xor	T1,%2
	xor	T1,%1
%endmacro

; T1 = ((%1 | %2) & %3) | (%1 & %2), clobbers T2
%macro F3 3
	mov	T1,%2
	mov	T2,%1
	or	T1,%1
	and	T2,%2
	and	T1,%3
	or	T1,T2
%endmacro

%define F4 F2

; %2 += %1; %1 = %2  (accumulate a state word into its digest slot)
%macro UPDATE_HASH 2
	add	%2, %1
	mov	%1, %2
%endmacro


; Emit the slice of message-schedule work that belongs to schedule index %1.
; Indices 80..(80+W_PRECALC_AHEAD-1) wrap around and start the schedule of the
; next block.
%macro W_PRECALC 1
	%xdefine i (%1)

	%if (i < 20)
		%xdefine K_XMM	0
	%elif (i < 40)
		%xdefine K_XMM	16
	%elif (i < 60)
		%xdefine K_XMM	32
	%else
		%xdefine K_XMM	48
	%endif

	%if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD)))

		%if (W_NO_TAIL_PRECALC == 0)

			%xdefine i ((%1) % 80)	;; pre-compute for the next iteration

			%if (i == 0)
				W_PRECALC_RESET
			%endif


			W_PRECALC_00_15
		%endif

	%elif (i < 32)
		W_PRECALC_16_31
	%elif (i < 80)	;; rounds 32-79
		W_PRECALC_32_79
	%endif
%endmacro

; Reset the symbolic W register window to its initial assignment.
%macro W_PRECALC_RESET 0
	%xdefine W		W0
	%xdefine W_minus_04	W4
	%xdefine W_minus_08	W8
	%xdefine W_minus_12	W12
	%xdefine W_minus_16	W16
	%xdefine W_minus_20	W20
	%xdefine W_minus_24	W24
	%xdefine W_minus_28	W28
	%xdefine W_minus_32	W
%endmacro

; Rotate the symbolic W register window by one vector (4 rounds).
%macro W_PRECALC_ROTATE 0
	%xdefine W_minus_32	W_minus_28
	%xdefine W_minus_28	W_minus_24
	%xdefine W_minus_24	W_minus_20
	%xdefine W_minus_20	W_minus_16
	%xdefine W_minus_16	W_minus_12
	%xdefine W_minus_12	W_minus_08
	%xdefine W_minus_08	W_minus_04
	%xdefine W_minus_04	W
	%xdefine W		W_minus_32
%endmacro

%macro W_PRECALC_00_15 0
	;; message scheduling pre-compute for rounds 0-15
	%if ((i & 3) == 0)	;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
		movdqu	W_TMP, [BUFFER_PTR + (i * 4)]
	%elif ((i & 3) == 1)
		pshufb	W_TMP, XMM_SHUFB_BSWAP
		movdqa	W, W_TMP
	%elif ((i & 3) == 2)
		paddd	W_TMP, [K_BASE]
	%elif ((i & 3) == 3)
		movdqa	[WK(i&~3)], W_TMP

		W_PRECALC_ROTATE
	%endif
%endmacro

%macro W_PRECALC_16_31 0
	;; message scheduling pre-compute for rounds 16-31
	;; calculating last 32 w[i] values in 8 XMM registers
	;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction
	;;
	;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency
	;;
	%if ((i & 3) == 0)	;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
		movdqa	W, W_minus_12
		palignr	W, W_minus_16, 8	;; w[i-14]
		movdqa	W_TMP, W_minus_04
		psrldq	W_TMP, 4		;; w[i-3]
		pxor	W, W_minus_08
	%elif ((i & 3) == 1)
		pxor	W_TMP, W_minus_16
		pxor	W, W_TMP
		movdqa	W_TMP2, W
		movdqa	W_TMP, W
		pslldq	W_TMP2, 12
	%elif ((i & 3) == 2)
		psrld	W, 31
		pslld	W_TMP, 1
		por	W_TMP, W
		movdqa	W, W_TMP2
		psrld	W_TMP2, 30
		pslld	W, 2
	%elif ((i & 3) == 3)
		pxor	W_TMP, W
		pxor	W_TMP, W_TMP2
		movdqa	W, W_TMP
		paddd	W_TMP, [K_BASE + K_XMM]
		movdqa	[WK(i&~3)],W_TMP

		W_PRECALC_ROTATE
	%endif
%endmacro

%macro W_PRECALC_32_79 0
	;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
	;; instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	;; allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
	;;
	%if ((i & 3) == 0)	;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
		movdqa	W_TMP, W_minus_04
		pxor	W, W_minus_28		;; W is W_minus_32 before xor
		palignr	W_TMP, W_minus_08, 8
	%elif ((i & 3) == 1)
		pxor	W, W_minus_16
		pxor	W, W_TMP
		movdqa	W_TMP, W
	%elif ((i & 3) == 2)
		psrld	W, 30
		pslld	W_TMP, 2
		por	W_TMP, W
	%elif ((i & 3) == 3)
		movdqa	W, W_TMP
		paddd	W_TMP, [K_BASE + K_XMM]
		movdqa	[WK(i&~3)],W_TMP

		W_PRECALC_ROTATE
	%endif
%endmacro

%macro RR 6	;; RR does two rounds of SHA-1 back to back with W pre-calculation

	;; TEMP = A
	;; A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i)
	;; C = ROTATE_LEFT( B, 30 )
	;; D = C
	;; E = D
	;; B = TEMP

	W_PRECALC (%6 + W_PRECALC_AHEAD)
	F	%2, %3, %4	;; F returns result in T1
	add	%5, [WK(%6)]
	rol	%2, 30
	mov	T2, %1
	add	%4, [WK(%6 + 1)]
	rol	T2, 5
	add	%5, T1

	W_PRECALC (%6 + W_PRECALC_AHEAD + 1)
	add	T2, %5
	mov	%5, T2
	rol	T2, 5
	add	%4, T2
	F	%1, %2, %3	;; F returns result in T1
	add	%4, T1
	rol	%1, 30

;; write:  %1, %2
;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; void sha1_opt_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
; arg 0 : MGR : pointer to args
; arg 1 : NBLK : size (in blocks) ;; expected to be >= 1; 0 returns without
;                                 ;; touching the digest
; invisible arg (r10) : low 8 bits are the lane index (IDX), the remaining
;                       bits (r10 >> 8) become NLANX4
; NLANX4 : byte stride between consecutive digest words of a lane
;          (max lanes*4 for this arch: sse/avx is 4, avx2 is 8, avx512 is 16)
;
; Register contract as implemented below:
;   saved/restored on the stack : rbx, rbp, rdx, r12~r15 (plus rdi, rsi on win64)
;   restored from N_MGR         : MGR (arg0) -- on every exit path
;   clobbered                   : rax, rcx/rsi (whichever is not restored for
;                                 the target ABI), r8~r11, xmm0-xmm10
;
mk_global sha1_opt_x1, function, internal
sha1_opt_x1:
	endbranch

	sub	rsp, STACK_SPACE
	mov	[rsp + _GPR_SAVE + 8*0], rbx
	mov	[rsp + _GPR_SAVE + 8*1], rbp
%ifidn __OUTPUT_FORMAT__, win64
	mov	[rsp + _GPR_SAVE + 8*2], rdi
	mov	[rsp + _GPR_SAVE + 8*3], rsi
	; caller has already stored XMM6~10
%endif
	mov	[rsp + _GPR_SAVE + 8*4], r12
	mov	[rsp + _GPR_SAVE + 8*5], r13
	mov	[rsp + _GPR_SAVE + 8*6], r14
	mov	[rsp + _GPR_SAVE + 8*7], r15
	mov	[rsp + _GPR_SAVE + 8*8], rdx

	;; Keep a copy of MGR before any exit can be taken: .lend always restores
	;; MGR from N_MGR, so N_MGR must be valid on the zero-block path as well.
	;; (mov does not modify flags, so the jz below still tests the shl result.)
	mov	N_MGR, MGR

	shl	NBLK, 6		; transform blk amount into bytes
	jz	.lend
	; detach idx from nlanx4
	mov	IDX, NLANX4
	shr	NLANX4, 8
	and	IDX, 0xff

	;; from here on this routine takes over r8~r11
	;; Load input pointers
	mov	BUFFER_PTR, [MGR + _data_ptr + IDX*8]
	;; nblk is used to indicate data end
	add	NBLK, BUFFER_PTR
	mov	BUFFER_END, NBLK

	lea	K_BASE, [K_XMM_AR]
	movdqu	XMM_SHUFB_BSWAP, [bswap_shufb_ctl]

	REGALLOC

	lea	TMP, [N_MGR + 4*IDX]
	;; Initialize digest
	mov	A, [TMP + 0*NLANX4]
	mov	B, [TMP + 1*NLANX4]
	mov	C, [TMP + 2*NLANX4]
	lea	TMP, [TMP + 2*NLANX4]	; MGR + 4*IDX + 2*NLANX4
	mov	D, [TMP + 1*NLANX4]
	mov	E, [TMP + 2*NLANX4]

	;; prime the schedule: w[0..15]+K of the first block
	%assign i 0
	%rep W_PRECALC_AHEAD
		W_PRECALC i
	%assign i i+1
	%endrep

	%xdefine F F1

.lloop:
	cmp	BUFFER_PTR, K_BASE	;; we use K_BASE value as a signal of a last block,
	jne	.lbegin			;; it is set below by: cmovae BUFFER_PTR, K_BASE
	jmp	.lend

.lbegin:
	RR A,B,C,D,E,0
	RR D,E,A,B,C,2
	RR B,C,D,E,A,4
	RR E,A,B,C,D,6
	RR C,D,E,A,B,8

	RR A,B,C,D,E,10
	RR D,E,A,B,C,12
	RR B,C,D,E,A,14
	RR E,A,B,C,D,16
	RR C,D,E,A,B,18

	%xdefine F F2

	RR A,B,C,D,E,20
	RR D,E,A,B,C,22
	RR B,C,D,E,A,24
	RR E,A,B,C,D,26
	RR C,D,E,A,B,28

	RR A,B,C,D,E,30
	RR D,E,A,B,C,32
	RR B,C,D,E,A,34
	RR E,A,B,C,D,36
	RR C,D,E,A,B,38

	%xdefine F F3

	RR A,B,C,D,E,40
	RR D,E,A,B,C,42
	RR B,C,D,E,A,44
	RR E,A,B,C,D,46
	RR C,D,E,A,B,48

	RR A,B,C,D,E,50
	RR D,E,A,B,C,52
	RR B,C,D,E,A,54
	RR E,A,B,C,D,56
	RR C,D,E,A,B,58

	%xdefine F F4

	;; Rounds 64-79 already pre-compute w[0..15] of the *next* block, so the
	;; pointer is advanced here. After the last block BUFFER_PTR is pointed at
	;; K_BASE: the loads stay inside the 64-byte constant table and the value
	;; doubles as the loop-exit marker tested at .lloop.
	add	BUFFER_PTR, 64		;; move to next 64-byte block
	cmp	BUFFER_PTR, BUFFER_END	;; check if current block is the last one
	cmovae	BUFFER_PTR, K_BASE	;; smart way to signal the last iteration

	RR A,B,C,D,E,60
	RR D,E,A,B,C,62
	RR B,C,D,E,A,64
	RR E,A,B,C,D,66
	RR C,D,E,A,B,68

	RR A,B,C,D,E,70
	RR D,E,A,B,C,72
	RR B,C,D,E,A,74
	RR E,A,B,C,D,76
	RR C,D,E,A,B,78

	;; fold the block result into the lane's digest and reload the state
	lea	TMP, [N_MGR + 4*IDX]
	UPDATE_HASH [TMP + 0*NLANX4],A
	UPDATE_HASH [TMP + 1*NLANX4],B
	UPDATE_HASH [TMP + 2*NLANX4],C
	lea	TMP, [TMP + 2*NLANX4]	; MGR + 4*IDX + 2*NLANX4
	UPDATE_HASH [TMP + 1*NLANX4],D
	UPDATE_HASH [TMP + 2*NLANX4],E

	jmp	.lloop

.lend:
	mov	MGR, N_MGR	; N_MGR is set before the first branch to .lend

	mov	rdx, [rsp + _GPR_SAVE + 8*8]
	mov	r15, [rsp + _GPR_SAVE + 8*7]
	mov	r14, [rsp + _GPR_SAVE + 8*6]
	mov	r13, [rsp + _GPR_SAVE + 8*5]
	mov	r12, [rsp + _GPR_SAVE + 8*4]
%ifidn __OUTPUT_FORMAT__, win64
	mov	rsi, [rsp + _GPR_SAVE + 8*3]
	mov	rdi, [rsp + _GPR_SAVE + 8*2]
%endif
	mov	rbp, [rsp + _GPR_SAVE + 8*1]
	mov	rbx, [rsp + _GPR_SAVE + 8*0]
	add	rsp, STACK_SPACE

	ret


;;----------------------
section .data align=64

align 128
; round constants, one 16-byte vector per group of 20 rounds (indexed by K_XMM)
K_XMM_AR:
	DD K1, K1, K1, K1
	DD K2, K2, K2, K2
	DD K3, K3, K3, K3
	DD K4, K4, K4, K4

align 16
; pshufb control that byte-swaps each dword of a vector
bswap_shufb_ctl:
	DD 00010203h
	DD 04050607h
	DD 08090a0bh
	DD 0c0d0e0fh
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c new file mode 100644 index 000000000..e82fb30fe --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c @@ -0,0 +1,220 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <string.h> +#include "sha1_mb.h" +#include "endian_helper.h" + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +// Reference SHA1 Functions +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +#define H0 0x67452301 +#define H1 0xefcdab89 +#define H2 0x98badcfe +#define H3 0x10325476 +#define H4 0xc3d2e1f0 + +#define F1(b,c,d) (d ^ (b & (c ^ d))) +#define F2(b,c,d) (b ^ c ^ d) +#define F3(b,c,d) ((b & c) | (d & (b | c))) +#define F4(b,c,d) (b ^ c ^ d) + +#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r)))) + +#define W(x) w[(x) & 15] + +#define step00_19(i,a,b,c,d,e) \ + if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + else W(i) = to_be32(ww[i]); \ + e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \ + b = rol32(b,30) + +#define step20_39(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \ + b = rol32(b,30) + +#define step40_59(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \ + b = rol32(b,30) + +#define step60_79(i,a,b,c,d,e) \ + W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \ + e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \ + b = rol32(b,30) + +static void OPT_FIX sha1_single(const uint8_t * data, uint32_t digest[]); + +void sha1_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len) +{ + uint32_t i, j; + uint8_t buf[2 * SHA1_BLOCK_SIZE]; + + digest[0] = H0; + digest[1] = H1; + digest[2] = H2; + digest[3] = H3; + digest[4] = H4; + + i = len; + while (i >= SHA1_BLOCK_SIZE) { + sha1_single(input_data, digest); + input_data 
+= SHA1_BLOCK_SIZE; + i -= SHA1_BLOCK_SIZE; + } + + memcpy(buf, input_data, i); + buf[i++] = 0x80; + for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - SHA1_PADLENGTHFIELD_SIZE); j++) + buf[j] = 0; + + if (i > SHA1_BLOCK_SIZE - SHA1_PADLENGTHFIELD_SIZE) + i = 2 * SHA1_BLOCK_SIZE; + else + i = SHA1_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8); + + sha1_single(buf, digest); + if (i == (2 * SHA1_BLOCK_SIZE)) + sha1_single(buf + SHA1_BLOCK_SIZE, digest); +} + +void sha1_single(const uint8_t * data, uint32_t digest[]) +{ + uint32_t a, b, c, d, e; + uint32_t w[16] = { 0 }; + uint32_t *ww = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + + step00_19(0, a, b, c, d, e); + step00_19(1, e, a, b, c, d); + step00_19(2, d, e, a, b, c); + step00_19(3, c, d, e, a, b); + step00_19(4, b, c, d, e, a); + step00_19(5, a, b, c, d, e); + step00_19(6, e, a, b, c, d); + step00_19(7, d, e, a, b, c); + step00_19(8, c, d, e, a, b); + step00_19(9, b, c, d, e, a); + step00_19(10, a, b, c, d, e); + step00_19(11, e, a, b, c, d); + step00_19(12, d, e, a, b, c); + step00_19(13, c, d, e, a, b); + step00_19(14, b, c, d, e, a); + step00_19(15, a, b, c, d, e); + step00_19(16, e, a, b, c, d); + step00_19(17, d, e, a, b, c); + step00_19(18, c, d, e, a, b); + step00_19(19, b, c, d, e, a); + + step20_39(20, a, b, c, d, e); + step20_39(21, e, a, b, c, d); + step20_39(22, d, e, a, b, c); + step20_39(23, c, d, e, a, b); + step20_39(24, b, c, d, e, a); + step20_39(25, a, b, c, d, e); + step20_39(26, e, a, b, c, d); + step20_39(27, d, e, a, b, c); + step20_39(28, c, d, e, a, b); + step20_39(29, b, c, d, e, a); + step20_39(30, a, b, c, d, e); + step20_39(31, e, a, b, c, d); + step20_39(32, d, e, a, b, c); + step20_39(33, c, d, e, a, b); + step20_39(34, b, c, d, e, a); + step20_39(35, a, b, c, d, e); + step20_39(36, e, a, b, c, d); + step20_39(37, d, e, a, b, c); + step20_39(38, c, d, e, a, b); + step20_39(39, b, c, d, e, a); + + 
step40_59(40, a, b, c, d, e); + step40_59(41, e, a, b, c, d); + step40_59(42, d, e, a, b, c); + step40_59(43, c, d, e, a, b); + step40_59(44, b, c, d, e, a); + step40_59(45, a, b, c, d, e); + step40_59(46, e, a, b, c, d); + step40_59(47, d, e, a, b, c); + step40_59(48, c, d, e, a, b); + step40_59(49, b, c, d, e, a); + step40_59(50, a, b, c, d, e); + step40_59(51, e, a, b, c, d); + step40_59(52, d, e, a, b, c); + step40_59(53, c, d, e, a, b); + step40_59(54, b, c, d, e, a); + step40_59(55, a, b, c, d, e); + step40_59(56, e, a, b, c, d); + step40_59(57, d, e, a, b, c); + step40_59(58, c, d, e, a, b); + step40_59(59, b, c, d, e, a); + + step60_79(60, a, b, c, d, e); + step60_79(61, e, a, b, c, d); + step60_79(62, d, e, a, b, c); + step60_79(63, c, d, e, a, b); + step60_79(64, b, c, d, e, a); + step60_79(65, a, b, c, d, e); + step60_79(66, e, a, b, c, d); + step60_79(67, d, e, a, b, c); + step60_79(68, c, d, e, a, b); + step60_79(69, b, c, d, e, a); + step60_79(70, a, b, c, d, e); + step60_79(71, e, a, b, c, d); + step60_79(72, d, e, a, b, c); + step60_79(73, c, d, e, a, b); + step60_79(74, b, c, d, e, a); + step60_79(75, a, b, c, d, e); + step60_79(76, e, a, b, c, d); + step60_79(77, d, e, a, b, c); + step60_79(78, c, d, e, a, b); + step60_79(79, b, c, d, e, a); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; +} |