author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239
tree       64f88b554b444a49f656b6c656111a145cbbaa28  /src/crypto/isa-l/isa-l_crypto/mh_sha256
parent     Initial commit.
Adding upstream version 18.2.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/mh_sha256')
22 files changed, 5658 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am new file mode 100644 index 000000000..d6e8b61ab --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am @@ -0,0 +1,88 @@ +######################################################################## +# Copyright(c) 2011-2017 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+######################################################################## + +lsrc_sha256 = mh_sha256/sha256_for_mh_sha256.c + +lsrc_mh_sha256 = mh_sha256/mh_sha256.c \ + mh_sha256/mh_sha256_block_sse.asm \ + mh_sha256/mh_sha256_block_avx.asm \ + mh_sha256/mh_sha256_block_avx2.asm \ + mh_sha256/mh_sha256_multibinary.asm \ + mh_sha256/mh_sha256_finalize_base.c \ + mh_sha256/mh_sha256_update_base.c \ + mh_sha256/mh_sha256_block_base.c + +lsrc_mh_sha256 += mh_sha256/mh_sha256_block_avx512.asm \ + mh_sha256/mh_sha256_avx512.c + +lsrc_x86_64 += $(lsrc_sha256) \ + $(lsrc_mh_sha256) + +lsrc_x86_32 += $(lsrc_x86_64) + +other_src += mh_sha256/mh_sha256_ref.c \ + include/reg_sizes.asm \ + include/multibinary.asm \ + include/test.h \ + mh_sha256/mh_sha256_internal.h + +lsrc_aarch64 += $(lsrc_sha256) \ + mh_sha256/aarch64/mh_sha256_multibinary.S \ + mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c \ + mh_sha256/aarch64/mh_sha256_block_ce.S \ + mh_sha256/aarch64/mh_sha256_ce.c \ + mh_sha256/mh_sha256.c \ + mh_sha256/mh_sha256_finalize_base.c \ + mh_sha256/mh_sha256_update_base.c \ + mh_sha256/mh_sha256_block_base.c + +lsrc_base_aliases += $(lsrc_sha256) \ + mh_sha256/mh_sha256_base_aliases.c \ + mh_sha256/mh_sha256.c \ + mh_sha256/mh_sha256_finalize_base.c \ + mh_sha256/mh_sha256_update_base.c \ + mh_sha256/mh_sha256_block_base.c + +src_include += -I $(srcdir)/mh_sha256 + +extern_hdrs += include/mh_sha256.h + +check_tests += mh_sha256/mh_sha256_test +unit_tests += mh_sha256/mh_sha256_update_test + +perf_tests += mh_sha256/mh_sha256_perf + + +mh_sha256_test: mh_sha256_ref.o +mh_sha256_mh_sha256_test_LDADD = mh_sha256/mh_sha256_ref.lo libisal_crypto.la + +mh_sha256_update_test: mh_sha256_ref.o +mh_sha256_mh_sha256_update_test_LDADD = mh_sha256/mh_sha256_ref.lo libisal_crypto.la + +mh_sha256_mh_sha256_perf_LDADD = libisal_crypto.la diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c new file mode 100644 index 000000000..155790fc1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c @@ -0,0 +1,49 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <aarch64_multibinary.h> + +DEFINE_INTERFACE_DISPATCHER(mh_sha256_update) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA2) + return PROVIDER_INFO(mh_sha256_update_ce); + + return PROVIDER_BASIC(mh_sha256_update); + +} + +DEFINE_INTERFACE_DISPATCHER(mh_sha256_finalize) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA2) + return PROVIDER_INFO(mh_sha256_finalize_ce); + + return PROVIDER_BASIC(mh_sha256_finalize); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S new file mode 100644 index 000000000..53a78ea7d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S @@ -0,0 +1,731 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
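The two dispatchers above probe the kernel-reported capability bits with getauxval(AT_HWCAP) and hand back the SHA-2 Crypto Extensions provider when the hardware has it, falling back to the portable base code otherwise. A minimal self-contained sketch of the same runtime-selection idea follows; the stub providers and resolve_update() are hypothetical stand-ins for what PROVIDER_INFO/PROVIDER_BASIC resolve to in aarch64_multibinary.h:

#include <stdint.h>
#include <sys/auxv.h>                 /* getauxval, AT_HWCAP */

#ifndef HWCAP_SHA2
#define HWCAP_SHA2 (1UL << 6)         /* Linux arm64 HWCAP bit for SHA-2 instructions */
#endif

typedef int (*update_fn)(void *ctx, const void *buf, uint32_t len);

/* hypothetical providers standing in for mh_sha256_update_ce/_base */
static int update_ce(void *c, const void *b, uint32_t l)   { (void)c; (void)b; (void)l; return 0; }
static int update_base(void *c, const void *b, uint32_t l) { (void)c; (void)b; (void)l; return 0; }

/* Same decision the DEFINE_INTERFACE_DISPATCHER bodies make above. */
static update_fn resolve_update(void)
{
	unsigned long hwcap = getauxval(AT_HWCAP);
	return (hwcap & HWCAP_SHA2) ? update_ce : update_base;
}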
+**********************************************************************/ + .arch armv8-a+crypto + .text + .align 6 + + .global mh_sha256_block_ce + .type mh_sha256_block_ce, %function + +/* +Macros +*/ + +.macro declare_vector_reg name:req,reg:req,default:req + \name .req \default\reg + q_\name .req q\reg + v_\name .req v\reg + s_\name .req s\reg +.endm + +declare_vector_reg lane0_msg0, 0,v +declare_vector_reg lane1_msg0, 1,v +declare_vector_reg lane2_msg0, 2,v +declare_vector_reg lane3_msg0, 3,v + +declare_vector_reg lane0_msg1, 4,v +declare_vector_reg lane1_msg1, 5,v +declare_vector_reg lane2_msg1, 6,v +declare_vector_reg lane3_msg1, 7,v + +declare_vector_reg lane0_msg2, 8,v +declare_vector_reg lane1_msg2, 9,v +declare_vector_reg lane2_msg2, 10,v +declare_vector_reg lane3_msg2, 11,v + +declare_vector_reg lane0_msg3, 12,v +declare_vector_reg lane1_msg3, 13,v +declare_vector_reg lane2_msg3, 14,v +declare_vector_reg lane3_msg3, 15,v + +declare_vector_reg lane0_state0, 16,v +declare_vector_reg lane1_state0, 17,v +declare_vector_reg lane2_state0, 18,v +declare_vector_reg lane3_state0, 19,v + +declare_vector_reg lane0_state1, 20,v +declare_vector_reg lane1_state1, 21,v +declare_vector_reg lane2_state1, 22,v +declare_vector_reg lane3_state1, 23,v + +declare_vector_reg lane0_tmp0, 24,v +declare_vector_reg lane1_tmp0, 25,v +declare_vector_reg lane2_tmp0, 26,v +declare_vector_reg lane3_tmp0, 27,v + +declare_vector_reg lane0_tmp2, 28,v +declare_vector_reg lane1_tmp2, 29,v +declare_vector_reg lane2_tmp2, 30,v +declare_vector_reg lane3_tmp2, 31,v + +declare_vector_reg key, 27,v +declare_vector_reg tmp, 29,v + +/* +void mh_sha256_block_ce(const uint8_t * input_data, + uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], + uint32_t num_blocks); +*/ + x_input_data .req x0 + x_digests .req x1 + x_frame_buffer .req x2 + w_num_blocks .req w3 + + x_digest_addr .req x4 + x_key_addr .req x5 + x_msg_addr .req x6 + x_lane_offs .req x7 + x_offs .req x9 + w_input_data_end .req w10 + x_input_data_end .req x10 + x_tmp .req x11 +mh_sha256_block_ce: + cbz w_num_blocks, .exit + mov w_input_data_end, w_num_blocks + + ubfiz x_input_data_end, x_input_data_end, 10, 32 + add x_input_data_end, x_input_data, x_input_data_end + + adrp x_key_addr, .key_addr + add x_key_addr, x_key_addr, :lo12:.key_addr + + stp d8, d9, [sp, -192]! 
+ + stp d10, d11, [sp, 16] + stp d12, d13, [sp, 32] + stp d14, d15, [sp, 48] + + .p2align 3,,7 +.start_loop: + mov x_lane_offs, 0 + mov x_digest_addr, x_digests + +.lane_loop: + add x_msg_addr, x_input_data, x_lane_offs, lsl 2 + + .p2align 3,,7 + mov x_offs, 64 + mov x_tmp, x_digest_addr + ld4 {v_lane0_state0.S-v_lane3_state0.S}[0], [x_tmp], x_offs + ld4 {v_lane0_state0.S-v_lane3_state0.S}[1], [x_tmp], x_offs + ld4 {v_lane0_state0.S-v_lane3_state0.S}[2], [x_tmp], x_offs + ld4 {v_lane0_state0.S-v_lane3_state0.S}[3], [x_tmp], x_offs + + add x_tmp, x_digest_addr, 256 + ld4 {v_lane0_state1.S-v_lane3_state1.S}[0], [x_tmp], x_offs + ld4 {v_lane0_state1.S-v_lane3_state1.S}[1], [x_tmp], x_offs + ld4 {v_lane0_state1.S-v_lane3_state1.S}[2], [x_tmp], x_offs + ld4 {v_lane0_state1.S-v_lane3_state1.S}[3], [x_tmp], x_offs + + ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[0], [x_msg_addr], x_offs + ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[1], [x_msg_addr], x_offs + ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[2], [x_msg_addr], x_offs + ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[3], [x_msg_addr], x_offs + + ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[0], [x_msg_addr], x_offs + ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[1], [x_msg_addr], x_offs + ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[2], [x_msg_addr], x_offs + ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[3], [x_msg_addr], x_offs + + ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[0], [x_msg_addr], x_offs + ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[1], [x_msg_addr], x_offs + ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[2], [x_msg_addr], x_offs + ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[3], [x_msg_addr], x_offs + + ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[0], [x_msg_addr], x_offs + ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[1], [x_msg_addr], x_offs + ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[2], [x_msg_addr], x_offs + ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[3], [x_msg_addr], x_offs + + // reverse for little endian + rev32 v_lane0_msg0.16b, v_lane0_msg0.16b + rev32 v_lane1_msg0.16b, v_lane1_msg0.16b + rev32 v_lane2_msg0.16b, v_lane2_msg0.16b + rev32 v_lane3_msg0.16b, v_lane3_msg0.16b + + rev32 v_lane0_msg1.16b, v_lane0_msg1.16b + rev32 v_lane1_msg1.16b, v_lane1_msg1.16b + rev32 v_lane2_msg1.16b, v_lane2_msg1.16b + rev32 v_lane3_msg1.16b, v_lane3_msg1.16b + + rev32 v_lane0_msg2.16b, v_lane0_msg2.16b + rev32 v_lane1_msg2.16b, v_lane1_msg2.16b + rev32 v_lane2_msg2.16b, v_lane2_msg2.16b + rev32 v_lane3_msg2.16b, v_lane3_msg2.16b + + rev32 v_lane0_msg3.16b, v_lane0_msg3.16b + rev32 v_lane1_msg3.16b, v_lane1_msg3.16b + rev32 v_lane2_msg3.16b, v_lane2_msg3.16b + rev32 v_lane3_msg3.16b, v_lane3_msg3.16b + + // rounds 0-3 + ldr q_key, [x_key_addr] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + str q_lane0_state1, [sp, 64] + str q_lane1_state1, [sp, 80] + str q_lane2_state1, [sp, 96] + str q_lane3_state1, [sp, 112] + + mov x_offs, 64 + mov x_tmp, x_digest_addr + ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[0], [x_tmp], x_offs + ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[1], [x_tmp], x_offs + ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[2], [x_tmp], x_offs + ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[3], [x_tmp], x_offs + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, 
v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s + sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s + sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s + sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s + + sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s + sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s + sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s + sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s + + // rounds 4-7 + ldr q_key, [x_key_addr, 16] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s + sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s + sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s + sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s + + sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s + sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s + sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s + sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s + + // rounds 8-11 + ldr q_key, [x_key_addr, 32] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s + sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s + sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s + sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s + + sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s + sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s + sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s + sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s + + // rounds 12-15 + ldr q_key, [x_key_addr, 48] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, 
v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s + sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s + sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s + sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s + + sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s + sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s + sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s + sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s + + // rounds 16-19 + ldr q_key, [x_key_addr, 64] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s + sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s + sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s + sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s + + sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s + sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s + sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s + sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s + + // rounds 20-23 + ldr q_key, [x_key_addr, 80] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s + sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s + sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s + sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s + + sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s + sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s + sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s + sha256su1 
v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s + + // rounds 24-27 + ldr q_key, [x_key_addr, 96] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s + sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s + sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s + sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s + + sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s + sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s + sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s + sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s + + // rounds 28-31 + ldr q_key, [x_key_addr, 112] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s + sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s + sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s + sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s + + sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s + sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s + sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s + sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s + + // rounds 32-35 + ldr q_key, [x_key_addr, 128] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, 
v_lane3_tmp0.4s + + sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s + sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s + sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s + sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s + + sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s + sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s + sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s + sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s + + // rounds 36-39 + ldr q_key, [x_key_addr, 144] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s + sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s + sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s + sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s + + sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s + sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s + sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s + sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s + + // rounds 40-43 + ldr q_key, [x_key_addr, 160] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s + sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s + sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s + sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s + + sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s + sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s + sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s + sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s + + // rounds 44-47 + ldr q_key, [x_key_addr, 176] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, 
v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s + sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s + sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s + sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s + + sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s + sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s + sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s + sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s + + // rounds 48-51 + ldr q_key, [x_key_addr, 192] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + // rounds 52-55 + ldr q_key, [x_key_addr, 208] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + // rounds 56-59 + ldr q_key, [x_key_addr, 224] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + // rounds 60-63 + ldr q_key, [x_key_addr, 
240] + add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s + add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s + add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s + add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s + + mov v_lane0_tmp2.16b, v_lane0_state0.16b + mov v_lane1_tmp2.16b, v_lane1_state0.16b + mov v_lane2_tmp2.16b, v_lane2_state0.16b + mov v_lane3_tmp2.16b, v_lane3_state0.16b + + sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s + sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s + sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s + sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s + + sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s + sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s + sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s + sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s + + mov x_offs, 64 + mov x_tmp, x_digest_addr + ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[0], [x_tmp], x_offs + ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[1], [x_tmp], x_offs + ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[2], [x_tmp], x_offs + ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[3], [x_tmp], x_offs + + add v_lane0_state0.4s, v_lane0_tmp0.4s, v_lane0_state0.4s + add v_lane1_state0.4s, v_lane1_tmp0.4s, v_lane1_state0.4s + add v_lane2_state0.4s, v_lane2_tmp0.4s, v_lane2_state0.4s + add v_lane3_state0.4s, v_lane3_tmp0.4s, v_lane3_state0.4s + + mov x_offs, 64 + mov x_tmp, x_digest_addr + st4 {v_lane0_state0.S-v_lane3_state0.S}[0], [x_tmp], x_offs + st4 {v_lane0_state0.S-v_lane3_state0.S}[1], [x_tmp], x_offs + st4 {v_lane0_state0.S-v_lane3_state0.S}[2], [x_tmp], x_offs + st4 {v_lane0_state0.S-v_lane3_state0.S}[3], [x_tmp], x_offs + + ldp q_lane0_tmp2, q_lane1_tmp2, [sp, 64] + ldp q_lane2_tmp2, q_lane3_tmp2, [sp, 96] + + add v_lane0_state1.4s, v_lane0_tmp2.4s, v_lane0_state1.4s + add v_lane1_state1.4s, v_lane1_tmp2.4s, v_lane1_state1.4s + add v_lane2_state1.4s, v_lane2_tmp2.4s, v_lane2_state1.4s + add v_lane3_state1.4s, v_lane3_tmp2.4s, v_lane3_state1.4s + + mov x_offs, 64 + add x_tmp, x_digest_addr, 256 + st4 {v_lane0_state1.S-v_lane3_state1.S}[0], [x_tmp], x_offs + st4 {v_lane0_state1.S-v_lane3_state1.S}[1], [x_tmp], x_offs + st4 {v_lane0_state1.S-v_lane3_state1.S}[2], [x_tmp], x_offs + st4 {v_lane0_state1.S-v_lane3_state1.S}[3], [x_tmp], x_offs + + add x_digest_addr, x_digest_addr, 16 + add x_lane_offs, x_lane_offs, 4 + cmp x_lane_offs, 16 + bne .lane_loop + + add x_input_data, x_input_data, 1024 + cmp x_input_data, x_input_data_end + bne .start_loop + + ldp d10, d11, [sp, 16] + ldp d12, d13, [sp, 32] + ldp d14, d15, [sp, 48] + ldp d8, d9, [sp], 192 +.exit: + ret + .size mh_sha256_block_ce, .-mh_sha256_block_ce + + .section .rodata + .align 4 + .set .key_addr,. 
+ 0 + .type K, %object + .size K, 256 +K: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c new file mode 100644 index 000000000..c42333ed5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c @@ -0,0 +1,53 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
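As the prototype comment above shows, the block routine's digests argument is laid out word-major: digests[w][s] holds digest word w of segment s, so one 64-byte row carries the same word for all 16 segments, which is why the ld4/st4 sequences walk memory with a 64-byte stride (x_offs). A sketch of initializing that layout, assuming HASH_SEGS is 16, SHA256_DIGEST_WORDS is 8, and MH_SHA256_H0..H7 are the standard SHA-256 initial values (consistent with the loop in mh_sha256_init further down):

#include <stdint.h>

#define HASH_SEGS           16   /* assumed: 16 interleaved segments per 1 KB block */
#define SHA256_DIGEST_WORDS  8

static void init_seg_digests(uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
{
	/* standard SHA-256 H0..H7 */
	static const uint32_t H[8] = {
		0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
		0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
	};
	for (int w = 0; w < SHA256_DIGEST_WORDS; w++)
		for (int s = 0; s < HASH_SEGS; s++)
			digests[w][s] = H[w];  /* row w = word w for all 16 segments */
}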
+**********************************************************************/ +#include <string.h> +#include "mh_sha256_internal.h" + +void mh_sha256_block_ce(const uint8_t * input_data, + uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); +/***************mh_sha256_update***********/ +// mh_sha256_update_ce.c +#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_ce +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_ce +#include "mh_sha256_update_base.c" +#undef MH_SHA256_UPDATE_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +/***************mh_sha256_finalize AND mh_sha256_tail***********/ +// mh_sha256_tail is used to calculate the last incomplete src data block +// mh_sha256_finalize is a mh_sha256_ctx wrapper of mh_sha256_tail +// mh_sha256_finalize_ce.c and mh_sha256_tail_ce.c +#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_ce +#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_ce +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_ce +#include "mh_sha256_finalize_base.c" +#undef MH_SHA256_FINALIZE_FUNCTION +#undef MH_SHA256_TAIL_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S new file mode 100644 index 000000000..54eece175 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S @@ -0,0 +1,35 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
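mh_sha256_ce.c above generates its _ce entry points without duplicating logic: it defines the name macros, re-includes the shared base sources, then #undefs them for the next variant. The sketch below shows the same one-body-many-names technique compressed into a single file via a macro (hypothetical names); the upstream code achieves it with #include instead so each variant can live in its own translation unit:

#include <stdio.h>

/* one shared body, stamped out under several names */
#define DEFINE_FOLD(NAME, STEP)                       \
	static int NAME(const int *v, int n)          \
	{                                             \
		int acc = 0;                          \
		for (int i = 0; i < n; i++)           \
			acc = STEP(acc, v[i]);        \
		return acc;                           \
	}

static int add_step(int a, int b) { return a + b; }
static int xor_step(int a, int b) { return a ^ b; }

DEFINE_FOLD(fold_add, add_step)  /* plays the role of mh_sha256_update_ce */
DEFINE_FOLD(fold_xor, xor_step)  /* ...a second variant, e.g. _base      */

int main(void)
{
	int v[] = { 1, 2, 3, 4 };
	printf("%d %d\n", fold_add(v, 4), fold_xor(v, 4));  /* 10 4 */
	return 0;
}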
+**********************************************************************/ + + +#include "aarch64_multibinary.h" + + +mbin_interface mh_sha256_update +mbin_interface mh_sha256_finalize diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c new file mode 100644 index 000000000..242c3e218 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c @@ -0,0 +1,143 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
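mbin_interface declares each exported symbol and wires it to the matching DEFINE_INTERFACE_DISPATCHER resolver from the dispatcher file above. A plausible C rendering of that lazy-binding shape, with hypothetical names (the real macro emits assembly and its exact mechanics live in aarch64_multibinary.h):

#include <stdint.h>

typedef int (*upd_fn)(void *ctx, const void *buf, uint32_t len);

static int upd_resolver(void *ctx, const void *buf, uint32_t len);
static upd_fn upd_ptr = upd_resolver;      /* starts at the resolver */

static int update_base(void *c, const void *b, uint32_t l)
{ (void)c; (void)b; (void)l; return 0; }   /* stand-in provider */

static int upd_resolver(void *ctx, const void *buf, uint32_t len)
{
	upd_ptr = update_base;             /* first call: pick a provider */
	return upd_ptr(ctx, buf, len);     /* then forward the call */
}

int mh_update(void *ctx, const void *buf, uint32_t len)
{
	return upd_ptr(ctx, buf, len);     /* later calls bypass the resolver */
}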
+**********************************************************************/ + +#include <string.h> +#include "mh_sha256_internal.h" + +int mh_sha256_init(struct mh_sha256_ctx *ctx) +{ + uint32_t(*mh_sha256_segs_digests)[HASH_SEGS]; + uint32_t i; + + if (ctx == NULL) + return MH_SHA256_CTX_ERROR_NULL; + + memset(ctx, 0, sizeof(*ctx)); + + mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests; + for (i = 0; i < HASH_SEGS; i++) { + mh_sha256_segs_digests[0][i] = MH_SHA256_H0; + mh_sha256_segs_digests[1][i] = MH_SHA256_H1; + mh_sha256_segs_digests[2][i] = MH_SHA256_H2; + mh_sha256_segs_digests[3][i] = MH_SHA256_H3; + mh_sha256_segs_digests[4][i] = MH_SHA256_H4; + mh_sha256_segs_digests[5][i] = MH_SHA256_H5; + mh_sha256_segs_digests[6][i] = MH_SHA256_H6; + mh_sha256_segs_digests[7][i] = MH_SHA256_H7; + } + + return MH_SHA256_CTX_ERROR_NONE; +} + +#if (!defined(NOARCH)) && (defined(__i386__) || defined(__x86_64__) \ + || defined( _M_X64) || defined(_M_IX86)) +/***************mh_sha256_update***********/ +// mh_sha256_update_sse.c +#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_sse +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_sse +#include "mh_sha256_update_base.c" +#undef MH_SHA256_UPDATE_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +// mh_sha256_update_avx.c +#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx +#include "mh_sha256_update_base.c" +#undef MH_SHA256_UPDATE_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +// mh_sha256_update_avx2.c +#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx2 +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx2 +#include "mh_sha256_update_base.c" +#undef MH_SHA256_UPDATE_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +/***************mh_sha256_finalize AND mh_sha256_tail***********/ +// mh_sha256_tail is used to calculate the last incomplete src data block +// mh_sha256_finalize is a mh_sha256_ctx wrapper of mh_sha256_tail + +// mh_sha256_finalize_sse.c and mh_sha256_tail_sse.c +#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_sse +#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_sse +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_sse +#include "mh_sha256_finalize_base.c" +#undef MH_SHA256_FINALIZE_FUNCTION +#undef MH_SHA256_TAIL_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +// mh_sha256_finalize_avx.c and mh_sha256_tail_avx.c +#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx +#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx +#include "mh_sha256_finalize_base.c" +#undef MH_SHA256_FINALIZE_FUNCTION +#undef MH_SHA256_TAIL_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +// mh_sha256_finalize_avx2.c and mh_sha256_tail_avx2.c +#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx2 +#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx2 +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx2 +#include "mh_sha256_finalize_base.c" +#undef MH_SHA256_FINALIZE_FUNCTION +#undef MH_SHA256_TAIL_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +/***************version info***********/ + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +// Version info +struct slver mh_sha256_init_slver_000002b1; +struct slver mh_sha256_init_slver = { 0x02b1, 0x00, 0x00 }; + +// mh_sha256_update version info +struct slver mh_sha256_update_sse_slver_000002b4; +struct slver mh_sha256_update_sse_slver = { 0x02b4, 0x00, 0x00 }; + +struct slver mh_sha256_update_avx_slver_020002b6; +struct slver mh_sha256_update_avx_slver = { 
0x02b6, 0x00, 0x02 }; + +struct slver mh_sha256_update_avx2_slver_040002b8; +struct slver mh_sha256_update_avx2_slver = { 0x02b8, 0x00, 0x04 }; + +// mh_sha256_finalize version info +struct slver mh_sha256_finalize_sse_slver_000002b5; +struct slver mh_sha256_finalize_sse_slver = { 0x02b5, 0x00, 0x00 }; + +struct slver mh_sha256_finalize_avx_slver_020002b7; +struct slver mh_sha256_finalize_avx_slver = { 0x02b7, 0x00, 0x02 }; + +struct slver mh_sha256_finalize_avx2_slver_040002b9; +struct slver mh_sha256_finalize_avx2_slver = { 0x02b9, 0x00, 0x04 }; +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c new file mode 100644 index 000000000..35fb0fbad --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c @@ -0,0 +1,70 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <string.h> +#include "mh_sha256_internal.h" + +#ifdef HAVE_AS_KNOWS_AVX512 + +/***************mh_sha256_update***********/ +// mh_sha256_update_avx512.c +#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx512 +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx512 +#include "mh_sha256_update_base.c" +#undef MH_SHA256_UPDATE_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +/***************mh_sha256_finalize AND mh_sha256_tail***********/ +// mh_sha256_tail is used to calculate the last incomplete src data block +// mh_sha256_finalize is a mh_sha256_ctx wrapper of mh_sha256_tail +// mh_sha256_finalize_avx512.c and mh_sha256_tail_avx512.c +#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx512 +#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx512 +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx512 +#include "mh_sha256_finalize_base.c" +#undef MH_SHA256_FINALIZE_FUNCTION +#undef MH_SHA256_TAIL_FUNCTION +#undef MH_SHA256_BLOCK_FUNCTION + +/***************version info***********/ +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// mh_sha256_update version info +struct slver mh_sha256_update_avx512_slver_060002bc; +struct slver mh_sha256_update_avx512_slver = { 0x02bc, 0x00, 0x06 }; + +// mh_sha256_finalize version info +struct slver mh_sha256_finalize_avx512_slver_060002bd; +struct slver mh_sha256_finalize_avx512_slver = { 0x02bd, 0x00, 0x06 }; + +#endif // HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c new file mode 100644 index 000000000..343ffb024 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c @@ -0,0 +1,40 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
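The slver records here and in mh_sha256.c are version markers baked into the binary. Comparing the initializers with the symbol names, the numeric suffix appears to pack core, ver and snum in hex (e.g. 060002bc = core 0x06, ver 0x00, snum 0x02bc), with core 0x00/0x02/0x04/0x06 tracking the sse/avx/avx2/avx512 variants in the records above. A small check of that reading:

#include <stdint.h>
#include <stdio.h>

struct slver {                /* same shape as the records above */
	uint16_t snum;
	uint8_t  ver;
	uint8_t  core;
};

int main(void)
{
	/* mirrors mh_sha256_update_avx512_slver_060002bc */
	struct slver s = { 0x02bc, 0x00, 0x06 };
	printf("%02x%02x%04x\n", s.core, s.ver, s.snum);  /* prints 060002bc */
	return 0;
}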
+**********************************************************************/ +#include "mh_sha256_internal.h" +#include <string.h> +int mh_sha256_update(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len) +{ + return mh_sha256_update_base(ctx, buffer, len); + +} + +int mh_sha256_finalize(struct mh_sha256_ctx *ctx, void *mh_sha256_digest) +{ + return mh_sha256_finalize_base(ctx, mh_sha256_digest); +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm new file mode 100644 index 000000000..c2eff350d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm @@ -0,0 +1,557 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
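mh_sha256_base_aliases.c is the no-dispatcher build: on targets without a multibinary layer it binds the public mh_sha256_update/mh_sha256_finalize names straight to the _base implementations. Whichever binding is in effect, callers use the same three-call sequence; a usage sketch, assuming mh_sha256.h (installed via extern_hdrs in the Makefile above) exposes mh_sha256_ctx, SHA256_DIGEST_WORDS and the MH_SHA256_CTX_ERROR_* codes seen in these sources:

#include <stdio.h>
#include <stdint.h>
#include "mh_sha256.h"

int main(void)
{
	static const uint8_t data[] = "hello, mh_sha256";
	uint32_t digest[SHA256_DIGEST_WORDS];   /* 8 words = 32 bytes */
	struct mh_sha256_ctx ctx;

	if (mh_sha256_init(&ctx) != MH_SHA256_CTX_ERROR_NONE)
		return 1;
	mh_sha256_update(&ctx, data, sizeof(data) - 1);
	mh_sha256_finalize(&ctx, digest);

	for (int i = 0; i < SHA256_DIGEST_WORDS; i++)
		printf("%08x", digest[i]);
	putchar('\n');
	return 0;
}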
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA256 using AVX +;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8 + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10 + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg3 +;variables of mh_sha256 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp2 +%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS + +; Common definitions +%define ROUND tmp4 +%define TBL tmp5 + +%define pref tmp3 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define VMOVPS vmovups + +%define SZ 4 +%define SZ4 4*SZ +%define ROUNDS 64*SZ4 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpslld %%tmp, %%reg, (32-(%%imm)) + vpsrld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORD_nd reg, imm, tmp, src +%macro PRORD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpslld %%tmp, %%src, (32-(%%imm)) + vpsrld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORD dst/src, amt +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +; PRORD_nd dst, src, amt +%macro PRORD_nd 3 + PRORD_nd %1, %3, TMP, %2 +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15_R 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, e ; ch: a2 = (f^g)&e + vpxor a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa %%T1, [SZ4*(%%i&0xf) + %%data] + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15_W 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, e ; ch: a2 = (f^g)&e + vpxor a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa [SZ4*(%%i&0xf) + %%data], %%T1 + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + %%data] + vmovdqa a1, [SZ4*((%%i-2)&0xf) + %%data] + vmovdqa a0, %%T1 + PRORD %%T1, 18-7 + vmovdqa a2, a1 + PRORD a1, 19-17 + vpxor %%T1, %%T1, a0 + PRORD %%T1, 7 + vpxor a1, a1, a2 + PRORD a1, 17 + vpsrld a0, a0, 3 + 
vpxor %%T1, %%T1, a0 + vpsrld a2, a2, 10 + vpxor a1, a1, a2 + vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + %%data] + vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + %%data] + vpaddd %%T1, %%T1, a1 + + ROUND_00_15_W %%T1, %%i, %%data +%endm + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp | + +align 32 + +;void mh_sha256_block_avx(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, including segment digests (uint32_t digests[16][8]) +; arg 2 pointer to aligned_frame_buffer, which is used to hold the big-endian data. +; arg 3 number of 1KB blocks +; +mk_global mh_sha256_block_avx, function, internal +func(mh_sha256_block_avx) + endbranch + FUNC_SAVE + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 16 Bytes needed by avx + and rsp, ~0x0F + lea TBL,[TABLE] + + %assign I 0 ; copy segs_digests into stack + %rep 8 + VMOVPS a, [mh_digests_p + I*64 + 16*0] + VMOVPS b, [mh_digests_p + I*64 + 16*1] + VMOVPS c, [mh_digests_p + I*64 + 16*2] + VMOVPS d, [mh_digests_p + I*64 + 16*3] + + vmovdqa [rsp + I*64 + 16*0], a + vmovdqa [rsp + I*64 + 16*1], b + vmovdqa [rsp + I*64 + 16*2], c + vmovdqa [rsp + I*64 + 16*3], d + %assign I (I+1) + %endrep + +.block_loop: + ;transform to big-endian data and store on aligned_frame + vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + ;transform input data from DWORD*16_SEGS*8 to DWORD*4_SEGS*8*4 + %assign I 0 + %rep 16 + VMOVPS TT0,[mh_in_p + I*64+0*16] + VMOVPS TT1,[mh_in_p + I*64+1*16] + VMOVPS TT2,[mh_in_p + I*64+2*16] + VMOVPS TT3,[mh_in_p + I*64+3*16] + + vpshufb TT0, TMP + vmovdqa [mh_data_p +(I)*16 +0*256],TT0 + vpshufb TT1, TMP + vmovdqa [mh_data_p +(I)*16 +1*256],TT1 + vpshufb TT2, TMP + vmovdqa [mh_data_p +(I)*16 +2*256],TT2 + vpshufb TT3, TMP + vmovdqa [mh_data_p +(I)*16 +3*256],TT3 + %assign I (I+1) + %endrep + + mov mh_segs, 0 ;start from the first 4 segments + mov pref, 1024 ;avoid prefetching repeatedly + .segs_loop: + xor ROUND, ROUND + ;; Initialize digests + vmovdqa a, [rsp + 0*64 + mh_segs] + vmovdqa b, [rsp + 1*64 + mh_segs] + vmovdqa c, [rsp + 2*64 + mh_segs] + vmovdqa d, [rsp + 3*64 + mh_segs] + vmovdqa e, [rsp + 4*64 + mh_segs] + vmovdqa f, [rsp + 5*64 + mh_segs] + vmovdqa g, [rsp + 6*64 + mh_segs] + vmovdqa h, [rsp + 7*64 + mh_segs] + + %assign i 0 + %rep 4 + ROUND_00_15_R TT0, (i*4+0), mh_data_p + ROUND_00_15_R TT1, (i*4+1), mh_data_p + ROUND_00_15_R TT2, (i*4+2), mh_data_p + ROUND_00_15_R TT3, (i*4+3), mh_data_p + %assign i (i+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*0] + + %assign i 16 + %rep 48 + %if i = 48 + PREFETCH_X [mh_in_p + pref+128*1] + %endif + ROUND_16_XX T1, i, mh_data_p + %assign i (i+1) + %endrep + + ;; add old digest + vpaddd a, a, [rsp + 0*64 + mh_segs] + vpaddd b, b, [rsp + 1*64 + mh_segs] + vpaddd c, c, [rsp + 2*64 + mh_segs] + vpaddd d, d, [rsp + 3*64 + mh_segs] + vpaddd e, e, [rsp + 4*64 + mh_segs] + vpaddd f, f, [rsp + 5*64 + mh_segs] + vpaddd g, g, [rsp + 6*64 + mh_segs] + vpaddd h, h, [rsp + 7*64 + mh_segs] + + ; write out digests + vmovdqa [rsp + 0*64 + mh_segs], a + vmovdqa [rsp + 1*64 + mh_segs], b + vmovdqa [rsp + 2*64 + mh_segs], c + vmovdqa [rsp + 3*64 + mh_segs], d + vmovdqa [rsp + 4*64 +
mh_segs], e + vmovdqa [rsp + 5*64 + mh_segs], f + vmovdqa [rsp + 6*64 + mh_segs], g + vmovdqa [rsp + 7*64 + mh_segs], h + + add pref, 256 + add mh_data_p, 256 + add mh_segs, 16 + cmp mh_segs, 64 + jc .segs_loop + + sub mh_data_p, (1024) + add mh_in_p, (1024) + sub loops, 1 + jne .block_loop + + %assign I 0 ; copy segs_digests back to mh_digests_p + %rep 8 + vmovdqa a, [rsp + I*64 + 16*0] + vmovdqa b, [rsp + I*64 + 16*1] + vmovdqa c, [rsp + I*64 + 16*2] + vmovdqa d, [rsp + I*64 + 16*3] + + VMOVPS [mh_digests_p + I*64 + 16*0], a + VMOVPS [mh_digests_p + I*64 + 16*1], b + VMOVPS [mh_digests_p + I*64 + 16*2], c + VMOVPS [mh_digests_p + I*64 + 16*3], d + %assign I (I+1) + %endrep + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .data align=64 + +align 64 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 
0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm new file mode 100644 index 000000000..c2b3f2c59 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm @@ -0,0 +1,616 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA256 using AVX-2 +;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8 + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10 + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg3 +;variables of mh_sha256 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp2 +%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS + +; Common definitions +%define ROUND tmp4 +%define TBL tmp5 + +%define pref tmp3 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define VMOVPS vmovups + +%define SZ 4 +%define SZ8 8*SZ +%define ROUNDS 64*SZ8 + +%define a ymm0 +%define b ymm1 +%define c ymm2 +%define d ymm3 +%define e ymm4 +%define f ymm5 +%define g ymm6 +%define h ymm7 + +%define a0 ymm8 +%define a1 ymm9 +%define a2 ymm10 + +%define TT0 ymm14 +%define TT1 ymm13 +%define TT2 ymm12 +%define TT3 ymm11 +%define TT4 ymm10 +%define TT5 ymm9 + +%define T1 ymm14 +%define TMP ymm15 + 
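+; Note: each ymm register holds one SHA-256 working variable for 8 of the
+; 16 interleaved segments, so .segs_loop below makes two passes per 1KB
+; block (mh_segs = 0 then 32, the byte offsets of segments 0-7 and 8-15
+; within each 64-byte row of segs_digests).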
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpslld %%tmp, %%reg, (32-(%%imm)) + vpsrld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORD_nd reg, imm, tmp, src +%macro PRORD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpslld %%tmp, %%src, (32-(%%imm)) + vpsrld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORD dst/src, amt +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +; PRORD_nd dst, src, amt +%macro PRORD_nd 3 + PRORD_nd %1, %3, TMP, %2 +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15_R 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa %%T1, [SZ8*(%%i&0xf) + %%data] + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ8 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15_W 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa [SZ8*(%%i&0xf) + %%data], %%T1 + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ8 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + %%data] + vmovdqa a1, [SZ8*((%%i-2)&0xf) + %%data] + vmovdqa a0, %%T1 + PRORD %%T1, 18-7 + vmovdqa a2, a1 + PRORD a1, 19-17 + vpxor %%T1, %%T1, a0 + PRORD %%T1, 7 + vpxor a1, a1, a2 + PRORD a1, 17 + 
vpsrld a0, a0, 3 + vpxor %%T1, %%T1, a0 + vpsrld a2, a2, 10 + vpxor a1, a1, a2 + vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + %%data] + vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + %%data] + vpaddd %%T1, %%T1, a1 + + ROUND_00_15_W %%T1, %%i, %%data +%endm + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp | + +align 32 + +;void mh_sha256_block_avx2(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, including segment digests (uint32_t digests[16][8]) +; arg 2 pointer to aligned_frame_buffer, which is used to hold the big-endian data. +; arg 3 number of 1KB blocks +; +mk_global mh_sha256_block_avx2, function, internal +func(mh_sha256_block_avx2) + endbranch + FUNC_SAVE + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 32 Bytes needed by avx2 + and rsp, ~0x1F + lea TBL,[TABLE] + + %assign I 0 ; copy segs_digests into stack + %rep 4 + VMOVPS a, [mh_digests_p + I*64*2 + 32*0] + VMOVPS b, [mh_digests_p + I*64*2 + 32*1] + VMOVPS c, [mh_digests_p + I*64*2 + 32*2] + VMOVPS d, [mh_digests_p + I*64*2 + 32*3] + + vmovdqa [rsp + I*64*2 + 32*0], a + vmovdqa [rsp + I*64*2 + 32*1], b + vmovdqa [rsp + I*64*2 + 32*2], c + vmovdqa [rsp + I*64*2 + 32*3], d + %assign I (I+1) + %endrep + +.block_loop: + ;transform to big-endian data and store on aligned_frame + vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + ;transform input data from DWORD*16_SEGS*8 to DWORD*8_SEGS*8*2 + %assign I 0 + %rep 16 + VMOVPS TT0,[mh_in_p + I*64+0*32] + VMOVPS TT1,[mh_in_p + I*64+1*32] + + vpshufb TT0, TT0, TMP + vmovdqa [mh_data_p +I*32 +0*512],TT0 + vpshufb TT1, TT1, TMP + vmovdqa [mh_data_p +I*32 +1*512],TT1 + %assign I (I+1) + %endrep + + mov mh_segs, 0 ;start from the first 8 segments + mov pref, 1024 ;avoid prefetching repeatedly + .segs_loop: + xor ROUND, ROUND + ;; Initialize digests + vmovdqa a, [rsp + 0*64 + mh_segs] + vmovdqa b, [rsp + 1*64 + mh_segs] + vmovdqa c, [rsp + 2*64 + mh_segs] + vmovdqa d, [rsp + 3*64 + mh_segs] + vmovdqa e, [rsp + 4*64 + mh_segs] + vmovdqa f, [rsp + 5*64 + mh_segs] + vmovdqa g, [rsp + 6*64 + mh_segs] + vmovdqa h, [rsp + 7*64 + mh_segs] + + %assign i 0 + %rep 4 + ROUND_00_15_R TT0, (i*4+0), mh_data_p + ROUND_00_15_R TT1, (i*4+1), mh_data_p + ROUND_00_15_R TT2, (i*4+2), mh_data_p + ROUND_00_15_R TT3, (i*4+3), mh_data_p + %assign i (i+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*0] + + %assign i 16 + %rep 48 + ROUND_16_XX T1, i, mh_data_p + %if i % 16 = 8 + PREFETCH_X [mh_in_p + pref+128*(i/16)] + %endif + %assign i (i+1) + %endrep + + ;; add old digest + vpaddd a, a, [rsp + 0*64 + mh_segs] + vpaddd b, b, [rsp + 1*64 + mh_segs] + vpaddd c, c, [rsp + 2*64 + mh_segs] + vpaddd d, d, [rsp + 3*64 + mh_segs] + vpaddd e, e, [rsp + 4*64 + mh_segs] + vpaddd f, f, [rsp + 5*64 + mh_segs] + vpaddd g, g, [rsp + 6*64 + mh_segs] + vpaddd h, h, [rsp + 7*64 + mh_segs] + + ; write out digests + vmovdqa [rsp + 0*64 + mh_segs], a + vmovdqa [rsp + 1*64 + mh_segs], b + vmovdqa [rsp + 2*64 + mh_segs], c + vmovdqa [rsp + 3*64 + mh_segs], d + vmovdqa [rsp + 4*64 + mh_segs], e + vmovdqa [rsp + 5*64 + mh_segs], f + vmovdqa [rsp + 6*64 + mh_segs], g + vmovdqa [rsp + 7*64 + mh_segs], h + + add pref, 512 +
add mh_data_p, 512 + add mh_segs, 32 + cmp mh_segs, 64 + jc .segs_loop + + sub mh_data_p, (1024) + add mh_in_p, (1024) + sub loops, 1 + jne .block_loop + + %assign I 0 ; copy segs_digests back to mh_digests_p + %rep 4 + vmovdqa a, [rsp + I*64*2 + 32*0] + vmovdqa b, [rsp + I*64*2 + 32*1] + vmovdqa c, [rsp + I*64*2 + 32*2] + vmovdqa d, [rsp + I*64*2 + 32*3] + + VMOVPS [mh_digests_p + I*64*2 + 32*0], a + VMOVPS [mh_digests_p + I*64*2 + 32*1], b + VMOVPS [mh_digests_p + I*64*2 + 32*2], c + VMOVPS [mh_digests_p + I*64*2 + 32*3], d + %assign I (I+1) + %endrep + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .data align=64 + +align 64 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 
0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm new file mode 100644 index 000000000..1ee76ddfc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm @@ -0,0 +1,682 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA256 using AVX-512 +;; + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8 + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10 + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + end_prolog + 
%endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg3 +;variables of mh_sha256 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp2 +%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS +; Common definitions +%define ROUND tmp4 +%define TBL tmp5 + +%define pref tmp3 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define VMOVPS vmovups + +%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define E zmm4 +%define F zmm5 +%define G zmm6 +%define H zmm7 +%define T1 zmm8 +%define TMP0 zmm9 +%define TMP1 zmm10 +%define TMP2 zmm11 +%define TMP3 zmm12 +%define TMP4 zmm13 +%define TMP5 zmm14 +%define TMP6 zmm15 + +%define W0 zmm16 +%define W1 zmm17 +%define W2 zmm18 +%define W3 zmm19 +%define W4 zmm20 +%define W5 zmm21 +%define W6 zmm22 +%define W7 zmm23 +%define W8 zmm24 +%define W9 zmm25 +%define W10 zmm26 +%define W11 zmm27 +%define W12 zmm28 +%define W13 zmm29 +%define W14 zmm30 +%define W15 zmm31 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro ROTATE_ARGS 0 +%xdefine TMP_ H +%xdefine H G +%xdefine G F +%xdefine F E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%define APPEND(a,b) a %+ b +;; CH(A, B, C) = (A&B) ^ (~A&C) +;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G) +;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22 +;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25 +;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3 +;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10 + +; Main processing loop per round +%macro PROCESS_LOOP 2 +%define %%WT %1 +%define %%ROUND %2 + ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt + ;; T2 = SIGMA0(A) + MAJ(A, B, C) + ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2 + + ;; H becomes T2, then add T1 for A + ;; D becomes D + T1 for E + + vpaddd T1, H, TMP3 ; T1 = H + Kt + vmovdqa32 TMP0, E + vprord TMP1, E, 6 ; ROR_6(E) + vprord TMP2, E, 11 ; ROR_11(E) + vprord TMP3, E, 25 ; ROR_25(E) + vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G) + vpaddd T1, T1, %%WT ; T1 = T1 + Wt + vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E) + vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G) + vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E) + vpaddd D, D, T1 ; D = D + T1 + + vprord H, A, 2 ; ROR_2(A) + vprord TMP2, A, 13 ; ROR_13(A) + vprord TMP3, A, 22 ; ROR_22(A) + vmovdqa32 TMP0, A + vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C) + vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A) + vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C) + vpaddd H, H, T1 ; H(A) = H(T2) + T1 + + vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt + + ;; Rotate the args A-H (rotation of names associated with regs) + ROTATE_ARGS +%endmacro + +%macro MSG_SCHED_ROUND_16_63 4 +%define %%WT %1 +%define %%WTp1 %2 +%define %%WTp9 %3 +%define %%WTp14 %4 + vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2) + vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2) + vpsrld TMP6, 
%%WTp14, 10 ; SHR_10(Wt-2) + vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2) + + vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7 + + vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15) + vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15) + vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15) + vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15) + + vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + + ; Wt-7 + sigma0(Wt-15) + +%endmacro + +; Note: this reads in a block of data for one lane; when all 16 are read, +; the data must be transposed to build the msg schedule. This macro is not +; referenced in this file: .block_loop below loads and byte-swaps W0-W15 +; directly into registers. +%macro MSG_SCHED_ROUND_00_15 2 +%define %%WT %1 +%define %%OFFSET %2 + mov inp0, [IN + (%%OFFSET*8)] + vmovups %%WT, [inp0+IDX] +%endmacro + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp | + +[bits 64] +section .text +align 32 + +;void mh_sha256_block_avx512(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, including segment digests (uint32_t digests[16][8]) +; arg 2 pointer to aligned_frame_buffer, which is used to hold the big-endian data. +; arg 3 number of 1KB blocks +; +global mh_sha256_block_avx512 +func(mh_sha256_block_avx512) + endbranch + FUNC_SAVE + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 64 Bytes needed by avx512 + and rsp, ~0x3F + lea TBL,[TABLE] + + ; copy segs_digests into stack and ZMM + VMOVPS A, [mh_digests_p + 64*0] + VMOVPS B, [mh_digests_p + 64*1] + VMOVPS C, [mh_digests_p + 64*2] + VMOVPS D, [mh_digests_p + 64*3] + VMOVPS E, [mh_digests_p + 64*4] + VMOVPS F, [mh_digests_p + 64*5] + VMOVPS G, [mh_digests_p + 64*6] + VMOVPS H, [mh_digests_p + 64*7] + +.block_loop: + ; Save digests for later addition + vmovdqa32 [rsp + 64*0], A + vmovdqa32 [rsp + 64*1], B + vmovdqa32 [rsp + 64*2], C + vmovdqa32 [rsp + 64*3], D + vmovdqa32 [rsp + 64*4], E + vmovdqa32 [rsp + 64*5], F + vmovdqa32 [rsp + 64*6], G + vmovdqa32 [rsp + 64*7], H + + vmovdqa32 TMP3, [TBL] ; First K + ;transform to big-endian data and store on aligned_frame + vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK] + ;using the extra 16 ZMM registers instead of the heap +%assign I 0 +%rep 8 +%assign J (I+1) + VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64] + VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64] + + vpshufb APPEND(W,I), APPEND(W,I), TMP2 + vpshufb APPEND(W,J), APPEND(W,J), TMP2 +%assign I (I+2) +%endrep + + ; MSG Schedule for W0-W15 is now complete in registers + ; Process all 64 rounds below; during the first 48, also calculate the + ; next Wt+16 once the current Wt is no longer needed + + ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M) + +%assign I 0 +%assign J 0 +%assign K 1 +%assign L 9 +%assign M 14 +%rep 64 + PROCESS_LOOP APPEND(W,J), I + %if I < 48 + MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) + %endif + %if I % 8 = 4 + PREFETCH_X [mh_in_p + 1024+128*(I / 8)] + %endif +%assign I (I+1) +%assign J ((J+1)% 16) +%assign K ((K+1)% 16) +%assign L ((L+1)% 16) +%assign M ((M+1)% 16) +%endrep + + ;; add old digest + vpaddd A, A, [rsp + 0*64] + vpaddd B, B, [rsp + 1*64] + vpaddd C, C, [rsp + 2*64] + vpaddd D, D, [rsp + 3*64] + vpaddd E, E, [rsp + 4*64] +
vpaddd F, F, [rsp + 5*64] + vpaddd G, G, [rsp + 6*64] + vpaddd H, H, [rsp + 7*64] + + add mh_in_p, 1024 + sub loops, 1 + jne .block_loop + + ; copy segs_digests back to mh_digests_p + + VMOVPS [mh_digests_p + 64*0], A + VMOVPS [mh_digests_p + 64*1], B + VMOVPS [mh_digests_p + 64*2], C + VMOVPS [mh_digests_p + 64*3], D + VMOVPS [mh_digests_p + 64*4], E + VMOVPS [mh_digests_p + 64*5], F + VMOVPS [mh_digests_p + 64*6], G + VMOVPS [mh_digests_p + 64*7], H + + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .data +align 64 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 
0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 
0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 
0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + + +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_mh_sha256_block_avx512 +no_mh_sha256_block_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 + diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c new file mode 100644 index 000000000..8d9a828c6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c @@ -0,0 +1,188 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "mh_sha256_internal.h" +#include <string.h> + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +// Base multi-hash SHA256 Functions +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +// store_w is only used for step 0 ~ 15 +#define store_w(s, i, w, ww) (w[i][s] = to_be32(ww[i*HASH_SEGS+s])) +#define Ws(x, s) w[(x) & 15][s] +// update_w is used for step > 15 +#define update_w(s, i, w) \ + Ws(i, s) = Ws(i-16, s) + S0(Ws(i-15, s)) + Ws(i-7, s) + S1(Ws(i-2, s)) +#define update_t2(s, a, b, c) t2[s] = s0(a[s]) + maj(a[s],b[s],c[s]) +#define update_t1(s, h, e, f, g, i, k) \ + t1[s] = h[s] + s1(e[s]) + ch(e[s],f[s],g[s]) + k + Ws(i, s); +#define update_d(s) d[s] += t1[s] +#define update_h(s) h[s] = t1[s] + t2[s] + +// s is an iterator +#define STORE_W(s, i, w, ww) \ + for(s = 0; s < HASH_SEGS; s++) \ + store_w(s, i, w, ww); +#define UPDATE_W(s, i, w) \ + for(s = 0; s < HASH_SEGS; s++) \ + update_w(s, i, w); +#define UPDATE_T2(s, a, b, c) \ + for(s = 0; s < HASH_SEGS; s++) \ + update_t2(s, a, b, c); +#define UPDATE_T1(s, h, e, f, g, i, k) \ + for(s = 0; s < HASH_SEGS; s++) \ + update_t1(s, h, e, f, g, i, k); +#define UPDATE_D(s) \ + for(s = 0; s < HASH_SEGS; s++) \ + update_d(s); +#define UPDATE_H(s) \ + for(s = 0; s < HASH_SEGS; s++) \ + update_h(s); + +static inline void step(int i, uint32_t * a, uint32_t * b, uint32_t * c, + uint32_t * d, uint32_t * e, uint32_t * f, + uint32_t * g, uint32_t * h, uint32_t k, + uint32_t * t1, uint32_t * t2, uint32_t(*w)[HASH_SEGS], uint32_t * ww) +{ + uint8_t s; + if (i < 16) { + STORE_W(s, i, w, ww); + } else { + UPDATE_W(s, i, w); + } + UPDATE_T2(s, a, b, c); + UPDATE_T1(s, h, e, f, g, i, k); + UPDATE_D(s); + UPDATE_H(s); +} + +static inline void init_abcdefgh(uint32_t * xx, uint32_t n, + uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS]) +{ + uint8_t s; + for (s = 0; s < HASH_SEGS; s++) + xx[s] = digests[n][s]; +} + +static inline void add_abcdefgh(uint32_t * xx, uint32_t n, + uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS]) +{ + uint8_t s; + for (s = 0; s < HASH_SEGS; s++) + digests[n][s] += xx[s]; +} + +/* + * API to perform the 64 steps of the multi-hash algorithm for + * a single block of data. The caller is responsible for providing + * a full block of input. + * + * Arguments: + * input - the pointer to the data + * digests - the space to hold the digests for all segments.
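+ * frame_buffer - the scratch space that holds the interleaved,
+ * big-endian message schedule w[16][HASH_SEGS] for the block.
+ *
+ * A minimal calling sketch (caller-owned buffers; names are for
+ * illustration only, not part of the API):
+ *
+ * uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS]; // seeded per segment
+ * uint8_t frame[MH_SHA256_BLOCK_SIZE];
+ * mh_sha256_single(block_1kb, digests, frame); // consumes one 1KB block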
+ * + * Return: + * N/A + */ +void mh_sha256_single(const uint8_t * input, uint32_t(*digests)[HASH_SEGS], + uint8_t * frame_buffer) +{ + uint8_t i; + uint32_t aa[HASH_SEGS], bb[HASH_SEGS], cc[HASH_SEGS], dd[HASH_SEGS]; + uint32_t ee[HASH_SEGS], ff[HASH_SEGS], gg[HASH_SEGS], hh[HASH_SEGS]; + uint32_t t1[HASH_SEGS], t2[HASH_SEGS]; + uint32_t *ww = (uint32_t *) input; + uint32_t(*w)[HASH_SEGS]; + + const static uint32_t k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + }; + + w = (uint32_t(*)[HASH_SEGS]) frame_buffer; + + init_abcdefgh(aa, 0, digests); + init_abcdefgh(bb, 1, digests); + init_abcdefgh(cc, 2, digests); + init_abcdefgh(dd, 3, digests); + init_abcdefgh(ee, 4, digests); + init_abcdefgh(ff, 5, digests); + init_abcdefgh(gg, 6, digests); + init_abcdefgh(hh, 7, digests); + + for (i = 0; i < 64; i += 8) { + step(i, aa, bb, cc, dd, ee, ff, gg, hh, k[i], t1, t2, w, ww); + step(i + 1, hh, aa, bb, cc, dd, ee, ff, gg, k[i + 1], t1, t2, w, ww); + step(i + 2, gg, hh, aa, bb, cc, dd, ee, ff, k[i + 2], t1, t2, w, ww); + step(i + 3, ff, gg, hh, aa, bb, cc, dd, ee, k[i + 3], t1, t2, w, ww); + step(i + 4, ee, ff, gg, hh, aa, bb, cc, dd, k[i + 4], t1, t2, w, ww); + step(i + 5, dd, ee, ff, gg, hh, aa, bb, cc, k[i + 5], t1, t2, w, ww); + step(i + 6, cc, dd, ee, ff, gg, hh, aa, bb, k[i + 6], t1, t2, w, ww); + step(i + 7, bb, cc, dd, ee, ff, gg, hh, aa, k[i + 7], t1, t2, w, ww); + } + + add_abcdefgh(aa, 0, digests); + add_abcdefgh(bb, 1, digests); + add_abcdefgh(cc, 2, digests); + add_abcdefgh(dd, 3, digests); + add_abcdefgh(ee, 4, digests); + add_abcdefgh(ff, 5, digests); + add_abcdefgh(gg, 6, digests); + add_abcdefgh(hh, 7, digests); +} + +void mh_sha256_block_base(const uint8_t * input_data, + uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks) +{ + uint32_t i; + + for (i = 0; i < num_blocks; i++) { + mh_sha256_single(input_data, digests, frame_buffer); + input_data += MH_SHA256_BLOCK_SIZE; + } + + return; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm new file mode 100644 index 000000000..b1d6fd9ea --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm @@ -0,0 +1,557 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
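A note on the unrolled loop in mh_sha256_single above: passing the eight working variables in a rotated order each round (aa..hh, then hh..gg, and so on) avoids copying state between rounds; the same effect can be expressed by indexing one state array modulo 8. The following small C sketch is illustrative only and not part of the library:

    #include <stdint.h>
    #include <stdio.h>

    /* At round i, working variable x (0=a .. 7=h) lives at index (x - i) mod 8.
     * This reproduces the rotated argument pattern of the unrolled loop above. */
    static int rot_idx(int x, int i)
    {
            return (x + 64 - i) % 8;    /* 64 rounds, so i <= 64 */
    }

    int main(void)
    {
            /* Round 1 of the unrolled loop is step(i+1, hh, aa, ..., gg):
             * 'a' is slot 7 (hh) and 'h' is slot 6 (gg). */
            printf("round 1: a -> slot %d, h -> slot %d\n",
                   rot_idx(0, 1), rot_idx(7, 1));   /* prints 7 and 6 */
            return 0;
    }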
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; code to compute 16 SHA256 using SSE +;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + + %define arg4 r8 + %define arg5 r9 + + %define tmp1 r10 + %define tmp2 r11 + %define tmp3 r12 ; must be saved and restored + %define tmp4 r13 ; must be saved and restored + %define tmp5 r14 ; must be saved and restored + %define tmp6 r15 ; must be saved and restored + %define return rax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 + %endmacro + %macro FUNC_RESTORE 0 + pop r15 + pop r14 + pop r13 + pop r12 + %endmacro +%else + ; Windows + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + + %define arg4 r10 + %define arg5 r11 + %define tmp1 r12 ; must be saved and restored + %define tmp2 r13 ; must be saved and restored + %define tmp3 r14 ; must be saved and restored + %define tmp4 r15 ; must be saved and restored + %define tmp5 rdi ; must be saved and restored + %define tmp6 rsi ; must be saved and restored + %define return rax + + %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm12, 6*16 + save_xmm128 xmm13, 7*16 + save_xmm128 xmm14, 8*16 + save_xmm128 xmm15, 9*16 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + save_reg rsi, 10*16 + 5*8 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm12, [rsp + 6*16] + movdqa xmm13, [rsp + 7*16] + movdqa xmm14, [rsp + 8*16] + movdqa xmm15, [rsp + 9*16] + mov 
r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + mov rsi, [rsp + 10*16 + 5*8] + add rsp, stack_size + %endmacro +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define loops arg3 +;variables of mh_sha256 +%define mh_in_p arg0 +%define mh_digests_p arg1 +%define mh_data_p arg2 +%define mh_segs tmp1 +;variables used by storing segs_digests on stack +%define RSP_SAVE tmp2 +%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS + +; Common definitions +%define ROUND tmp4 +%define TBL tmp5 + +%define pref tmp3 +%macro PREFETCH_X 1 +%define %%mem %1 + prefetchnta %%mem +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define MOVPS movups + +%define SZ 4 +%define SZ4 4*SZ +%define ROUNDS 64*SZ4 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psrld %%reg, %%imm + pslld %%tmp, (32-(%%imm)) + por %%reg, %%tmp +%endmacro + +; PRORD dst/src, amt +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15_R 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + movdqa a0, e ; sig1: a0 = e + movdqa a1, e ; sig1: s1 = e + PRORD a0, (11-6) ; sig1: a0 = (e >> 5) + + movdqa a2, f ; ch: a2 = f + pxor a2, g ; ch: a2 = f^g + pand a2, e ; ch: a2 = (f^g)&e + pxor a2, g ; a2 = ch + + PRORD a1, 25 ; sig1: a1 = (e >> 25) + movdqa %%T1,[SZ4*(%%i&0xf) + %%data] + paddd %%T1,[TBL + ROUND] ; T1 = W + K + pxor a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + paddd h, a2 ; h = h + ch + movdqa a2, a ; sig0: a2 = a + PRORD a2, (13-2) ; sig0: a2 = (a >> 11) + paddd h, %%T1 ; h = h + ch + W + K + pxor a0, a1 ; a0 = sigma1 + movdqa a1, a ; sig0: a1 = a + movdqa %%T1, a ; maj: T1 = a + PRORD a1, 22 ; sig0: a1 = (a >> 22) + pxor %%T1, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + pand %%T1, b ; maj: T1 = (a^c)&b + paddd h, a0 + + paddd d, h + + pxor a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + pxor a2, a1 ; a2 = sig0 + movdqa a1, a ; maj: a1 = a + pand a1, c ; maj: a1 = a&c + por a1, %%T1 ; a1 = maj + paddd h, a1 ; h = h + ch + W + K + maj + paddd h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15_W 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + movdqa a0, e ; sig1: a0 = e + movdqa a1, e ; sig1: s1 = e + PRORD a0, (11-6) ; sig1: a0 = (e >> 5) + + movdqa a2, f ; ch: a2 = f + pxor a2, g ; ch: a2 = f^g + pand a2, e ; ch: a2 = (f^g)&e + pxor a2, g ; a2 = ch + + PRORD a1, 25 ; sig1: a1 = (e >> 25) + movdqa [SZ4*(%%i&0xf) + %%data], %%T1 + paddd %%T1,[TBL + ROUND] ; T1 = W + K + pxor a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + paddd h, a2 ; h = h 
+ ch + movdqa a2, a ; sig0: a2 = a + PRORD a2, (13-2) ; sig0: a2 = (a >> 11) + paddd h, %%T1 ; h = h + ch + W + K + pxor a0, a1 ; a0 = sigma1 + movdqa a1, a ; sig0: a1 = a + movdqa %%T1, a ; maj: T1 = a + PRORD a1, 22 ; sig0: a1 = (a >> 22) + pxor %%T1, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + pand %%T1, b ; maj: T1 = (a^c)&b + paddd h, a0 + + paddd d, h + + pxor a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + pxor a2, a1 ; a2 = sig0 + movdqa a1, a ; maj: a1 = a + pand a1, c ; maj: a1 = a&c + por a1, %%T1 ; a1 = maj + paddd h, a1 ; h = h + ch + W + K + maj + paddd h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 3 +%define %%T1 %1 +%define %%i %2 +%define %%data %3 + + movdqa %%T1, [SZ4*((%%i-15)&0xf) + %%data] + movdqa a1, [SZ4*((%%i-2)&0xf) + %%data] + movdqa a0, %%T1 + PRORD %%T1, 18-7 + movdqa a2, a1 + PRORD a1, 19-17 + pxor %%T1, a0 + PRORD %%T1, 7 + pxor a1, a2 + PRORD a1, 17 + psrld a0, 3 + pxor %%T1, a0 + psrld a2, 10 + pxor a1, a2 + paddd %%T1, [SZ4*((%%i-16)&0xf) + %%data] + paddd a1, [SZ4*((%%i-7)&0xf) + %%data] + paddd %%T1, a1 + + ROUND_00_15_W %%T1, %%i, %%data + +%endm + +;init hash digests +; segs_digests:low addr-> high_addr +; a | b | c | ...| p | (16) +; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap | +; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp | +; .... +; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp | + +align 32 + +;void mh_sha256_block_sse(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], +; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); +; arg 0 pointer to input data +; arg 1 pointer to digests, include segments digests(uint32_t digests[16][8]) +; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data. 
+; arg 3 number of 1KB blocks +; +mk_global mh_sha256_block_sse, function, internal +func(mh_sha256_block_sse) + endbranch + FUNC_SAVE + ; save rsp + mov RSP_SAVE, rsp + + cmp loops, 0 + jle .return + + ; leave enough space to store segs_digests + sub rsp, FRAMESZ + ; align rsp to 16 Bytes needed by sse + and rsp, ~0x0F + lea TBL,[TABLE] + + %assign I 0 ; copy segs_digests into stack + %rep 8 + MOVPS a, [mh_digests_p + I*64 + 16*0] + MOVPS b, [mh_digests_p + I*64 + 16*1] + MOVPS c, [mh_digests_p + I*64 + 16*2] + MOVPS d, [mh_digests_p + I*64 + 16*3] + + movdqa [rsp + I*64 + 16*0], a + movdqa [rsp + I*64 + 16*1], b + movdqa [rsp + I*64 + 16*2], c + movdqa [rsp + I*64 + 16*3], d + %assign I (I+1) + %endrep + +.block_loop: + ;transform to big-endian data and store on aligned_frame + movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + ;transform input data from DWORD*16_SEGS*8 to DWORD*4_SEGS*8*4 + %assign I 0 + %rep 16 + MOVPS TT0,[mh_in_p + I*64+0*16] + MOVPS TT1,[mh_in_p + I*64+1*16] + MOVPS TT2,[mh_in_p + I*64+2*16] + MOVPS TT3,[mh_in_p + I*64+3*16] + + pshufb TT0, TMP + movdqa [mh_data_p +(I)*16 +0*256],TT0 + pshufb TT1, TMP + movdqa [mh_data_p +(I)*16 +1*256],TT1 + pshufb TT2, TMP + movdqa [mh_data_p +(I)*16 +2*256],TT2 + pshufb TT3, TMP + movdqa [mh_data_p +(I)*16 +3*256],TT3 + %assign I (I+1) + %endrep + + mov mh_segs, 0 ;start from the first 4 segments + mov pref, 1024 ;avoid prefetch repeadtedly + .segs_loop: + xor ROUND, ROUND + ;; Initialize digests + movdqa a, [rsp + 0*64 + mh_segs] + movdqa b, [rsp + 1*64 + mh_segs] + movdqa c, [rsp + 2*64 + mh_segs] + movdqa d, [rsp + 3*64 + mh_segs] + movdqa e, [rsp + 4*64 + mh_segs] + movdqa f, [rsp + 5*64 + mh_segs] + movdqa g, [rsp + 6*64 + mh_segs] + movdqa h, [rsp + 7*64 + mh_segs] + + %assign i 0 + %rep 4 + ROUND_00_15_R TT0, (i*4+0), mh_data_p + ROUND_00_15_R TT1, (i*4+1), mh_data_p + ROUND_00_15_R TT2, (i*4+2), mh_data_p + ROUND_00_15_R TT3, (i*4+3), mh_data_p + %assign i (i+1) + %endrep + PREFETCH_X [mh_in_p + pref+128*0] + + %assign i 16 + %rep 48 + %if i = 48 + PREFETCH_X [mh_in_p + pref+128*1] + %endif + ROUND_16_XX T1, i, mh_data_p + %assign i (i+1) + %endrep + + ;; add old digest + paddd a, [rsp + 0*64 + mh_segs] + paddd b, [rsp + 1*64 + mh_segs] + paddd c, [rsp + 2*64 + mh_segs] + paddd d, [rsp + 3*64 + mh_segs] + paddd e, [rsp + 4*64 + mh_segs] + paddd f, [rsp + 5*64 + mh_segs] + paddd g, [rsp + 6*64 + mh_segs] + paddd h, [rsp + 7*64 + mh_segs] + + ; write out digests + movdqa [rsp + 0*64 + mh_segs], a + movdqa [rsp + 1*64 + mh_segs], b + movdqa [rsp + 2*64 + mh_segs], c + movdqa [rsp + 3*64 + mh_segs], d + movdqa [rsp + 4*64 + mh_segs], e + movdqa [rsp + 5*64 + mh_segs], f + movdqa [rsp + 6*64 + mh_segs], g + movdqa [rsp + 7*64 + mh_segs], h + + add pref, 256 + add mh_data_p, 256 + add mh_segs, 16 + cmp mh_segs, 64 + jc .segs_loop + + sub mh_data_p, (1024) + add mh_in_p, (1024) + sub loops, 1 + jne .block_loop + + %assign I 0 ; copy segs_digests back to mh_digests_p + %rep 8 + movdqa a, [rsp + I*64 + 16*0] + movdqa b, [rsp + I*64 + 16*1] + movdqa c, [rsp + I*64 + 16*2] + movdqa d, [rsp + I*64 + 16*3] + + MOVPS [mh_digests_p + I*64 + 16*0], a + MOVPS [mh_digests_p + I*64 + 16*1], b + MOVPS [mh_digests_p + I*64 + 16*2], c + MOVPS [mh_digests_p + I*64 + 16*3], d + %assign I (I+1) + %endrep + mov rsp, RSP_SAVE ; restore rsp + +.return: + FUNC_RESTORE + ret + +endproc_frame + +section .data align=16 + +align 16 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 
0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c new file mode 100644 index 000000000..6abb20688 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c @@ -0,0 +1,121 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
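Before the finalize code below, a note on the PSHUFFLE_BYTE_FLIP_MASK that closes the SSE file above: read as pshufb indices, its bytes (03 02 01 00 07 06 05 04 ...) reverse each 4-byte group, i.e. a per-dword byte swap from little-endian input to the big-endian words SHA-256 expects. A scalar C equivalent, shown only for illustration:

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar equivalent of pshufb with the byte-flip mask above:
     * swap the four bytes of each 32-bit word. */
    static inline uint32_t bswap32(uint32_t x)
    {
            return (x >> 24) | ((x >> 8) & 0x0000ff00u) |
                   ((x << 8) & 0x00ff0000u) | (x << 24);
    }

    int main(void)
    {
            printf("%08x\n", bswap32(0x01020304u));   /* prints 04030201 */
            return 0;
    }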
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +/* + * mh_sha256_finalize_base.c contains the prototypes of mh_sha256_finalize_XXX + * and mh_sha256_tail_XXX. Default definitions are base type which generates + * mh_sha256_finalize_base and mh_sha256_tail_base. Other types are generated + * through different predefined macros by mh_sha256.c. + * mh_sha256_tail is used to calculate the last incomplete block of input + * data. mh_sha256_finalize is the mh_sha256_ctx wrapper of mh_sha256_tail. + */ +#ifndef MH_SHA256_FINALIZE_FUNCTION +#include <string.h> +#include "mh_sha256_internal.h" + +#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_base +#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_base +#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_base +#define MH_SHA256_FINALIZE_SLVER +#endif + +void MH_SHA256_TAIL_FUNCTION(uint8_t * partial_buffer, uint32_t total_len, + uint32_t(*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t * frame_buffer, uint32_t digests[SHA256_DIGEST_WORDS]) +{ + uint64_t partial_buffer_len, len_in_bit; + + partial_buffer_len = total_len % MH_SHA256_BLOCK_SIZE; + + // Padding the first block + partial_buffer[partial_buffer_len] = 0x80; + partial_buffer_len++; + memset(partial_buffer + partial_buffer_len, 0, + MH_SHA256_BLOCK_SIZE - partial_buffer_len); + + // Calculate the first block without total_length if padding needs 2 block + if (partial_buffer_len > (MH_SHA256_BLOCK_SIZE - 8)) { + MH_SHA256_BLOCK_FUNCTION(partial_buffer, mh_sha256_segs_digests, frame_buffer, + 1); + //Padding the second block + memset(partial_buffer, 0, MH_SHA256_BLOCK_SIZE); + } + //Padding the block + len_in_bit = to_be64((uint64_t) total_len * 8); + *(uint64_t *) (partial_buffer + MH_SHA256_BLOCK_SIZE - 8) = len_in_bit; + MH_SHA256_BLOCK_FUNCTION(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1); + + //Calculate multi-hash SHA256 digests (segment digests as input message) + sha256_for_mh_sha256((uint8_t *) mh_sha256_segs_digests, digests, + 4 * SHA256_DIGEST_WORDS * HASH_SEGS); + + return; +} + +int MH_SHA256_FINALIZE_FUNCTION(struct mh_sha256_ctx *ctx, void *mh_sha256_digest) +{ + uint8_t i; + uint8_t *partial_block_buffer; + uint64_t total_len; + uint32_t(*mh_sha256_segs_digests)[HASH_SEGS]; + uint8_t *aligned_frame_buffer; + + if (ctx == NULL) + return MH_SHA256_CTX_ERROR_NULL; + + total_len 
= ctx->total_length; + partial_block_buffer = ctx->partial_block_buffer; + + /* mh_sha256 tail */ + aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer); + mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests; + + MH_SHA256_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha256_segs_digests, + aligned_frame_buffer, ctx->mh_sha256_digest); + + /* Output the digests of mh_sha256 */ + if (mh_sha256_digest != NULL) { + for (i = 0; i < SHA256_DIGEST_WORDS; i++) + ((uint32_t *) mh_sha256_digest)[i] = ctx->mh_sha256_digest[i]; + } + + return MH_SHA256_CTX_ERROR_NONE; +} + +#ifdef MH_SHA256_FINALIZE_SLVER +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// Version info +struct slver mh_sha256_finalize_base_slver_000002bb; +struct slver mh_sha256_finalize_base_slver = { 0x02bb, 0x00, 0x00 }; +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h new file mode 100644 index 000000000..8051e3f36 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h @@ -0,0 +1,318 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
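The tail function above makes one key decision: after appending the 0x80 terminator, is there still room in the current block for the 8-byte bit-length field, or does padding spill into a second block? A minimal sketch of that decision, assuming the usual MH_SHA256_BLOCK_SIZE of 1024 bytes (64 bytes times 16 segments):

    #include <stdint.h>

    #define MH_SHA256_BLOCK_SIZE 1024   /* assumption: 64-byte SHA block x 16 segments */

    /* Sketch of the branch in mh_sha256_tail_base above: how many padded
     * blocks does the remainder of the input require? */
    static int pad_blocks(uint32_t total_len)
    {
            uint32_t rem = total_len % MH_SHA256_BLOCK_SIZE;

            /* rem data bytes + one 0x80 byte; the length field needs 8 bytes */
            return (rem + 1 > MH_SHA256_BLOCK_SIZE - 8) ? 2 : 1;
    }

For example, pad_blocks(1015) is 1 (1015 + 1 + 8 exactly fills a block), while pad_blocks(1016) is 2.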
+**********************************************************************/ + +#ifndef _MH_SHA256_INTERNAL_H_ +#define _MH_SHA256_INTERNAL_H_ + +/** + * @file mh_sha256_internal.h + * @brief mh_sha256 internal function prototypes and macros + * + * Interface for mh_sha256 internal functions + * + */ +#include <stdint.h> +#include "mh_sha256.h" +#include "endian_helper.h" + +#ifdef __cplusplus + extern "C" { +#endif + +#ifdef _MSC_VER +# define inline __inline +#endif + + // 64byte pointer align +#define ALIGN_64(pointer) ( ((uint64_t)(pointer) + 0x3F)&(~0x3F) ) + + /******************************************************************* + *mh_sha256 constants and macros + ******************************************************************/ + /* mh_sha256 constants */ +#define MH_SHA256_H0 0x6a09e667UL +#define MH_SHA256_H1 0xbb67ae85UL +#define MH_SHA256_H2 0x3c6ef372UL +#define MH_SHA256_H3 0xa54ff53aUL +#define MH_SHA256_H4 0x510e527fUL +#define MH_SHA256_H5 0x9b05688cUL +#define MH_SHA256_H6 0x1f83d9abUL +#define MH_SHA256_H7 0x5be0cd19UL + + /* mh_sha256 macros */ +#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r)))) + +#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3)) +#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10)) + +#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22)) +#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25)) +#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c)) +#define ch(e,f,g) ((e & f) ^ (g & ~e)) + + /******************************************************************* + * SHA256 API internal function prototypes + ******************************************************************/ + + /** + * @brief Performs complete SHA256 algorithm. + * + * @param input Pointer to buffer containing the input message. + * @param digest Pointer to digest to update. + * @param len Length of buffer. + * @returns None + */ + void sha256_for_mh_sha256(const uint8_t * input_data, uint32_t * digest, const uint32_t len); + + /** + * @brief Calculate sha256 digest of blocks which size is SHA256_BLOCK_SIZE + * + * @param data Pointer to data buffer containing the input message. + * @param digest Pointer to sha256 digest. + * @returns None + */ + void sha256_single_for_mh_sha256(const uint8_t * data, uint32_t digest[]); + + /******************************************************************* + * mh_sha256 API internal function prototypes + * Multiple versions of Update and Finalize functions are supplied which use + * multiple versions of block and tail process subfunctions. + ******************************************************************/ + + /** + * @brief Tail process for multi-hash sha256. + * + * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE. + * It will output the final SHA256 digest based on mh_sha256_segs_digests. + * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha256_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @returns none + * + */ + void mh_sha256_tail(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha256. 
+ * + * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE. + * It will output the final SHA256 digest based on mh_sha256_segs_digests. + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha256_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha256_digest mh_sha256 digest + * @returns none + * + */ + void mh_sha256_tail_base(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha256. + * + * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE. + * It will output the final SHA256 digest based on mh_sha256_segs_digests. + * + * @requires SSE + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha256_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha256_digest mh_sha256 digest + * @returns none + * + */ + void mh_sha256_tail_sse(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha256. + * + * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE. + * It will output the final SHA256 digest based on mh_sha256_segs_digests. + * + * @requires AVX + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha256_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha256_digest mh_sha256 digest + * @returns none + * + */ + void mh_sha256_tail_avx(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha256. + * + * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE. + * It will output the final SHA256 digest based on mh_sha256_segs_digests. + * + * @requires AVX2 + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha256_segs_digests The digests of all 16 segments . + * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha256_digest mh_sha256 digest + * @returns none + * + */ + void mh_sha256_tail_avx2(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]); + + /** + * @brief Tail process for multi-hash sha256. + * + * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE. + * It will output the final SHA256 digest based on mh_sha256_segs_digests. + * + * @requires AVX512 + * + * @param partial_buffer Pointer to the start addr of remainder + * @param total_len The total length of all sections of input data. + * @param mh_sha256_segs_digests The digests of all 16 segments . 
+ * @param frame_buffer Pointer to buffer which is a temp working area + * @param mh_sha256_digest mh_sha256 digest + * @returns none + * + */ + void mh_sha256_tail_avx512(uint8_t *partial_buffer, uint32_t total_len, + uint32_t (*mh_sha256_segs_digests)[HASH_SEGS], + uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]); + + /** + * @brief Calculate mh_sha256 digest of blocks which size is MH_SHA256_BLOCK_SIZE*N. + * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha256_block(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha256 digest of blocks which size is MH_SHA256_BLOCK_SIZE*N. + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha256_block_base(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha256 digest of blocks which size is MH_SHA256_BLOCK_SIZE*N. + * + * @requires SSE + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha256_block_sse(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha256 digest of blocks which size is MH_SHA256_BLOCK_SIZE*N. + * + * @requires AVX + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha256_block_avx(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha256 digest of blocks which size is MH_SHA256_BLOCK_SIZE*N. + * + * @requires AVX2 + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. + * @returns none + * + */ + void mh_sha256_block_avx2(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); + + /** + * @brief Calculate mh_sha256 digest of blocks which size is MH_SHA256_BLOCK_SIZE*N. + * + * @requires AVX512 + * + * @param input_data Pointer to input data to be processed + * @param digests 16 segments digests + * @param frame_buffer Pointer to buffer which is a temp working area + * @param num_blocks The number of blocks. 
+ * @returns none + * + */ + void mh_sha256_block_avx512(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], + uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm new file mode 100644 index 000000000..e14fc7eb1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm @@ -0,0 +1,77 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
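With the internal API above in view, here is a sketch of how the block layer is driven, modeled on mh_sha256_ref() later in this diff. It compiles only inside the isa-l_crypto tree (it uses the header above); the wrapper name is hypothetical.

    #include <stdint.h>
    #include "mh_sha256_internal.h"     /* the header above */

    /* Hypothetical wrapper: seed every segment with H0..H7, then run the
     * base block function over nblocks 1KB blocks. */
    static void run_blocks(const uint8_t *data, uint32_t nblocks,
                           uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
    {
            uint8_t frame[MH_SHA256_BLOCK_SIZE];    /* scratch for the message schedule */
            uint32_t i;

            for (i = 0; i < HASH_SEGS; i++) {       /* every segment starts from H0..H7 */
                    digests[0][i] = MH_SHA256_H0;
                    digests[1][i] = MH_SHA256_H1;
                    digests[2][i] = MH_SHA256_H2;
                    digests[3][i] = MH_SHA256_H3;
                    digests[4][i] = MH_SHA256_H4;
                    digests[5][i] = MH_SHA256_H5;
                    digests[6][i] = MH_SHA256_H6;
                    digests[7][i] = MH_SHA256_H7;
            }
            mh_sha256_block_base(data, digests, frame, nblocks);
    }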
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +%include "reg_sizes.asm" +%include "multibinary.asm" + +%ifidn __OUTPUT_FORMAT__, elf32 + [bits 32] +%else + default rel + [bits 64] + + extern mh_sha256_update_sse + extern mh_sha256_update_avx + extern mh_sha256_update_avx2 + extern mh_sha256_finalize_sse + extern mh_sha256_finalize_avx + extern mh_sha256_finalize_avx2 + + %ifdef HAVE_AS_KNOWS_AVX512 + extern mh_sha256_update_avx512 + extern mh_sha256_finalize_avx512 + %endif + +%endif + +extern mh_sha256_update_base +extern mh_sha256_finalize_base + +mbin_interface mh_sha256_update +mbin_interface mh_sha256_finalize + +%ifidn __OUTPUT_FORMAT__, elf64 + + %ifdef HAVE_AS_KNOWS_AVX512 + mbin_dispatch_init6 mh_sha256_update, mh_sha256_update_base, mh_sha256_update_sse, mh_sha256_update_avx, mh_sha256_update_avx2, mh_sha256_update_avx512 + mbin_dispatch_init6 mh_sha256_finalize, mh_sha256_finalize_base, mh_sha256_finalize_sse, mh_sha256_finalize_avx, mh_sha256_finalize_avx2, mh_sha256_finalize_avx512 + %else + mbin_dispatch_init5 mh_sha256_update, mh_sha256_update_base, mh_sha256_update_sse, mh_sha256_update_avx, mh_sha256_update_avx2 + mbin_dispatch_init5 mh_sha256_finalize, mh_sha256_finalize_base, mh_sha256_finalize_sse, mh_sha256_finalize_avx, mh_sha256_finalize_avx2 + %endif + +%else + mbin_dispatch_init2 mh_sha256_update, mh_sha256_update_base + mbin_dispatch_init2 mh_sha256_finalize, mh_sha256_finalize_base +%endif + +;;; func core, ver, snum +slversion mh_sha256_update, 00, 00, 02b2 +slversion mh_sha256_finalize, 00, 00, 02b3 diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c new file mode 100644 index 000000000..8095e4f05 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c @@ -0,0 +1,180 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
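The multibinary file above wires mh_sha256_update and mh_sha256_finalize to a dispatcher: the first call probes the CPU and patches the entry point to the best available implementation (base/SSE/AVX/AVX2, plus AVX512 when the assembler supports it). A C analogue of that resolve-on-first-call pattern, purely illustrative (the real mbin_* macros are assembly and the probe below is a stand-in):

    #include <stdint.h>
    #include <stdio.h>

    typedef int (*update_fn)(const void *buf, uint32_t len);

    static int update_base(const void *b, uint32_t l) { (void)b; printf("base %u\n", l); return 0; }
    static int update_avx2(const void *b, uint32_t l) { (void)b; printf("avx2 %u\n", l); return 0; }
    static int cpu_has_avx2(void) { return 0; }     /* stand-in for the CPUID probe */

    static int update_resolve(const void *b, uint32_t l);
    static update_fn update = update_resolve;       /* first call lands here */

    static int update_resolve(const void *b, uint32_t l)
    {
            update = cpu_has_avx2() ? update_avx2 : update_base;  /* pick once */
            return update(b, l);                                  /* forward to the real impl */
    }

    int main(void) { update(0, 16); update(0, 32); return 0; }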
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha256.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Loop many times over the same data
+#  define TEST_LEN	16*1024
+#  define TEST_LOOPS	20000
+#  define TEST_TYPE_STR	"_warm"
+#else
+// Uncached test. Pull from large mem base.
+#  define TEST_LEN	16*1024*1024
+#  define TEST_LOOPS	100
+#  define TEST_TYPE_STR	"_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#define TEST_MEM TEST_LEN
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA256_FUNC_TYPE
+#define MH_SHA256_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+	if((state) != MH_SHA256_CTX_ERROR_NONE){ \
+		printf("The mh_sha256 function failed.\n"); \
+		return 1; \
+	} \
+	}while(0)
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+	int i;
+	for (i = 0; i < len;) {
+		printf(" %2x", 0xff & buf[i++]);
+		if (i % 32 == 0)
+			printf("\n");
+	}
+	if (i % 32 != 0)
+		printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA256_DIGEST_WORDS],
+		    uint32_t hash_test[SHA256_DIGEST_WORDS])
+{
+	int i;
+	int mh_sha256_fail = 0;
+
+	for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+		if (hash_test[i] != hash_base[i])
+			mh_sha256_fail++;
+	}
+
+	if (mh_sha256_fail) {
+		printf("mh_sha256 fail test\n");
+		printf("base: ");
+		dump((char *)hash_base, 32);
+		printf("test: ");
+		dump((char *)hash_test, 32);
+	}
+
+	return mh_sha256_fail;
+}
+
+int main(int argc, char *argv[])
+{
+	int i, fail = 0;
+	uint32_t hash_test[SHA256_DIGEST_WORDS], hash_base[SHA256_DIGEST_WORDS];
+	uint8_t *buff = NULL;
+	struct mh_sha256_ctx *update_ctx_test = NULL, *update_ctx_base = NULL;
+	struct perf start, stop;
+
+	printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n");
+
+	buff = malloc(TEST_LEN);
+	update_ctx_test = malloc(sizeof(*update_ctx_test));
+	update_ctx_base = malloc(sizeof(*update_ctx_base));
+
+	if (buff == NULL || update_ctx_base == NULL || update_ctx_test == NULL) {
+		printf("malloc failed, test aborted\n");
+		return -1;
+	}
+	// Random test data
+	rand_buffer(buff, TEST_LEN);
+
+	// mh_sha256 base version
+	mh_sha256_init(update_ctx_base);
+	mh_sha256_update_base(update_ctx_base, buff, TEST_LEN);
+	mh_sha256_finalize_base(update_ctx_base, hash_base);
+
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS / 10; i++) {
+		mh_sha256_init(update_ctx_base);
+		mh_sha256_update_base(update_ctx_base, buff, TEST_LEN);
+		mh_sha256_finalize_base(update_ctx_base, hash_base);
+	}
+	perf_stop(&stop);
+	printf("mh_sha256_update_base" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_MEM * i);
+
+	// Update feature test
+	CHECK_RETURN(mh_sha256_init(update_ctx_test));
+	CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+	CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS; i++) {
+		CHECK_RETURN(mh_sha256_init(update_ctx_test));
+		CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+		CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+	}
+	perf_stop(&stop);
+	printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_MEM * i);
+
+	// Check results
+	fail = compare_digests(hash_base, hash_test);
+
+	if (fail) {
+		printf("Fail size=%d\n", TEST_LEN);
+		return -1;
+	}
+
+	printf("Pass func check\n");
+
+	return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c
new file mode 100644
index 000000000..2aaefecb0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c
@@ -0,0 +1,410 @@
+/**********************************************************************
+  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+	// The macros and sub-functions below already exist in the source
+	// file sha256_for_mh_sha256.c, which is part of the ISA-L library
+	// as internal functions. They are duplicated here because
+	// mh_sha256_ref() must work without linking against the ISA-L
+	// library, so this file keeps its own copies of the essential
+	// sub-functions in its own object file.
+//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +#define W(x) w[(x) & 15] + +#define step(i,a,b,c,d,e,f,g,h,k) \ + if (i<16) W(i) = to_be32(ww[i]); \ + else \ + W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \ + t2 = s0(a) + maj(a,b,c); \ + t1 = h + s1(e) + ch(e,f,g) + k + W(i); \ + d += t1; \ + h = t1 + t2; + +void sha256_single_for_mh_sha256_ref(const uint8_t * data, uint32_t digest[]) +{ + uint32_t a, b, c, d, e, f, g, h, t1, t2; + uint32_t w[16]; + uint32_t *ww = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + f = digest[5]; + g = digest[6]; + h = digest[7]; + + step(0, a, b, c, d, e, f, g, h, 0x428a2f98); + step(1, h, a, b, c, d, e, f, g, 0x71374491); + step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf); + step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5); + step(4, e, f, g, h, a, b, c, d, 0x3956c25b); + step(5, d, e, f, g, h, a, b, c, 0x59f111f1); + step(6, c, d, e, f, g, h, a, b, 0x923f82a4); + step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5); + step(8, a, b, c, d, e, f, g, h, 0xd807aa98); + step(9, h, a, b, c, d, e, f, g, 0x12835b01); + step(10, g, h, a, b, c, d, e, f, 0x243185be); + step(11, f, g, h, a, b, c, d, e, 0x550c7dc3); + step(12, e, f, g, h, a, b, c, d, 0x72be5d74); + step(13, d, e, f, g, h, a, b, c, 0x80deb1fe); + step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7); + step(15, b, c, d, e, f, g, h, a, 0xc19bf174); + step(16, a, b, c, d, e, f, g, h, 0xe49b69c1); + step(17, h, a, b, c, d, e, f, g, 0xefbe4786); + step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6); + step(19, f, g, h, a, b, c, d, e, 0x240ca1cc); + step(20, e, f, g, h, a, b, c, d, 0x2de92c6f); + step(21, d, e, f, g, h, a, b, c, 0x4a7484aa); + step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc); + step(23, b, c, d, e, f, g, h, a, 0x76f988da); + step(24, a, b, c, d, e, f, g, h, 0x983e5152); + step(25, h, a, b, c, d, e, f, g, 0xa831c66d); + step(26, g, h, a, b, c, d, e, f, 0xb00327c8); + step(27, f, g, h, a, b, c, d, e, 0xbf597fc7); + step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3); + step(29, d, e, f, g, h, a, b, c, 0xd5a79147); + step(30, c, d, e, f, g, h, a, b, 0x06ca6351); + step(31, b, c, d, e, f, g, h, a, 0x14292967); + step(32, a, b, c, d, e, f, g, h, 0x27b70a85); + step(33, h, a, b, c, d, e, f, g, 0x2e1b2138); + step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc); + step(35, f, g, h, a, b, c, d, e, 0x53380d13); + step(36, e, f, g, h, a, b, c, d, 0x650a7354); + step(37, d, e, f, g, h, a, b, c, 0x766a0abb); + step(38, c, d, e, f, g, h, a, b, 0x81c2c92e); + step(39, b, c, d, e, f, g, h, a, 0x92722c85); + step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1); + step(41, h, a, b, c, d, e, f, g, 0xa81a664b); + step(42, g, h, a, b, c, d, e, f, 0xc24b8b70); + step(43, f, g, h, a, b, c, d, e, 0xc76c51a3); + step(44, e, f, g, h, a, b, c, d, 0xd192e819); + step(45, d, e, f, g, h, a, b, c, 0xd6990624); + step(46, c, d, e, f, g, h, a, b, 0xf40e3585); + step(47, b, c, d, e, f, g, h, a, 0x106aa070); + step(48, a, b, c, d, e, f, g, h, 0x19a4c116); + step(49, h, a, b, c, d, e, f, g, 0x1e376c08); + step(50, g, h, a, b, c, d, e, f, 0x2748774c); + step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5); + step(52, e, f, g, h, a, b, c, d, 0x391c0cb3); + step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a); + step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f); + step(55, b, c, d, e, f, g, h, a, 0x682e6ff3); + step(56, a, b, c, d, e, f, g, h, 0x748f82ee); + step(57, h, a, b, c, d, e, f, g, 0x78a5636f); + step(58, g, h, a, b, c, d, e, f, 
0x84c87814); + step(59, f, g, h, a, b, c, d, e, 0x8cc70208); + step(60, e, f, g, h, a, b, c, d, 0x90befffa); + step(61, d, e, f, g, h, a, b, c, 0xa4506ceb); + step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7); + step(63, b, c, d, e, f, g, h, a, 0xc67178f2); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + digest[7] += h; +} + +void sha256_for_mh_sha256_ref(const uint8_t * input_data, uint32_t * digest, + const uint32_t len) +{ + uint32_t i, j; + uint8_t buf[2 * SHA256_BLOCK_SIZE]; + + digest[0] = MH_SHA256_H0; + digest[1] = MH_SHA256_H1; + digest[2] = MH_SHA256_H2; + digest[3] = MH_SHA256_H3; + digest[4] = MH_SHA256_H4; + digest[5] = MH_SHA256_H5; + digest[6] = MH_SHA256_H6; + digest[7] = MH_SHA256_H7; + + i = len; + while (i >= SHA256_BLOCK_SIZE) { + sha256_single_for_mh_sha256_ref(input_data, digest); + input_data += SHA256_BLOCK_SIZE; + i -= SHA256_BLOCK_SIZE; + } + + memcpy(buf, input_data, i); + buf[i++] = 0x80; + for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - 8); j++) + buf[j] = 0; + + if (i > SHA256_BLOCK_SIZE - 8) + i = 2 * SHA256_BLOCK_SIZE; + else + i = SHA256_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8); + + sha256_single_for_mh_sha256_ref(buf, digest); + if (i == (2 * SHA256_BLOCK_SIZE)) + sha256_single_for_mh_sha256_ref(buf + SHA256_BLOCK_SIZE, digest); +} + +/* + * buffer to rearrange one segment data from one block. + * + * Layout of new_data: + * segment + * ------------------------- + * w0 | w1 | ... | w15 + * + */ +static inline void transform_input_single(uint32_t * new_data, uint32_t * input, + uint32_t segment) +{ + new_data[16 * segment + 0] = input[16 * 0 + segment]; + new_data[16 * segment + 1] = input[16 * 1 + segment]; + new_data[16 * segment + 2] = input[16 * 2 + segment]; + new_data[16 * segment + 3] = input[16 * 3 + segment]; + new_data[16 * segment + 4] = input[16 * 4 + segment]; + new_data[16 * segment + 5] = input[16 * 5 + segment]; + new_data[16 * segment + 6] = input[16 * 6 + segment]; + new_data[16 * segment + 7] = input[16 * 7 + segment]; + new_data[16 * segment + 8] = input[16 * 8 + segment]; + new_data[16 * segment + 9] = input[16 * 9 + segment]; + new_data[16 * segment + 10] = input[16 * 10 + segment]; + new_data[16 * segment + 11] = input[16 * 11 + segment]; + new_data[16 * segment + 12] = input[16 * 12 + segment]; + new_data[16 * segment + 13] = input[16 * 13 + segment]; + new_data[16 * segment + 14] = input[16 * 14 + segment]; + new_data[16 * segment + 15] = input[16 * 15 + segment]; +} + +// Adapt parameters to sha256_single_for_mh_sha256_ref +#define sha256_update_one_seg(data, digest) \ + sha256_single_for_mh_sha256_ref((const uint8_t *)(data), (uint32_t *)(digest)) + +/* + * buffer to Rearrange all segments data from one block. + * + * Layout of new_data: + * segment + * ------------------------- + * seg0: | w0 | w1 | ... | w15 + * seg1: | w0 | w1 | ... | w15 + * seg2: | w0 | w1 | ... | w15 + * .... + * seg15: | w0 | w1 | ... 
| w15
+ *
+ */
+static inline void transform_input(uint32_t * new_data, uint32_t * input, uint32_t block)
+{
+	uint32_t *current_input = input + block * MH_SHA256_BLOCK_SIZE / 4;
+
+	transform_input_single(new_data, current_input, 0);
+	transform_input_single(new_data, current_input, 1);
+	transform_input_single(new_data, current_input, 2);
+	transform_input_single(new_data, current_input, 3);
+	transform_input_single(new_data, current_input, 4);
+	transform_input_single(new_data, current_input, 5);
+	transform_input_single(new_data, current_input, 6);
+	transform_input_single(new_data, current_input, 7);
+	transform_input_single(new_data, current_input, 8);
+	transform_input_single(new_data, current_input, 9);
+	transform_input_single(new_data, current_input, 10);
+	transform_input_single(new_data, current_input, 11);
+	transform_input_single(new_data, current_input, 12);
+	transform_input_single(new_data, current_input, 13);
+	transform_input_single(new_data, current_input, 14);
+	transform_input_single(new_data, current_input, 15);
+
+}
+
+/*
+ * Calculate all segments' digests from one transformed block.
+ *
+ * Layout of seg_digest:
+ *  segment
+ *  -------------------------
+ *   seg0:  | H0 | H1 | ... | H7
+ *   seg1:  | H0 | H1 | ... | H7
+ *   seg2:  | H0 | H1 | ... | H7
+ *   ....
+ *   seg15: | H0 | H1 | ... | H7
+ *
+ */
+static inline void sha256_update_all_segs(uint32_t * new_data, uint32_t(*mh_sha256_seg_digests)
+					  [SHA256_DIGEST_WORDS])
+{
+	sha256_update_one_seg(&(new_data)[16 * 0], mh_sha256_seg_digests[0]);
+	sha256_update_one_seg(&(new_data)[16 * 1], mh_sha256_seg_digests[1]);
+	sha256_update_one_seg(&(new_data)[16 * 2], mh_sha256_seg_digests[2]);
+	sha256_update_one_seg(&(new_data)[16 * 3], mh_sha256_seg_digests[3]);
+	sha256_update_one_seg(&(new_data)[16 * 4], mh_sha256_seg_digests[4]);
+	sha256_update_one_seg(&(new_data)[16 * 5], mh_sha256_seg_digests[5]);
+	sha256_update_one_seg(&(new_data)[16 * 6], mh_sha256_seg_digests[6]);
+	sha256_update_one_seg(&(new_data)[16 * 7], mh_sha256_seg_digests[7]);
+	sha256_update_one_seg(&(new_data)[16 * 8], mh_sha256_seg_digests[8]);
+	sha256_update_one_seg(&(new_data)[16 * 9], mh_sha256_seg_digests[9]);
+	sha256_update_one_seg(&(new_data)[16 * 10], mh_sha256_seg_digests[10]);
+	sha256_update_one_seg(&(new_data)[16 * 11], mh_sha256_seg_digests[11]);
+	sha256_update_one_seg(&(new_data)[16 * 12], mh_sha256_seg_digests[12]);
+	sha256_update_one_seg(&(new_data)[16 * 13], mh_sha256_seg_digests[13]);
+	sha256_update_one_seg(&(new_data)[16 * 14], mh_sha256_seg_digests[14]);
+	sha256_update_one_seg(&(new_data)[16 * 15], mh_sha256_seg_digests[15]);
+}
+
+void mh_sha256_block_ref(const uint8_t * input_data, uint32_t(*digests)[HASH_SEGS],
+			 uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks)
+{
+	uint32_t i, j;
+	uint32_t *temp_buffer = (uint32_t *) frame_buffer;
+	uint32_t(*trans_digests)[SHA256_DIGEST_WORDS];
+
+	trans_digests = (uint32_t(*)[SHA256_DIGEST_WORDS]) digests;
+
+	// Re-structure seg_digests from 8*16 to 16*8
+	for (j = 0; j < HASH_SEGS; j++) {
+		for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+			temp_buffer[j * SHA256_DIGEST_WORDS + i] = digests[i][j];
+		}
+	}
+	memcpy(trans_digests, temp_buffer, 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+	// Calculate digests for all segments, leveraging the sha256 API
+	for (i = 0; i < num_blocks; i++) {
+		transform_input(temp_buffer, (uint32_t *) input_data, i);
+		sha256_update_all_segs(temp_buffer, trans_digests);
+	}
+
+	// Re-structure seg_digests from 16*8 back to 8*16
+	for (j = 0; j < HASH_SEGS; j++) {
+void mh_sha256_block_ref(const uint8_t * input_data, uint32_t(*digests)[HASH_SEGS],
+			 uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks)
+{
+	uint32_t i, j;
+	uint32_t *temp_buffer = (uint32_t *) frame_buffer;
+	uint32_t(*trans_digests)[SHA256_DIGEST_WORDS];
+
+	trans_digests = (uint32_t(*)[SHA256_DIGEST_WORDS]) digests;
+
+	// Re-structure seg_digests from 8*16 to 16*8
+	for (j = 0; j < HASH_SEGS; j++) {
+		for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+			temp_buffer[j * SHA256_DIGEST_WORDS + i] = digests[i][j];
+		}
+	}
+	memcpy(trans_digests, temp_buffer, 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+	// Calculate digests for all segments, leveraging the sha256 API
+	for (i = 0; i < num_blocks; i++) {
+		transform_input(temp_buffer, (uint32_t *) input_data, i);
+		sha256_update_all_segs(temp_buffer, trans_digests);
+	}
+
+	// Re-structure seg_digests from 16*8 back to 8*16
+	for (j = 0; j < HASH_SEGS; j++) {
+		for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+			temp_buffer[i * HASH_SEGS + j] = trans_digests[j][i];
+		}
+	}
+	memcpy(digests, temp_buffer, 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+	return;
+}
+
+void mh_sha256_tail_ref(uint8_t * partial_buffer, uint32_t total_len,
+			uint32_t(*mh_sha256_segs_digests)[HASH_SEGS], uint8_t * frame_buffer,
+			uint32_t digests[SHA256_DIGEST_WORDS])
+{
+	uint64_t partial_buffer_len, len_in_bit;
+
+	partial_buffer_len = total_len % MH_SHA256_BLOCK_SIZE;
+
+	// Pad the first block
+	partial_buffer[partial_buffer_len] = 0x80;
+	partial_buffer_len++;
+	memset(partial_buffer + partial_buffer_len, 0,
+	       MH_SHA256_BLOCK_SIZE - partial_buffer_len);
+
+	// Process the first block without total_length if the padding needs 2 blocks
+	if (partial_buffer_len > (MH_SHA256_BLOCK_SIZE - 8)) {
+		mh_sha256_block_ref(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1);
+		// Pad the second block
+		memset(partial_buffer, 0, MH_SHA256_BLOCK_SIZE);
+	}
+	// Append the message length in bits and process the final block
+	len_in_bit = to_be64((uint64_t) total_len * 8);
+	*(uint64_t *) (partial_buffer + MH_SHA256_BLOCK_SIZE - 8) = len_in_bit;
+	mh_sha256_block_ref(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1);
+
+	// Calculate multi-hash SHA256 digests (segment digests as input message)
+	sha256_for_mh_sha256_ref((uint8_t *) mh_sha256_segs_digests, digests,
+				 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+	return;
+}
+
+void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest)
+{
+	uint64_t total_len;
+	uint64_t num_blocks;
+	uint32_t mh_sha256_segs_digests[SHA256_DIGEST_WORDS][HASH_SEGS];
+	uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE];
+	uint8_t partial_block_buffer[MH_SHA256_BLOCK_SIZE * 2];
+	uint32_t mh_sha256_hash_dword[SHA256_DIGEST_WORDS];
+	uint32_t i;
+	const uint8_t *input_data = (const uint8_t *)buffer;
+
+	/* Initialize digests of all segments */
+	for (i = 0; i < HASH_SEGS; i++) {
+		mh_sha256_segs_digests[0][i] = MH_SHA256_H0;
+		mh_sha256_segs_digests[1][i] = MH_SHA256_H1;
+		mh_sha256_segs_digests[2][i] = MH_SHA256_H2;
+		mh_sha256_segs_digests[3][i] = MH_SHA256_H3;
+		mh_sha256_segs_digests[4][i] = MH_SHA256_H4;
+		mh_sha256_segs_digests[5][i] = MH_SHA256_H5;
+		mh_sha256_segs_digests[6][i] = MH_SHA256_H6;
+		mh_sha256_segs_digests[7][i] = MH_SHA256_H7;
+	}
+
+	total_len = len;
+
+	// Calculate the number of whole blocks
+	num_blocks = len / MH_SHA256_BLOCK_SIZE;
+	if (num_blocks > 0) {
+		// Process num_blocks blocks
+		mh_sha256_block_ref(input_data, mh_sha256_segs_digests, frame_buffer,
+				    num_blocks);
+		len -= num_blocks * MH_SHA256_BLOCK_SIZE;
+		input_data += num_blocks * MH_SHA256_BLOCK_SIZE;
+	}
+	// Store the partial block
+	if (len != 0) {
+		memcpy(partial_block_buffer, input_data, len);
+	}
+
+	/* Finalize */
+	mh_sha256_tail_ref(partial_block_buffer, total_len, mh_sha256_segs_digests,
+			   frame_buffer, mh_sha256_hash_dword);
+
+	// Output the digests of mh_sha256
+	if (mh_sha256_digest != NULL) {
+		mh_sha256_digest[0] = mh_sha256_hash_dword[0];
+		mh_sha256_digest[1] = mh_sha256_hash_dword[1];
+		mh_sha256_digest[2] = mh_sha256_hash_dword[2];
+		mh_sha256_digest[3] = mh_sha256_hash_dword[3];
+		mh_sha256_digest[4] = mh_sha256_hash_dword[4];
+		mh_sha256_digest[5] = mh_sha256_hash_dword[5];
+		mh_sha256_digest[6] = mh_sha256_hash_dword[6];
+		mh_sha256_digest[7] = mh_sha256_hash_dword[7];
+	}
+
+	return;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c
new file mode 100644
index 000000000..13ab91c16
--- /dev/null
+++
b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c @@ -0,0 +1,217 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include "mh_sha256.h" + +#define TEST_LEN 16*1024 +#define TEST_SIZE 8*1024 +#define TEST_MEM TEST_LEN +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define str(s) #s +#define xstr(s) str(s) + +#define _FUNC_TOKEN(func, type) func##type +#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type) + +#ifndef MH_SHA256_FUNC_TYPE +#define MH_SHA256_FUNC_TYPE +#endif + +#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE) +#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE) + +#define CHECK_RETURN(state) do{ \ + if((state) != MH_SHA256_CTX_ERROR_NONE){ \ + printf("The mh_sha256 function is failed.\n"); \ + return 1; \ + } \ + }while(0) + +extern void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest); +#define MH_SHA256_REF mh_sha256_ref + +// Generates pseudo-random data +void rand_buffer(uint8_t * buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +void dump(char *buf, int len) +{ + int i; + for (i = 0; i < len;) { + printf(" %2x", 0xff & buf[i++]); + if (i % 32 == 0) + printf("\n"); + } + if (i % 32 != 0) + printf("\n"); +} + +int compare_digests(uint32_t hash_ref[SHA256_DIGEST_WORDS], + uint32_t hash_test[SHA256_DIGEST_WORDS]) +{ + int i; + int mh_sha256_fail = 0; + + for (i = 0; i < SHA256_DIGEST_WORDS; i++) { + if (hash_test[i] != hash_ref[i]) + mh_sha256_fail++; + } + + if (mh_sha256_fail) { + printf("mh_sha256 fail test\n"); + printf("ref: "); + dump((char *)hash_ref, 32); + printf("test: "); + dump((char *)hash_test, 32); + } + + return mh_sha256_fail; +} + +int main(int argc, char *argv[]) +{ + int fail = 0; + uint32_t hash_test[SHA256_DIGEST_WORDS], hash_ref[SHA256_DIGEST_WORDS]; + uint8_t 
*buff = NULL; + int size, offset; + struct mh_sha256_ctx *update_ctx = NULL; + + printf(xstr(TEST_UPDATE_FUNCTION) "_test:\n"); + + srand(TEST_SEED); + + buff = malloc(TEST_LEN); + update_ctx = malloc(sizeof(*update_ctx)); + + if (buff == NULL || update_ctx == NULL) { + printf("malloc failed test aborted\n"); + return -1; + } + // Rand test1 + rand_buffer(buff, TEST_LEN); + + MH_SHA256_REF(buff, TEST_LEN, hash_ref); + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("fail rand1 test\n"); + return -1; + } else + putchar('.'); + + // Test various size messages + for (size = TEST_LEN; size >= 0; size--) { + + // Fill with rand data + rand_buffer(buff, size); + + MH_SHA256_REF(buff, size, hash_ref); + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size=%d\n", size); + return -1; + } + + if ((size & 0xff) == 0) { + putchar('.'); + fflush(0); + } + } + + // Test various buffer offsets and sizes + printf("offset tests"); + for (size = TEST_LEN - 256; size > 256; size -= 11) { + for (offset = 0; offset < 256; offset++) { + MH_SHA256_REF(buff + offset, size, hash_ref); + + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size=%d\n", size); + return -1; + } + + } + if ((size & 0xf) == 0) { + putchar('.'); + fflush(0); + } + } + + // Run efence tests + printf("efence tests"); + for (size = TEST_SIZE; size > 0; size--) { + offset = TEST_LEN - size; + + MH_SHA256_REF(buff + offset, size, hash_ref); + + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size=%d\n", size); + return -1; + } + + if ((size & 0xf) == 0) { + putchar('.'); + fflush(0); + } + } + + printf(xstr(TEST_UPDATE_FUNCTION) "_test:"); + printf(" %s\n", fail == 0 ? "Pass" : "Fail"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c new file mode 100644 index 000000000..024ae2b91 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c @@ -0,0 +1,110 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha256_update_base.c contains the prototype of mh_sha256_update_XXX.
+ * The default definitions are for the base type and generate
+ * mh_sha256_update_base. Other types are generated by mh_sha256.c through
+ * different predefined macros.
+ */
+#ifndef MH_SHA256_UPDATE_FUNCTION
+#include "mh_sha256_internal.h"
+#include <string.h>
+
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_base
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_base
+#define MH_SHA256_UPDATE_SLVER
+#endif
+
+int MH_SHA256_UPDATE_FUNCTION(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len)
+{
+
+	uint8_t *partial_block_buffer;
+	uint64_t partial_block_len;
+	uint64_t num_blocks;
+	uint32_t(*mh_sha256_segs_digests)[HASH_SEGS];
+	uint8_t *aligned_frame_buffer;
+	const uint8_t *input_data = (const uint8_t *)buffer;
+
+	if (ctx == NULL)
+		return MH_SHA256_CTX_ERROR_NULL;
+
+	if (len == 0)
+		return MH_SHA256_CTX_ERROR_NONE;
+
+	partial_block_len = ctx->total_length % MH_SHA256_BLOCK_SIZE;
+	partial_block_buffer = ctx->partial_block_buffer;
+	aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+	mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests;
+
+	ctx->total_length += len;
+	// Not enough input data for a full block; just buffer it
+	if (len + partial_block_len < MH_SHA256_BLOCK_SIZE) {
+		memcpy(partial_block_buffer + partial_block_len, input_data, len);
+		return MH_SHA256_CTX_ERROR_NONE;
+	}
+	// mh_sha256 calculation for the previous partial block
+	if (partial_block_len != 0) {
+		memcpy(partial_block_buffer + partial_block_len, input_data,
+		       MH_SHA256_BLOCK_SIZE - partial_block_len);
+		// Process the completed block
+		MH_SHA256_BLOCK_FUNCTION(partial_block_buffer, mh_sha256_segs_digests,
+					 aligned_frame_buffer, 1);
+		input_data += MH_SHA256_BLOCK_SIZE - partial_block_len;
+		len -= MH_SHA256_BLOCK_SIZE - partial_block_len;
+		memset(partial_block_buffer, 0, MH_SHA256_BLOCK_SIZE);
+	}
+	// Calculate mh_sha256 for the current whole blocks
+	num_blocks = len / MH_SHA256_BLOCK_SIZE;
+	if (num_blocks > 0) {
+		// Process num_blocks blocks
+		MH_SHA256_BLOCK_FUNCTION(input_data, mh_sha256_segs_digests,
+					 aligned_frame_buffer, num_blocks);
+		len -= num_blocks * MH_SHA256_BLOCK_SIZE;
+		input_data += num_blocks * MH_SHA256_BLOCK_SIZE;
+	}
+	// Store the partial block
+	if (len != 0) {
+		memcpy(partial_block_buffer, input_data, len);
+	}
+
+	return MH_SHA256_CTX_ERROR_NONE;
+
+}
+
+#ifdef MH_SHA256_UPDATE_SLVER
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
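For context, the update routine above does the standard streaming-hash bookkeeping: first complete any partial block carried in ctx->partial_block_buffer, then run all whole blocks through the block function in one call, and finally stash the new tail for the next update. A minimal sketch of driving this API through the public entry points declared in include/mh_sha256.h (illustrative only, not part of the upstream file):

#include "mh_sha256.h"

/* Sketch: hash a message fed in two arbitrary chunks; any partial
 * block is carried inside the ctx between the two update calls. */
static int hash_in_two_chunks(const uint8_t * msg, uint32_t len,
			      uint32_t digest[SHA256_DIGEST_WORDS])
{
	struct mh_sha256_ctx ctx;
	uint32_t half = len / 2;

	if (mh_sha256_init(&ctx) != MH_SHA256_CTX_ERROR_NONE)
		return -1;
	if (mh_sha256_update(&ctx, msg, half) != MH_SHA256_CTX_ERROR_NONE)
		return -1;
	if (mh_sha256_update(&ctx, msg + half, len - half) != MH_SHA256_CTX_ERROR_NONE)
		return -1;
	return mh_sha256_finalize(&ctx, digest);
}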
+ +// Version info +struct slver mh_sha256_update_base_slver_000002ba; +struct slver mh_sha256_update_base_slver = { 0x02ba, 0x00, 0x00 }; +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c new file mode 100644 index 000000000..f5b28bba7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c @@ -0,0 +1,240 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha256.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA256_FUNC_TYPE
+#define MH_SHA256_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+	if((state) != MH_SHA256_CTX_ERROR_NONE){ \
+		printf("The mh_sha256 function failed.\n"); \
+		return 1; \
+	} \
+	}while(0)
+
+extern void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest);
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+	int i;
+	for (i = 0; i < len;) {
+		printf(" %2x", 0xff & buf[i++]);
+		if (i % 32 == 0)
+			printf("\n");
+	}
+	if (i % 32 != 0)
+		printf("\n");
+}
+
+int compare_digests(uint32_t hash_ref[SHA256_DIGEST_WORDS],
+		    uint32_t hash_test[SHA256_DIGEST_WORDS])
+{
+	int i;
+	int mh_sha256_fail = 0;
+
+	for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+		if (hash_test[i] != hash_ref[i])
+			mh_sha256_fail++;
+	}
+
+	if (mh_sha256_fail) {
+		printf("mh_sha256 fail test\n");
+		printf("ref: ");
+		dump((char *)hash_ref, 32);	// SHA256 digests are 32 bytes
+		printf("test: ");
+		dump((char *)hash_test, 32);
+	}
+
+	return mh_sha256_fail;
+}
+
+int main(int argc, char *argv[])
+{
+	int fail = 0, i;
+	uint32_t hash_test[SHA256_DIGEST_WORDS], hash_ref[SHA256_DIGEST_WORDS];
+	uint8_t *buff = NULL;
+	int update_count;
+	int size1, size2, offset, addr_offset;
+	struct mh_sha256_ctx *update_ctx = NULL;
+	uint8_t *mem_addr = NULL;
+
+	printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+	srand(TEST_SEED);
+
+	buff = malloc(TEST_LEN);
+	update_ctx = malloc(sizeof(*update_ctx));
+
+	if (buff == NULL || update_ctx == NULL) {
+		printf("malloc failed, test aborted\n");
+		return -1;
+	}
+	// Rand test1
+	rand_buffer(buff, TEST_LEN);
+
+	mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+	CHECK_RETURN(mh_sha256_init(update_ctx));
+	CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+	CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+	fail = compare_digests(hash_ref, hash_test);
+
+	if (fail) {
+		printf("fail rand1 test\n");
+		return -1;
+	} else
+		putchar('.');
+
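Worth noting before the next loop (an editorial gloss, not part of the upstream file): it verifies the partial-block carry logic exhaustively by splitting the same TEST_LEN message at every boundary size1 and hashing it as two updates of size1 and TEST_LEN - size1 bytes; each split must reproduce the one-shot reference digest. For example, size1 = 100 leaves a 36-byte remainder (100 mod 64) buffered in the ctx for the second call to complete.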
+ printf("\n various size messages by update twice tests"); + for (size1 = TEST_LEN; size1 >= 0; size1--) { + + // Fill with rand data + rand_buffer(buff, TEST_LEN); + + mh_sha256_ref(buff, TEST_LEN, hash_ref); + + // subsequent update + size2 = TEST_LEN - size1; // size2 is different with the former + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size1=%d\n", size1); + return -1; + } + + if ((size2 & 0xff) == 0) { + putchar('.'); + fflush(0); + } + } + + // Test various update count + printf("\n various update count tests"); + for (update_count = 1; update_count <= TEST_LEN; update_count++) { + + // Fill with rand data + rand_buffer(buff, TEST_LEN); + + mh_sha256_ref(buff, TEST_LEN, hash_ref); + + // subsequent update + size1 = TEST_LEN / update_count; + size2 = TEST_LEN - size1 * (update_count - 1); // size2 is different with the former + + CHECK_RETURN(mh_sha256_init(update_ctx)); + for (i = 1, offset = 0; i < update_count; i++) { + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1)); + offset += size1; + } + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail size1=%d\n", size1); + return -1; + } + + if ((size2 & 0xff) == 0) { + putchar('.'); + fflush(0); + } + } + + // test various start address of ctx. + printf("\n various start address of ctx test"); + free(update_ctx); + mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10); + for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) { + + // Fill with rand data + rand_buffer(buff, TEST_LEN); + + mh_sha256_ref(buff, TEST_LEN, hash_ref); + + // a unaligned offset + update_ctx = (struct mh_sha256_ctx *)(mem_addr + addr_offset); + CHECK_RETURN(mh_sha256_init(update_ctx)); + CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN)); + CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test)); + + fail = compare_digests(hash_ref, hash_test); + + if (fail) { + printf("Fail addr_offset=%d\n", addr_offset); + return -1; + } + + if ((addr_offset & 0xf) == 0) { + putchar('.'); + fflush(0); + } + } + + printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail"); + + return fail; + +} diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c new file mode 100644 index 000000000..ea8c9f436 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c @@ -0,0 +1,176 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha256_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA256 Functions for mh_sha256
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define W(x) w[(x) & 15]
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+	if (i<16) W(i) = to_be32(ww[i]); \
+	else \
+		W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+	t2 = s0(a) + maj(a,b,c); \
+	t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+	d += t1; \
+	h = t1 + t2;
+
+void sha256_single_for_mh_sha256(const uint8_t * data, uint32_t digest[])
+{
+	uint32_t a, b, c, d, e, f, g, h, t1, t2;
+	uint32_t w[16];
+	uint32_t *ww = (uint32_t *) data;
+
+	a = digest[0];
+	b = digest[1];
+	c = digest[2];
+	d = digest[3];
+	e = digest[4];
+	f = digest[5];
+	g = digest[6];
+	h = digest[7];
+
+	step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+	step(1, h, a, b, c, d, e, f, g, 0x71374491);
+	step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+	step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+	step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+	step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+	step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+	step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+	step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+	step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+	step(10, g, h, a, b, c, d, e, f, 0x243185be);
+	step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+	step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+	step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+	step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+	step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+	step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+	step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+	step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+	step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+	step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+	step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+	step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+	step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+	step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+	step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+	step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+	step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+	step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+	step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+	step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+	step(31, b, c, d, e, f, g, h, a, 0x14292967);
+	step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+	step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+	step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+	step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+	step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+	step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+	step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+	step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+	step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+	step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+	step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+	step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+	step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+	step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+	step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+	step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+	step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+	step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+	step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+	step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+	step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+	step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+	step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+	step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+	step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+	step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+	step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+	step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+	step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+	step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+	step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+	step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+	digest[0] += a;
+	digest[1] += b;
+	digest[2] += c;
+	digest[3] += d;
+	digest[4] += e;
+	digest[5] += f;
+	digest[6] += g;
+	digest[7] += h;
+}
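The step() macro above keeps only a sixteen-entry message-schedule window: W(x) maps to w[x & 15], which suffices because round i reads just W(i-2), W(i-7), W(i-15) and W(i-16); the shifted argument lists rotate the eight working variables without any copies. The same rounds can be written as a loop with an explicit rotation, as an illustrative sketch only (not part of the upstream file; K[] stands for the 64 round constants that the unrolled calls inline, and W/S0/S1/s0/s1/ch/maj are the macros from mh_sha256_internal.h and this file):

/* Sketch: loop form of the 64 unrolled step() invocations above. */
static void sha256_rounds_loop(const uint32_t K[64], const uint32_t * ww, uint32_t digest[8])
{
	uint32_t a, b, c, d, e, f, g, h, t1, t2, w[16];
	int i;

	a = digest[0]; b = digest[1]; c = digest[2]; d = digest[3];
	e = digest[4]; f = digest[5]; g = digest[6]; h = digest[7];

	for (i = 0; i < 64; i++) {
		if (i < 16)
			W(i) = to_be32(ww[i]);
		else
			W(i) = W(i - 16) + S0(W(i - 15)) + W(i - 7) + S1(W(i - 2));
		t2 = s0(a) + maj(a, b, c);
		t1 = h + s1(e) + ch(e, f, g) + K[i] + W(i);
		/* rotate h<-g<-f<-e<-d<-c<-b<-a instead of renaming arguments */
		h = g; g = f; f = e; e = d + t1;
		d = c; c = b; b = a; a = t1 + t2;
	}

	digest[0] += a; digest[1] += b; digest[2] += c; digest[3] += d;
	digest[4] += e; digest[5] += f; digest[6] += g; digest[7] += h;
}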
+void sha256_for_mh_sha256(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+	uint32_t i, j;
+	uint8_t buf[2 * SHA256_BLOCK_SIZE];
+
+	digest[0] = MH_SHA256_H0;
+	digest[1] = MH_SHA256_H1;
+	digest[2] = MH_SHA256_H2;
+	digest[3] = MH_SHA256_H3;
+	digest[4] = MH_SHA256_H4;
+	digest[5] = MH_SHA256_H5;
+	digest[6] = MH_SHA256_H6;
+	digest[7] = MH_SHA256_H7;
+
+	i = len;
+	while (i >= SHA256_BLOCK_SIZE) {
+		sha256_single_for_mh_sha256(input_data, digest);
+		input_data += SHA256_BLOCK_SIZE;
+		i -= SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(buf, input_data, i);
+	buf[i++] = 0x80;
+	for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - 8); j++)
+		buf[j] = 0;
+
+	if (i > SHA256_BLOCK_SIZE - 8)
+		i = 2 * SHA256_BLOCK_SIZE;
+	else
+		i = SHA256_BLOCK_SIZE;
+
+	*(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+	sha256_single_for_mh_sha256(buf, digest);
+	if (i == (2 * SHA256_BLOCK_SIZE))
+		sha256_single_for_mh_sha256(buf + SHA256_BLOCK_SIZE, digest);
+}
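The tail handling in sha256_for_mh_sha256() is the FIPS 180-4 padding rule: append a 0x80 byte, zero-fill, and store the message length in bits as a big-endian 64-bit value in the last eight bytes, spilling into a second block when the remainder leaves no room for the marker plus the length field. The block-count rule it encodes, as a small illustrative helper (not part of the upstream file):

/* Sketch: how many blocks the final padding pass above processes. */
static uint32_t sha256_final_blocks(uint32_t len)
{
	uint32_t rem = len % SHA256_BLOCK_SIZE;	/* leftover bytes, 0..63 */

	/* rem bytes + 1 byte of 0x80 + 8 bytes of bit length must fit */
	return (rem + 1 + 8 <= SHA256_BLOCK_SIZE) ? 1 : 2;
}

/* e.g. len = 55 gives 1 final block, len = 56 gives 2, and len = 64
 * gives 1 (the main loop consumed the whole block, so rem = 0). */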