diff options
Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/sha256_mb')
49 files changed, 12711 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am new file mode 100644 index 000000000..9405c2469 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am @@ -0,0 +1,127 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+######################################################################## + +lsrc_x86_64 += sha256_mb/sha256_ctx_sse.c \ + sha256_mb/sha256_ctx_avx.c \ + sha256_mb/sha256_ctx_avx2.c \ + sha256_mb/sha256_ctx_base.c + +lsrc_x86_64 += sha256_mb/sha256_mb_mgr_init_sse.c \ + sha256_mb/sha256_mb_mgr_init_avx2.c + + +lsrc_x86_64 += sha256_mb/sha256_mb_mgr_submit_sse.asm \ + sha256_mb/sha256_mb_mgr_submit_avx.asm \ + sha256_mb/sha256_mb_mgr_submit_avx2.asm \ + sha256_mb/sha256_mb_mgr_flush_sse.asm \ + sha256_mb/sha256_mb_mgr_flush_avx.asm \ + sha256_mb/sha256_mb_mgr_flush_avx2.asm \ + sha256_mb/sha256_mb_x4_sse.asm \ + sha256_mb/sha256_mb_x4_avx.asm \ + sha256_mb/sha256_mb_x8_avx2.asm \ + sha256_mb/sha256_multibinary.asm + +lsrc_x86_64 += sha256_mb/sha256_ctx_avx512.c \ + sha256_mb/sha256_mb_mgr_init_avx512.c \ + sha256_mb/sha256_mb_mgr_submit_avx512.asm \ + sha256_mb/sha256_mb_mgr_flush_avx512.asm \ + sha256_mb/sha256_mb_x16_avx512.asm + +lsrc_x86_64 += sha256_mb/sha256_opt_x1.asm + +lsrc_x86_64 += sha256_mb/sha256_ni_x1.asm \ + sha256_mb/sha256_ni_x2.asm \ + sha256_mb/sha256_ctx_sse_ni.c \ + sha256_mb/sha256_ctx_avx512_ni.c \ + sha256_mb/sha256_mb_mgr_submit_sse_ni.asm \ + sha256_mb/sha256_mb_mgr_flush_sse_ni.asm \ + sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm + +lsrc_x86_32 += $(lsrc_x86_64) + +lsrc_aarch64 += sha256_mb/sha256_ctx_base.c \ + sha256_mb/sha256_ref.c + +lsrc_aarch64 += sha256_mb/aarch64/sha256_mb_multibinary.S \ + sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c \ + sha256_mb/aarch64/sha256_ctx_ce.c \ + sha256_mb/aarch64/sha256_mb_mgr_ce.c \ + sha256_mb/aarch64/sha256_mb_x1_ce.S \ + sha256_mb/aarch64/sha256_mb_x2_ce.S \ + sha256_mb/aarch64/sha256_mb_x3_ce.S \ + sha256_mb/aarch64/sha256_mb_x4_ce.S + + +lsrc_base_aliases += sha256_mb/sha256_ctx_base_aliases.c \ + sha256_mb/sha256_ctx_base.c \ + sha256_mb/sha256_ref.c + +src_include += -I $(srcdir)/sha256_mb + +extern_hdrs += include/sha256_mb.h \ + include/multi_buffer.h + +other_src += 
include/datastruct.asm \ + include/multibinary.asm \ + sha256_mb/sha256_job.asm \ + sha256_mb/sha256_mb_mgr_datastruct.asm \ + include/reg_sizes.asm \ + sha256_mb/sha256_ref.c \ + include/memcpy_inline.h \ + include/memcpy.asm \ + include/intrinreg.h + +check_tests += sha256_mb/sha256_mb_test \ + sha256_mb/sha256_mb_rand_test \ + sha256_mb/sha256_mb_rand_update_test \ + sha256_mb/sha256_mb_flush_test + +unit_tests += sha256_mb/sha256_mb_rand_ssl_test + +perf_tests += sha256_mb/sha256_mb_vs_ossl_perf \ + sha256_mb/sha256_mb_vs_ossl_shortage_perf + +sha256_mb_rand_ssl_test: sha256_ref.o +sha256_mb_rand_test: sha256_ref.o +sha256_mb_sha256_mb_rand_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la + +sha256_mb_rand_update_test: sha256_ref.o +sha256_mb_sha256_mb_rand_update_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la + +sha256_mb_flush_test: sha256_ref.o +sha256_mb_sha256_mb_flush_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la + +sha256_mb_rand_ssl_test: LDLIBS += -lcrypto +sha256_mb_sha256_mb_rand_ssl_test_LDFLAGS = -lcrypto + +sha256_mb_vs_ossl_perf: LDLIBS += -lcrypto +sha256_mb_sha256_mb_vs_ossl_perf_LDFLAGS = -lcrypto + +sha256_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto +sha256_mb_sha256_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c new file mode 100644 index 000000000..4776f55bd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c @@ -0,0 +1,256 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdint.h> +#include <string.h> +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +void sha256_mb_mgr_init_ce(SHA256_MB_JOB_MGR * state); +SHA256_JOB *sha256_mb_mgr_submit_ce(SHA256_MB_JOB_MGR * state, SHA256_JOB * job); +SHA256_JOB *sha256_mb_mgr_flush_ce(SHA256_MB_JOB_MGR * state); +static inline void hash_init_digest(SHA256_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len); +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx); + +void sha256_ctx_mgr_init_ce(SHA256_HASH_CTX_MGR * mgr) +{ + sha256_mb_mgr_init_ce(&mgr->mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_ce(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? 
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_fixedlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = + (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_ce(SHA256_HASH_CTX_MGR * mgr) +{ + SHA256_HASH_CTX *ctx; + + while (1) { + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_ce(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. 
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_fixedlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA256_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = + (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_ce_slver_02020142; +struct slver sha256_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 }; + +struct slver sha256_ctx_mgr_submit_ce_slver_02020143; +struct slver sha256_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 }; + +struct slver sha256_ctx_mgr_flush_ce_slver_02020144; +struct slver sha256_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c 
b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c new file mode 100644 index 000000000..8627991c3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c @@ -0,0 +1,59 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include <aarch64_multibinary.h> + +DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_submit) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA2) + return PROVIDER_INFO(sha256_ctx_mgr_submit_ce); + + return PROVIDER_BASIC(sha256_ctx_mgr_submit); + +} + +DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_init) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA2) + return PROVIDER_INFO(sha256_ctx_mgr_init_ce); + + return PROVIDER_BASIC(sha256_ctx_mgr_init); + +} + +DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_flush) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if (auxval & HWCAP_SHA2) + return PROVIDER_INFO(sha256_ctx_mgr_flush_ce); + + return PROVIDER_BASIC(sha256_ctx_mgr_flush); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c new file mode 100644 index 000000000..aa63c4dd8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c @@ -0,0 +1,254 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <stddef.h> +#include <sha256_mb.h> +#include <assert.h> + +#ifndef max +#define max(a,b) (((a) > (b)) ? (a) : (b)) +#endif + +#ifndef min +#define min(a,b) (((a) < (b)) ? 
(a) : (b)) +#endif + +#define SHA256_MB_CE_MAX_LANES 3 + +#if SHA256_MB_CE_MAX_LANES >=4 +void sha256_mb_ce_x4(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int); +#endif +#if SHA256_MB_CE_MAX_LANES >=3 +void sha256_mb_ce_x3(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int); +#endif +#if SHA256_MB_CE_MAX_LANES >=2 +void sha256_mb_ce_x2(SHA256_JOB *, SHA256_JOB *, int); +#endif +void sha256_mb_ce_x1(SHA256_JOB *, int); + +#define LANE_IS_NOT_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FINISHED(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL) +#define LANE_IS_FREE(state,i) \ + (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL) +#define LANE_IS_INVALID(state,i) \ + (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL) +void sha256_mb_mgr_init_ce(SHA256_MB_JOB_MGR * state) +{ + int i; + + state->unused_lanes = 0xf; + state->num_lanes_inuse = 0; + for (i = SHA256_MB_CE_MAX_LANES - 1; i >= 0; i--) { + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->lens[i] = i; + state->ldata[i].job_in_lane = 0; + } + + //lanes > SHA1_MB_CE_MAX_LANES is invalid lane + for (i = SHA256_MB_CE_MAX_LANES; i < SHA256_MAX_LANES; i++) { + state->lens[i] = 0xf; + state->ldata[i].job_in_lane = 0; + } +} + +static int sha256_mb_mgr_do_jobs(SHA256_MB_JOB_MGR * state) +{ + int lane_idx, len, i, lanes; + + int lane_idx_array[SHA256_MAX_LANES]; + + if (state->num_lanes_inuse == 0) { + return -1; + } +#if SHA256_MB_CE_MAX_LANES == 4 + if (state->num_lanes_inuse == 4) { + len = min(min(state->lens[0], state->lens[1]), + min(state->lens[2], state->lens[3])); + lane_idx = len & 0xf; + len &= ~0xf; + + sha256_mb_ce_x4(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, + state->ldata[2].job_in_lane, + state->ldata[3].job_in_lane, len >> 4); + + } else +#elif SHA256_MB_CE_MAX_LANES == 3 + if (state->num_lanes_inuse == 3) { + len = min(min(state->lens[0], 
state->lens[1]), state->lens[2]); + lane_idx = len & 0xf; + len &= ~0xf; + + sha256_mb_ce_x3(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, + state->ldata[2].job_in_lane, len >> 4); + + } else +#elif SHA256_MB_CE_MAX_LANES == 2 + if (state->num_lanes_inuse == 2) { + len = min(state->lens[0], state->lens[1]); + lane_idx = len & 0xf; + len &= ~0xf; + + sha256_mb_ce_x2(state->ldata[0].job_in_lane, + state->ldata[1].job_in_lane, len >> 4); + + } else +#endif + { + lanes = 0, len = 0; + for (i = 0; i < SHA256_MAX_LANES && lanes < state->num_lanes_inuse; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + if (lanes) + len = min(len, state->lens[i]); + else + len = state->lens[i]; + lane_idx_array[lanes] = i; + lanes++; + } + } + if (lanes == 0) + return -1; + lane_idx = len & 0xf; + len = len & (~0xf); +#if SHA256_MB_CE_MAX_LANES >=4 + if (lanes == 4) { + sha256_mb_ce_x4(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, + state->ldata[lane_idx_array[2]].job_in_lane, + state->ldata[lane_idx_array[3]].job_in_lane, len >> 4); + + } else +#endif +#if SHA256_MB_CE_MAX_LANES >=3 + if (lanes == 3) { + sha256_mb_ce_x3(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, + state->ldata[lane_idx_array[2]].job_in_lane, len >> 4); + } else +#endif +#if SHA256_MB_CE_MAX_LANES >=2 + if (lanes == 2) { + sha256_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane, + state->ldata[lane_idx_array[1]].job_in_lane, len >> 4); + } else +#endif + { + sha256_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4); + } + } + //only return the min length job + for (i = 0; i < SHA256_MAX_LANES; i++) { + if (LANE_IS_NOT_FINISHED(state, i)) { + state->lens[i] -= len; + state->ldata[i].job_in_lane->len -= len; + state->ldata[i].job_in_lane->buffer += len << 2; + } + } + + return lane_idx; + +} + +static SHA256_JOB *sha256_mb_mgr_free_lane(SHA256_MB_JOB_MGR * state) +{ + int i; + SHA256_JOB *ret = NULL; 
+ + for (i = 0; i < SHA256_MB_CE_MAX_LANES; i++) { + if (LANE_IS_FINISHED(state, i)) { + + state->unused_lanes <<= 4; + state->unused_lanes |= i; + state->num_lanes_inuse--; + ret = state->ldata[i].job_in_lane; + ret->status = STS_COMPLETED; + state->ldata[i].job_in_lane = NULL; + break; + } + } + return ret; +} + +static void sha256_mb_mgr_insert_job(SHA256_MB_JOB_MGR * state, SHA256_JOB * job) +{ + int lane_idx; + //add job into lanes + lane_idx = state->unused_lanes & 0xf; + //fatal error + assert(lane_idx < SHA256_MB_CE_MAX_LANES); + state->lens[lane_idx] = (job->len << 4) | lane_idx; + state->ldata[lane_idx].job_in_lane = job; + state->unused_lanes >>= 4; + state->num_lanes_inuse++; +} + +SHA256_JOB *sha256_mb_mgr_submit_ce(SHA256_MB_JOB_MGR * state, SHA256_JOB * job) +{ +#ifndef NDEBUG + int lane_idx; +#endif + SHA256_JOB *ret; + + //add job into lanes + sha256_mb_mgr_insert_job(state, job); + + ret = sha256_mb_mgr_free_lane(state); + if (ret != NULL) { + return ret; + } + //submit will wait all lane has data + if (state->num_lanes_inuse < SHA256_MB_CE_MAX_LANES) + return NULL; +#ifndef NDEBUG + lane_idx = sha256_mb_mgr_do_jobs(state); + assert(lane_idx != -1); +#else + sha256_mb_mgr_do_jobs(state); +#endif + + //~ i = lane_idx; + ret = sha256_mb_mgr_free_lane(state); + return ret; +} + +SHA256_JOB *sha256_mb_mgr_flush_ce(SHA256_MB_JOB_MGR * state) +{ + SHA256_JOB *ret; + ret = sha256_mb_mgr_free_lane(state); + if (ret) { + return ret; + } + + sha256_mb_mgr_do_jobs(state); + return sha256_mb_mgr_free_lane(state); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S new file mode 100644 index 000000000..ecc5fc5f5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S @@ -0,0 +1,36 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <aarch64_multibinary.h>

/*
 * One mbin_interface invocation per public SHA-256 context-manager entry
 * point (submit, init, flush).
 * NOTE(review): mbin_interface comes from aarch64_multibinary.h, which is
 * not visible here; presumably it emits the dispatching stub that pairs
 * with the matching DEFINE_INTERFACE_DISPATCHER in
 * sha256_mb_aarch64_dispatcher.c -- confirm against that header.
 */
mbin_interface sha256_ctx_mgr_submit
mbin_interface sha256_ctx_mgr_init
mbin_interface sha256_ctx_mgr_flush
	.arch armv8-a+crypto
	.text
	.align	2
	.p2align 3,,7

/*
Macros
*/

/*
Give one vector register three aliases: <name>_q, <name>_v and <name>_s.
*/
.macro	declare_var_vector_reg name:req,reg:req
	\name\()_q	.req	q\reg
	\name\()_v	.req	v\reg
	\name\()_s	.req	s\reg
.endm
/**
macros for round 48-63

Performs four rounds using the message+constant sum already held in
l0_<tmp0>, and at the same time loads the next 16 bytes of round constants
from [tmp] (advancing tmp) and prepares l0_<tmp1> = l0_<msg> + constants
for the following invocation.
*/
.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
	ldr		key_q , [tmp]
	mov		l0_tmp2_v.16b,l0_abcd_v.16b
	add		tmp,tmp,16
	add		l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s
	sha256h		l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
	sha256h2	l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s

.endm
/**
macros for round 0-47

Same four rounds as sha256_4_rounds_high, wrapped in the sha256su0 /
sha256su1 pair that updates l0_<msg0> in place for a later round group.
*/
.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
	sha256su0	l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
	sha256_4_rounds_high	\msg1,\tmp0,\tmp1
	sha256su1	l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
.endm


/*
Variable list
*/

	declare_var_vector_reg	key,31


/*
digest variables
*/
	declare_var_vector_reg	l0_abcd,0
	declare_var_vector_reg	l0_efgh,1
	declare_var_vector_reg	l0_abcd_saved,5
	declare_var_vector_reg	l0_efgh_saved,6
/*
Temporary variables
*/
	declare_var_vector_reg	l0_tmp0,2
	declare_var_vector_reg	l0_tmp1,3
	declare_var_vector_reg	l0_tmp2,4
/*
Message variables
*/
	declare_var_vector_reg	l0_msg0,16
	declare_var_vector_reg	l0_msg1,17
	declare_var_vector_reg	l0_msg2,18
	declare_var_vector_reg	l0_msg3,19



/*
	void sha256_mb_ce_x1(SHA256_JOB * l0_job, int len);

	Hashes 64-byte blocks for a single job.  The loop body runs before
	the count is tested, so one block is processed even if len is 0.
	The data pointer is read from offset 0 of the job and the digest from
	offsets 64 and 80; only the digest is written back, the data pointer
	in the job is left unchanged.
	NOTE(review): offsets 0/64/80 presumably match the buffer and
	result_digest members of SHA256_JOB -- confirm against the struct.
*/
/*
Arguments list
*/
	l0_job 	.req	x0
	len	.req	w1
	l0_data	.req	x2
	tmp	.req	x3
	.global sha256_mb_ce_x1
	.type sha256_mb_ce_x1, %function
sha256_mb_ce_x1:
	ldr	l0_data, [l0_job]
	ldr	l0_abcd_q, [l0_job, 64]
	ldr	l0_efgh_q, [l0_job, 80]



start_loop:
	// tmp walks the KEY table from its start for every block
	adr	tmp, KEY
	//load msgs
	ld1	{l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
	ldr	key_q,[tmp]
	add	tmp,tmp,16
	//adjust loop parameter
	add	l0_data,l0_data,64
	sub	len, len, #1
	// this compare is the one tested by the bgt at the bottom of the loop
	cmp	len, 0
	//backup digest
	mov	l0_abcd_saved_v.16b,l0_abcd_v.16b
	mov	l0_efgh_saved_v.16b,l0_efgh_v.16b

	// byte-reverse each 32-bit message word, and form msg0 + constants
	// for the first four rounds
	rev32	l0_msg0_v.16b,l0_msg0_v.16b
	rev32	l0_msg1_v.16b,l0_msg1_v.16b
	add	l0_tmp0_v.4s,l0_msg0_v.4s,key_v.4s
	rev32	l0_msg2_v.16b,l0_msg2_v.16b
	rev32	l0_msg3_v.16b,l0_msg3_v.16b



	sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */
	sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
	sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
	sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0

	sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */
	sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
	sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
	sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
	sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */
	sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
	sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
	sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0

	sha256_4_rounds_high msg1,tmp0,tmp1	/* rounds 48-51 */
	sha256_4_rounds_high msg2,tmp1,tmp0
	sha256_4_rounds_high msg3,tmp0,tmp1

	/* rounds 60-63 */
	// open-coded: nothing is left to prepare for a following round group
	mov		l0_tmp2_v.16b,l0_abcd_v.16b
	sha256h		l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
	sha256h2	l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s



	// add the digest saved at the top of the loop back in
	add	l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
	add	l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s


	bgt	start_loop
	str	l0_abcd_q, [l0_job, 64]
	str	l0_efgh_q, [l0_job, 80]

	ret

	.size sha256_mb_ce_x1, .-sha256_mb_ce_x1
	// NOTE(review): the section name ".rol0_data.cst16" looks like a
	// search-and-replace artifact of ".rodata.cst16" -- confirm before
	// changing, since it affects where the linker places KEY.
	.section .rol0_data.cst16,"aM",@progbits,16
	.align  4
// The 64 32-bit round constants, consumed 16 bytes at a time through tmp.
KEY:
	.word 0x428A2F98
	.word 0x71374491
	.word 0xB5C0FBCF
	.word 0xE9B5DBA5
	.word 0x3956C25B
	.word 0x59F111F1
	.word 0x923F82A4
	.word 0xAB1C5ED5
	.word 0xD807AA98
	.word 0x12835B01
	.word 0x243185BE
	.word 0x550C7DC3
	.word 0x72BE5D74
	.word 0x80DEB1FE
	.word 0x9BDC06A7
	.word 0xC19BF174
	.word 0xE49B69C1
	.word 0xEFBE4786
	.word 0x0FC19DC6
	.word 0x240CA1CC
	.word 0x2DE92C6F
	.word 0x4A7484AA
	.word 0x5CB0A9DC
	.word 0x76F988DA
	.word 0x983E5152
	.word 0xA831C66D
	.word 0xB00327C8
	.word 0xBF597FC7
	.word 0xC6E00BF3
	.word 0xD5A79147
	.word 0x06CA6351
	.word 0x14292967
	.word 0x27B70A85
	.word 0x2E1B2138
	.word 0x4D2C6DFC
	.word 0x53380D13
	.word 0x650A7354
	.word 0x766A0ABB
	.word 0x81C2C92E
	.word 0x92722C85
	.word 0xA2BFE8A1
	.word 0xA81A664B
	.word 0xC24B8B70
	.word 0xC76C51A3
	.word 0xD192E819
	.word 0xD6990624
	.word 0xF40E3585
	.word 0x106AA070
	.word 0x19A4C116
	.word 0x1E376C08
	.word 0x2748774C
	.word 0x34B0BCB5
	.word 0x391C0CB3
	.word 0x4ED8AA4A
	.word 0x5B9CCA4F
	.word 0x682E6FF3
	.word 0x748F82EE
	.word 0x78A5636F
	.word 0x84C87814
	.word 0x8CC70208
	.word 0x90BEFFFA
	.word 0xA4506CEB
	.word 0xBEF9A3F7
	.word 0xC67178F2
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+	.arch armv8-a+crypto
+	.text
+	.align	2
+	.p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro	declare_var_vector_reg name:req,reg:req
+	\name\()_q	.req	q\reg
+	\name\()_v	.req	v\reg
+	\name\()_s	.req	s\reg
+.endm
+/**
+Macro for rounds 48-63 (also the core of sha256_4_rounds_low)
+  msg  : message register whose words are summed with the next constants
+  tmp0 : temporary that already holds the word+constant sum for these rounds
+  tmp1 : temporary that receives the sum for the following four rounds
+  Loads the next 16 bytes of KEY through tmp (advanced by 16) and runs
+  sha256h/sha256h2 for both lanes. lN_tmp2 keeps the pre-round abcd that
+  sha256h2 still needs after sha256h has overwritten lN_abcd.
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+	ldr		key_q , [tmp]
+	mov		l0_tmp2_v.16b,l0_abcd_v.16b
+	mov		l1_tmp2_v.16b,l1_abcd_v.16b
+	add		tmp,tmp,16
+	add		l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+	add		l1_\tmp1\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+	sha256h		l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+	sha256h		l1_abcd_q,l1_efgh_q,l1_\tmp0\()_v.4s
+	sha256h2	l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+	sha256h2	l1_efgh_q,l1_tmp2_q,l1_\tmp0\()_v.4s
+
+.endm
+/**
+Macro for rounds 0-47
+  Same four rounds as sha256_4_rounds_high (invoked with msg1, tmp0, tmp1),
+  wrapped in the sha256su0/sha256su1 pair that updates lN_msg0 in place
+  from msg1, msg2 and msg3 for both lanes.
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+	sha256su0	l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+	sha256su0	l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+	sha256_4_rounds_high	\msg1,\tmp0,\tmp1
+	sha256su1	l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+	sha256su1	l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+  Each declare_var_vector_reg line creates the _q, _v and _s aliases of one
+  vector register. The l0_ and l1_ prefixes select the first and second job.
+*/
+
+	declare_var_vector_reg	key,31
+
+
+/*
+digest variables
+*/
+	declare_var_vector_reg	l0_abcd,0
+	declare_var_vector_reg	l0_efgh,1
+	declare_var_vector_reg	l0_abcd_saved,2
+	declare_var_vector_reg	l0_efgh_saved,3
+	declare_var_vector_reg	l1_abcd,4
+	declare_var_vector_reg	l1_efgh,5
+	declare_var_vector_reg	l1_abcd_saved,6
+	declare_var_vector_reg	l1_efgh_saved,7
+/*
+Temporary variables
+  These map to v8-v13, inside the d8-d15 range that the function saves and
+  restores around its body.
+*/
+	declare_var_vector_reg	l0_tmp0,8
+	declare_var_vector_reg	l0_tmp1,9
+	declare_var_vector_reg	l0_tmp2,10
+	declare_var_vector_reg	l1_tmp0,11
+	declare_var_vector_reg	l1_tmp1,12
+	declare_var_vector_reg	l1_tmp2,13
+/*
+Message variables
+*/
+	declare_var_vector_reg	l0_msg0,16
+	declare_var_vector_reg	l0_msg1,17
+	declare_var_vector_reg	l0_msg2,18
+	declare_var_vector_reg	l0_msg3,19
+	declare_var_vector_reg	l1_msg0,20
+	declare_var_vector_reg	l1_msg1,21
+	declare_var_vector_reg	l1_msg2,22
+	declare_var_vector_reg	l1_msg3,23
+
+
+
+/*
+	void sha256_mb_ce_x2(SHA256_JOB *, SHA256_JOB *, int);
+
+	Runs the block loop for two jobs in lock step. For each job the message
+	pointer is read from offset 0 and the digest from offsets 64 and 80;
+	the updated digests are written back once, after the last block.
+	The int argument is the loop count: every iteration consumes one
+	64-byte block per job. The count is tested at the bottom of the loop,
+	so the body executes at least once.
+*/
+/*
+Arguments list
+  l0_job, l1_job   (x0, x1) job pointers
+  len              (w2)     loop counter
+  l0_data, l1_data (x3, x4) running message pointers, loaded from the jobs
+  tmp              (x5)     cursor into the KEY table
+*/
+	l0_job	.req	x0
+	l1_job	.req	x1
+	len	.req	w2
+	l0_data	.req	x3
+	l1_data	.req	x4
+	tmp	.req	x5
+	.global	sha256_mb_ce_x2
+	.type	sha256_mb_ce_x2, %function
+sha256_mb_ce_x2:
+	//push d8~d15
+	stp	d8,d9,[sp,-192]!	// reserves 192 bytes; only the first 64 are used for d8-d15
+	stp	d10,d11,[sp,16]
+	stp	d12,d13,[sp,32]
+	stp	d14,d15,[sp,48]
+	ldr	l0_data, [l0_job]
+	ldr	l0_abcd_q, [l0_job, 64]
+	ldr	l0_efgh_q, [l0_job, 80]
+	ldr	l1_data, [l1_job]
+	ldr	l1_abcd_q, [l1_job, 64]
+	ldr	l1_efgh_q, [l1_job, 80]
+
+
+
+start_loop:
+
+	//load key addr
+	adr	tmp, KEY
+	//load msgs
+	ld1	{l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+	ld1	{l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+	ldr	key_q,[tmp]	// first 16 bytes (four words) of KEY
+	add	tmp,tmp,16
+	//adjust loop parameter
+	add	l0_data,l0_data,64
+	add	l1_data,l1_data,64
+	sub	len, len, #1
+	cmp	len, 0	// flags are consumed by the bgt at the bottom; nothing in between is a flag-setting instruction
+	//backup digest
+	mov	l0_abcd_saved_v.16b,l0_abcd_v.16b
+	mov	l0_efgh_saved_v.16b,l0_efgh_v.16b
+	mov	l1_abcd_saved_v.16b,l1_abcd_v.16b
+	mov	l1_efgh_saved_v.16b,l1_efgh_v.16b
+
+	rev32	l0_msg0_v.16b,l0_msg0_v.16b	// rev32 reverses the bytes inside each 32-bit word
+	rev32	l0_msg1_v.16b,l0_msg1_v.16b
+	add	l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s	// sum for rounds 0-3: first four words + first four constants
+	rev32	l0_msg2_v.16b,l0_msg2_v.16b
+	rev32	l0_msg3_v.16b,l0_msg3_v.16b
+
+	rev32	l1_msg0_v.16b,l1_msg0_v.16b
+	rev32	l1_msg1_v.16b,l1_msg1_v.16b
+	add	l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+	rev32	l1_msg2_v.16b,l1_msg2_v.16b
+	rev32	l1_msg3_v.16b,l1_msg3_v.16b
+
+
+
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+
+	sha256_4_rounds_high	msg1,tmp0,tmp1 /* rounds 48-51 */
+	sha256_4_rounds_high	msg2,tmp1,tmp0
+	sha256_4_rounds_high	msg3,tmp0,tmp1
+
+	/* rounds 60-63 */
+	mov	l0_tmp2_v.16b,l0_abcd_v.16b	// last group: the sum is already in tmp1, no further constants are loaded
+	sha256h	l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+	sha256h2	l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+	mov	l1_tmp2_v.16b,l1_abcd_v.16b
+	sha256h	l1_abcd_q,l1_efgh_q,l1_tmp1_v.4s
+	sha256h2	l1_efgh_q,l1_tmp2_q,l1_tmp1_v.4s
+
+
+
+	add	l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s	// add the digest saved at the top of this iteration
+	add	l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+	add	l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+	add	l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+
+
+	bgt	start_loop	// signed len > 0 from the cmp near the top of the loop
+	str	l0_abcd_q, [l0_job, 64]
+	str	l0_efgh_q, [l0_job, 80]
+	str	l1_abcd_q, [l1_job, 64]
+	str	l1_efgh_q, [l1_job, 80]
+
+	ldp	d10,d11,[sp,16]
+	ldp	d12,d13,[sp,32]
+	ldp	d14,d15,[sp,48]
+	ldp	d8, d9, [sp], 192	// releases the 192-byte frame
+	ret
+
+	.size	sha256_mb_ce_x2, .-sha256_mb_ce_x2
+	.section	.rol0_data.cst16,"aM",@progbits,16	// NOTE(review): unusual section name; looks like .rodata.cst16 hit by a data to l0_data rename - confirm before touching
+	.align	4
+KEY:	// 64 words, consumed 16 bytes at a time through tmp; values match the SHA-256 round constants
+	.word	0x428A2F98
+	.word	0x71374491
+	.word	0xB5C0FBCF
+	.word	0xE9B5DBA5
+	.word	0x3956C25B
+	.word	0x59F111F1
+	.word	0x923F82A4
+	.word	0xAB1C5ED5
+	.word	0xD807AA98
+	.word	0x12835B01
+	.word	0x243185BE
+	.word	0x550C7DC3
+	.word	0x72BE5D74
+	.word	0x80DEB1FE
+	.word	0x9BDC06A7
+	.word	0xC19BF174
+	.word	0xE49B69C1
+	.word	0xEFBE4786
+	.word	0x0FC19DC6
+	.word	0x240CA1CC
+	.word	0x2DE92C6F
+	.word	0x4A7484AA
+	.word	0x5CB0A9DC
+	.word	0x76F988DA
+	.word	0x983E5152
+	.word	0xA831C66D
+	.word	0xB00327C8
+	.word	0xBF597FC7
+	.word	0xC6E00BF3
+	.word	0xD5A79147
+	.word	0x06CA6351
+	.word	0x14292967
+	.word	0x27B70A85
+	.word	0x2E1B2138
+	.word	0x4D2C6DFC
+	.word	0x53380D13
+	.word	0x650A7354
+	.word	0x766A0ABB
+	.word	0x81C2C92E
+	.word	0x92722C85
+	.word	0xA2BFE8A1
+	.word	0xA81A664B
+	.word	0xC24B8B70
+	.word	0xC76C51A3
+	.word	0xD192E819
+	.word	0xD6990624
+	.word	0xF40E3585
+	.word	0x106AA070
+	.word	0x19A4C116
+	.word	0x1E376C08
+	.word	0x2748774C
+	.word	0x34B0BCB5
+	.word	0x391C0CB3
+	.word	0x4ED8AA4A
+	.word	0x5B9CCA4F
+	.word	0x682E6FF3
+	.word	0x748F82EE
+	.word	0x78A5636F
+	.word	0x84C87814
+	.word	0x8CC70208
+	.word	0x90BEFFFA
+	.word	0xA4506CEB
+	.word	0xBEF9A3F7
+	.word	0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S
new file mode 100644
index 000000000..6ed1591ba
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S
@@ -0,0 +1,342 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+	.arch armv8-a+crypto
+	.text
+	.align	2
+	.p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro	declare_var_vector_reg name:req,reg:req
+	\name\()_q	.req	q\reg
+	\name\()_v	.req	v\reg
+	\name\()_s	.req	s\reg
+.endm
+/**
+Macro for rounds 48-63 (also the core of sha256_4_rounds_low)
+  msg  : message register whose words are summed with the next constants
+  tmp0 : temporary that already holds the word+constant sum for these rounds
+  tmp1 : temporary that receives the sum for the following four rounds
+  Loads the next 16 bytes of KEY through tmp (advanced by 16) and runs
+  sha256h/sha256h2 for all three lanes. lN_tmp2 keeps the pre-round abcd
+  that sha256h2 still needs after sha256h has overwritten lN_abcd.
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+	ldr		key_q , [tmp]
+	mov		l0_tmp2_v.16b,l0_abcd_v.16b
+	mov		l1_tmp2_v.16b,l1_abcd_v.16b
+	mov		l2_tmp2_v.16b,l2_abcd_v.16b
+	add		tmp,tmp,16
+	add		l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+	add		l1_\tmp1\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+	add		l2_\tmp1\()_v.4s,l2_\msg\()_v.4s,key_v.4s
+	sha256h		l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+	sha256h		l1_abcd_q,l1_efgh_q,l1_\tmp0\()_v.4s
+	sha256h		l2_abcd_q,l2_efgh_q,l2_\tmp0\()_v.4s
+	sha256h2	l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+	sha256h2	l1_efgh_q,l1_tmp2_q,l1_\tmp0\()_v.4s
+	sha256h2	l2_efgh_q,l2_tmp2_q,l2_\tmp0\()_v.4s
+
+.endm
+/**
+Macro for rounds 0-47
+  Same four rounds as sha256_4_rounds_high (invoked with msg1, tmp0, tmp1),
+  wrapped in the sha256su0/sha256su1 pair that updates lN_msg0 in place
+  from msg1, msg2 and msg3 for all three lanes.
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+	sha256su0	l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+	sha256su0	l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+	sha256su0	l2_\msg0\()_v.4s,l2_\msg1\()_v.4s
+	sha256_4_rounds_high	\msg1,\tmp0,\tmp1
+	sha256su1	l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+	sha256su1	l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+	sha256su1	l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+  Each declare_var_vector_reg line creates the _q, _v and _s aliases of one
+  vector register. The l0_, l1_ and l2_ prefixes select the job.
+*/
+
+	declare_var_vector_reg	key,31
+
+
+/*
+digest variables
+  The *_saved aliases reuse v16/v17, v20/v21 and v24/v25, which are also
+  declared below as msg0/msg1 of the three lanes. They are therefore only
+  loaded once msg0 and msg1 are no longer read (after rounds 48-51).
+*/
+	declare_var_vector_reg	l0_abcd,0
+	declare_var_vector_reg	l0_efgh,1
+	declare_var_vector_reg	l1_abcd,2
+	declare_var_vector_reg	l1_efgh,3
+	declare_var_vector_reg	l2_abcd,4
+	declare_var_vector_reg	l2_efgh,5
+	declare_var_vector_reg	l1_abcd_saved,16
+	declare_var_vector_reg	l1_efgh_saved,17
+	declare_var_vector_reg	l0_abcd_saved,20
+	declare_var_vector_reg	l0_efgh_saved,21
+	declare_var_vector_reg	l2_abcd_saved,24
+	declare_var_vector_reg	l2_efgh_saved,25
+/*
+Temporary variables
+  v8-v14 fall inside the d8-d15 range that the function saves and restores.
+*/
+	declare_var_vector_reg	l0_tmp0,6
+	declare_var_vector_reg	l0_tmp1,7
+	declare_var_vector_reg	l0_tmp2,8
+	declare_var_vector_reg	l1_tmp0,9
+	declare_var_vector_reg	l1_tmp1,10
+	declare_var_vector_reg	l1_tmp2,11
+	declare_var_vector_reg	l2_tmp0,12
+	declare_var_vector_reg	l2_tmp1,13
+	declare_var_vector_reg	l2_tmp2,14
+/*
+Message variables
+*/
+	declare_var_vector_reg	l0_msg0,16
+	declare_var_vector_reg	l0_msg1,17
+	declare_var_vector_reg	l0_msg2,18
+	declare_var_vector_reg	l0_msg3,19
+	declare_var_vector_reg	l1_msg0,20
+	declare_var_vector_reg	l1_msg1,21
+	declare_var_vector_reg	l1_msg2,22
+	declare_var_vector_reg	l1_msg3,23
+	declare_var_vector_reg	l2_msg0,24
+	declare_var_vector_reg	l2_msg1,25
+	declare_var_vector_reg	l2_msg2,26
+	declare_var_vector_reg	l2_msg3,27
+
+
+
+/*
+	void sha256_mb_ce_x3(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+
+	Runs the block loop for three jobs in lock step. For each job the
+	message pointer is read from offset 0 and the digest from offsets 64
+	and 80. The digests are written back to the jobs at the end of every
+	iteration, and the copy in the job is what gets re-read as the saved
+	digest during the next iteration.
+	The int argument is the loop count: every iteration consumes one
+	64-byte block per job. The count is tested at the bottom of the loop,
+	so the body executes at least once.
+*/
+/*
+Arguments list
+  l0_job .. l2_job   (x0-x2) job pointers
+  len                (w3)    loop counter
+  l0_data .. l2_data (x4-x6) running message pointers, loaded from the jobs
+  tmp                (x7)    cursor into the KEY table
+*/
+	l0_job	.req	x0
+	l1_job	.req	x1
+	l2_job	.req	x2
+	len	.req	w3
+	l0_data	.req	x4
+	l1_data	.req	x5
+	l2_data	.req	x6
+	tmp	.req	x7
+	.global	sha256_mb_ce_x3
+	.type	sha256_mb_ce_x3, %function
+sha256_mb_ce_x3:
+	//push d8~d15
+	stp	d8,d9,[sp,-192]!	// reserves 192 bytes; only the first 64 are used for d8-d15
+	stp	d10,d11,[sp,16]
+	stp	d12,d13,[sp,32]
+	stp	d14,d15,[sp,48]
+	ldr	l0_data, [l0_job]
+	ldr	l0_abcd_q, [l0_job, 64]
+	ldr	l0_efgh_q, [l0_job, 80]
+	ldr	l1_data, [l1_job]
+	ldr	l1_abcd_q, [l1_job, 64]
+	ldr	l1_efgh_q, [l1_job, 80]
+	ldr	l2_data, [l2_job]
+	ldr	l2_abcd_q, [l2_job, 64]
+	ldr	l2_efgh_q, [l2_job, 80]
+
+
+
+start_loop:
+
+	//load key addr
+	adr	tmp, KEY
+	//load msgs
+	ld1	{l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+	ld1	{l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+	ld1	{l2_msg0_v.4s-l2_msg3_v.4s},[l2_data]
+	ldr	key_q,[tmp]	// first 16 bytes (four words) of KEY
+	add	tmp,tmp,16
+	//adjust loop parameter
+	add	l0_data,l0_data,64
+	add	l1_data,l1_data,64
+	add	l2_data,l2_data,64
+	sub	len, len, #1
+	cmp	len, 0	// flags are consumed by the bgt at the bottom; nothing in between is a flag-setting instruction
+/*
+	Disabled register backup, kept for reference. The *_saved aliases share
+	registers with msg0/msg1, so a copy made here would be overwritten by
+	the rounds. The saved digest is reloaded from the job structures after
+	rounds 48-51 instead.
+	//backup digest
+	mov	l0_abcd_saved_v.16b,l0_abcd_v.16b
+	mov	l0_efgh_saved_v.16b,l0_efgh_v.16b
+	mov	l1_abcd_saved_v.16b,l1_abcd_v.16b
+	mov	l1_efgh_saved_v.16b,l1_efgh_v.16b
+	mov	l2_abcd_saved_v.16b,l2_abcd_v.16b
+	mov	l2_efgh_saved_v.16b,l2_efgh_v.16b
+*/
+
+	rev32	l0_msg0_v.16b,l0_msg0_v.16b	// rev32 reverses the bytes inside each 32-bit word
+	rev32	l0_msg1_v.16b,l0_msg1_v.16b
+	add	l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s	// sum for rounds 0-3: first four words + first four constants
+	rev32	l0_msg2_v.16b,l0_msg2_v.16b
+	rev32	l0_msg3_v.16b,l0_msg3_v.16b
+
+	rev32	l1_msg0_v.16b,l1_msg0_v.16b
+	rev32	l1_msg1_v.16b,l1_msg1_v.16b
+	add	l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+	rev32	l1_msg2_v.16b,l1_msg2_v.16b
+	rev32	l1_msg3_v.16b,l1_msg3_v.16b
+
+	rev32	l2_msg0_v.16b,l2_msg0_v.16b
+	rev32	l2_msg1_v.16b,l2_msg1_v.16b
+	add	l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s
+	rev32	l2_msg2_v.16b,l2_msg2_v.16b
+	rev32	l2_msg3_v.16b,l2_msg3_v.16b
+
+
+
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp1,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0,tmp1
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp1,tmp0
+
+
+
+	sha256_4_rounds_high	msg1,tmp0,tmp1 /* rounds 48-51 */
+
+	/* msg0 msg1 is free , share with digest regs */
+	ldr	l0_abcd_saved_q, [l0_job, 64]	// the job still holds the digest this block started from
+	ldr	l1_abcd_saved_q, [l1_job, 64]
+	ldr	l2_abcd_saved_q, [l2_job, 64]
+	ldr	l0_efgh_saved_q, [l0_job, 80]
+	ldr	l1_efgh_saved_q, [l1_job, 80]
+	ldr	l2_efgh_saved_q, [l2_job, 80]
+
+	sha256_4_rounds_high	msg2,tmp1,tmp0
+	sha256_4_rounds_high	msg3,tmp0,tmp1
+
+	/* rounds 60-63 */
+	mov	l0_tmp2_v.16b,l0_abcd_v.16b	// last group: the sum is already in tmp1, no further constants are loaded
+	sha256h	l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+	sha256h2	l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+	mov	l1_tmp2_v.16b,l1_abcd_v.16b
+	sha256h	l1_abcd_q,l1_efgh_q,l1_tmp1_v.4s
+	sha256h2	l1_efgh_q,l1_tmp2_q,l1_tmp1_v.4s
+
+	mov	l2_tmp2_v.16b,l2_abcd_v.16b
+	sha256h	l2_abcd_q,l2_efgh_q,l2_tmp1_v.4s
+	sha256h2	l2_efgh_q,l2_tmp2_q,l2_tmp1_v.4s
+
+	/* combine state */
+	add	l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+	add	l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+	add	l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+	add	l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+	add	l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s
+	add	l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s
+
+	str	l0_abcd_q, [l0_job, 64]	// stored every iteration so the next reload of *_saved sees the current digest
+	str	l0_efgh_q, [l0_job, 80]
+	str	l1_abcd_q, [l1_job, 64]
+	str	l1_efgh_q, [l1_job, 80]
+	str	l2_abcd_q, [l2_job, 64]
+	str	l2_efgh_q, [l2_job, 80]
+
+	bgt	start_loop	// signed len > 0 from the cmp near the top of the loop
+
+
+	ldp	d10,d11,[sp,16]
+	ldp	d12,d13,[sp,32]
+	ldp	d14,d15,[sp,48]
+	ldp	d8, d9, [sp], 192	// releases the 192-byte frame
+	ret
+
+	.size	sha256_mb_ce_x3, .-sha256_mb_ce_x3
+	.section	.rol0_data.cst16,"aM",@progbits,16	// NOTE(review): unusual section name; looks like .rodata.cst16 hit by a data to l0_data rename - confirm before touching
+	.align	4
+KEY:	// 64 words, consumed 16 bytes at a time through tmp; values match the SHA-256 round constants
+	.word	0x428A2F98
+	.word	0x71374491
+	.word	0xB5C0FBCF
+	.word	0xE9B5DBA5
+	.word	0x3956C25B
+	.word	0x59F111F1
+	.word	0x923F82A4
+	.word	0xAB1C5ED5
+	.word	0xD807AA98
+	.word	0x12835B01
+	.word	0x243185BE
+	.word	0x550C7DC3
+	.word	0x72BE5D74
+	.word	0x80DEB1FE
+	.word	0x9BDC06A7
+	.word	0xC19BF174
+	.word	0xE49B69C1
+	.word	0xEFBE4786
+	.word	0x0FC19DC6
+	.word	0x240CA1CC
+	.word	0x2DE92C6F
+	.word	0x4A7484AA
+	.word	0x5CB0A9DC
+	.word	0x76F988DA
+	.word	0x983E5152
+	.word	0xA831C66D
+	.word	0xB00327C8
+	.word	0xBF597FC7
+	.word	0xC6E00BF3
+	.word	0xD5A79147
+	.word	0x06CA6351
+	.word	0x14292967
+	.word	0x27B70A85
+	.word	0x2E1B2138
+	.word	0x4D2C6DFC
+	.word	0x53380D13
+	.word	0x650A7354
+	.word	0x766A0ABB
+	.word	0x81C2C92E
+	.word	0x92722C85
+	.word	0xA2BFE8A1
+	.word	0xA81A664B
+	.word	0xC24B8B70
+	.word	0xC76C51A3
+	.word	0xD192E819
+	.word	0xD6990624
+	.word	0xF40E3585
+	.word	0x106AA070
+	.word	0x19A4C116
+	.word	0x1E376C08
+	.word	0x2748774C
+	.word	0x34B0BCB5
+	.word	0x391C0CB3
+	.word	0x4ED8AA4A
+	.word	0x5B9CCA4F
+	.word	0x682E6FF3
+	.word	0x748F82EE
+	.word	0x78A5636F
+	.word	0x84C87814
+	.word	0x8CC70208
+	.word	0x90BEFFFA
+	.word	0xA4506CEB
+	.word	0xBEF9A3F7
+	.word	0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S
new file mode 100644
index 000000000..b1686ada1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S
@@ -0,0 +1,380 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+	.arch armv8-a+crypto
+	.text
+	.align	2
+	.p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro	declare_var_vector_reg name:req,reg:req
+	\name\()_q	.req	q\reg
+	\name\()_v	.req	v\reg
+	\name\()_s	.req	s\reg
+.endm
+/**
+Macro for rounds 48-63 (also the core of sha256_4_rounds_low)
+tmp0 : in
+tmp1 : out
+  NOTE(review): the macro has no tmp1 parameter. lN_tmp0 is both the input
+  (word+constant sum for these rounds) and the output (sum for the next
+  four rounds); the plain tmp0..tmp3 names used in the body are the shared
+  scratch registers declared below, not macro parameters.
+  Lanes 0/1 and lanes 2/3 are processed in two halves with the same four
+  constants; tmp is advanced by 16 only in the second half.
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req
+	ldr		key_q , [tmp]
+	mov		tmp0_v.16b,l0_\tmp0\()_v.16b
+	mov		tmp1_v.16b,l1_\tmp0\()_v.16b
+	add		l0_\tmp0\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+	add		l1_\tmp0\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+	mov		tmp2_v.16b,l0_abcd_v.16b
+	mov		tmp3_v.16b,l1_abcd_v.16b
+	sha256h		l0_abcd_q,l0_efgh_q,tmp0_v.4s
+	sha256h		l1_abcd_q,l1_efgh_q,tmp1_v.4s
+	sha256h2	l0_efgh_q,tmp2_q,tmp0_v.4s
+	sha256h2	l1_efgh_q,tmp3_q,tmp1_v.4s
+
+	ldr		key_q , [tmp]
+	mov		tmp0_v.16b,l2_\tmp0\()_v.16b
+	mov		tmp1_v.16b,l3_\tmp0\()_v.16b
+	add		tmp,tmp,16
+	add		l2_\tmp0\()_v.4s,l2_\msg\()_v.4s,key_v.4s
+	add		l3_\tmp0\()_v.4s,l3_\msg\()_v.4s,key_v.4s
+	mov		tmp2_v.16b,l2_abcd_v.16b
+	mov		tmp3_v.16b,l3_abcd_v.16b
+	sha256h		l2_abcd_q,l2_efgh_q,tmp0_v.4s
+	sha256h		l3_abcd_q,l3_efgh_q,tmp1_v.4s
+	sha256h2	l2_efgh_q,tmp2_q,tmp0_v.4s
+	sha256h2	l3_efgh_q,tmp3_q,tmp1_v.4s
+
+
+.endm
+/**
+Macro for rounds 0-47
+  Same four rounds as sha256_4_rounds_high (invoked with msg1 and tmp0),
+  wrapped in the sha256su0/sha256su1 pair that updates lN_msg0 in place
+  from msg1, msg2 and msg3 for all four lanes.
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req
+	sha256su0	l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+	sha256su0	l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+	sha256su0	l2_\msg0\()_v.4s,l2_\msg1\()_v.4s
+	sha256su0	l3_\msg0\()_v.4s,l3_\msg1\()_v.4s
+	sha256_4_rounds_high	\msg1,\tmp0
+	sha256su1	l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+	sha256su1	l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+	sha256su1	l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s
+	sha256su1	l3_\msg0\()_v.4s,l3_\msg2\()_v.4s,l3_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+  Each declare_var_vector_reg line creates the _q, _v and _s aliases of one
+  vector register. The l0_ .. l3_ prefixes select the job.
+  key and tmp3 are both mapped to v15, which is why sha256_4_rounds_high
+  loads key a second time after tmp3 has been written.
+*/
+
+	declare_var_vector_reg	key,15
+
+
+/*
+digest variables
+  The *_saved aliases reuse v16/v17, v20/v21, v24/v25 and v28/v29, which are
+  also declared below as msg0/msg1 of the four lanes. They are therefore
+  only loaded once msg0 and msg1 are no longer read (after rounds 48-51).
+*/
+	declare_var_vector_reg	l0_abcd,0
+	declare_var_vector_reg	l0_efgh,1
+	declare_var_vector_reg	l1_abcd,2
+	declare_var_vector_reg	l1_efgh,3
+	declare_var_vector_reg	l2_abcd,4
+	declare_var_vector_reg	l2_efgh,5
+	declare_var_vector_reg	l3_abcd,6
+	declare_var_vector_reg	l3_efgh,7
+	declare_var_vector_reg	l1_abcd_saved,16
+	declare_var_vector_reg	l1_efgh_saved,17
+	declare_var_vector_reg	l0_abcd_saved,20
+	declare_var_vector_reg	l0_efgh_saved,21
+	declare_var_vector_reg	l2_abcd_saved,24
+	declare_var_vector_reg	l2_efgh_saved,25
+	declare_var_vector_reg	l3_abcd_saved,28
+	declare_var_vector_reg	l3_efgh_saved,29
+/*
+Temporary variables
+  All of v8-v15 are used here; the function saves and restores d8-d15.
+*/
+	declare_var_vector_reg	l0_tmp0,8
+	declare_var_vector_reg	l1_tmp0,9
+	declare_var_vector_reg	l2_tmp0,10
+	declare_var_vector_reg	l3_tmp0,11
+
+	declare_var_vector_reg	tmp0,12
+	declare_var_vector_reg	tmp1,13
+	declare_var_vector_reg	tmp2,14
+	declare_var_vector_reg	tmp3,15
+
+/*
+Message variables
+*/
+	declare_var_vector_reg	l0_msg0,16
+	declare_var_vector_reg	l0_msg1,17
+	declare_var_vector_reg	l0_msg2,18
+	declare_var_vector_reg	l0_msg3,19
+	declare_var_vector_reg	l1_msg0,20
+	declare_var_vector_reg	l1_msg1,21
+	declare_var_vector_reg	l1_msg2,22
+	declare_var_vector_reg	l1_msg3,23
+	declare_var_vector_reg	l2_msg0,24
+	declare_var_vector_reg	l2_msg1,25
+	declare_var_vector_reg	l2_msg2,26
+	declare_var_vector_reg	l2_msg3,27
+	declare_var_vector_reg	l3_msg0,28
+	declare_var_vector_reg	l3_msg1,29
+	declare_var_vector_reg	l3_msg2,30
+	declare_var_vector_reg	l3_msg3,31
+
+
+
+/*
+	void sha256_mb_ce_x4(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+
+	Runs the block loop for four jobs in lock step. For each job the
+	message pointer is read from offset 0 and the digest from offsets 64
+	and 80. The digests are written back to the jobs at the end of every
+	iteration, and the copy in the job is what gets re-read as the saved
+	digest during the next iteration.
+	The int argument is the loop count: every iteration consumes one
+	64-byte block per job. The count is tested at the bottom of the loop,
+	so the body executes at least once.
+*/
+/*
+Arguments list
+  l0_job .. l3_job   (x0-x3) job pointers
+  len                (w4)    loop counter
+  l0_data .. l3_data (x5-x8) running message pointers, loaded from the jobs
+  tmp                (x9)    cursor into the KEY table
+*/
+	l0_job	.req	x0
+	l1_job	.req	x1
+	l2_job	.req	x2
+	l3_job	.req	x3
+	len	.req	w4
+	l0_data	.req	x5
+	l1_data	.req	x6
+	l2_data	.req	x7
+	l3_data	.req	x8
+	tmp	.req	x9
+	.global	sha256_mb_ce_x4
+	.type	sha256_mb_ce_x4, %function
+sha256_mb_ce_x4:
+	//push d8~d15
+	stp	d8,d9,[sp,-192]!	// reserves 192 bytes; only the first 64 are used for d8-d15
+	stp	d10,d11,[sp,16]
+	stp	d12,d13,[sp,32]
+	stp	d14,d15,[sp,48]
+	ldr	l0_data, [l0_job]
+	ldr	l0_abcd_q, [l0_job, 64]
+	ldr	l0_efgh_q, [l0_job, 80]
+	ldr	l1_data, [l1_job]
+	ldr	l1_abcd_q, [l1_job, 64]
+	ldr	l1_efgh_q, [l1_job, 80]
+	ldr	l2_data, [l2_job]
+	ldr	l2_abcd_q, [l2_job, 64]
+	ldr	l2_efgh_q, [l2_job, 80]
+	ldr	l3_data, [l3_job]
+	ldr	l3_abcd_q, [l3_job, 64]
+	ldr	l3_efgh_q, [l3_job, 80]
+
+
+
+start_loop:
+
+	//load key addr
+	adr	tmp, KEY
+	//load msgs
+	ld1	{l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+	ld1	{l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+	ld1	{l2_msg0_v.4s-l2_msg3_v.4s},[l2_data]
+	ld1	{l3_msg0_v.4s-l3_msg3_v.4s},[l3_data]
+	ldr	key_q,[tmp]	// first 16 bytes (four words) of KEY; stays valid until tmp3 is first written inside the macro
+	add	tmp,tmp,16
+	//adjust loop parameter
+	add	l0_data,l0_data,64
+	add	l1_data,l1_data,64
+	add	l2_data,l2_data,64
+	add	l3_data,l3_data,64
+	sub	len, len, #1
+	cmp	len, 0	// flags are consumed by the bgt at the bottom; nothing in between is a flag-setting instruction
+
+
+	rev32	l0_msg0_v.16b,l0_msg0_v.16b	// rev32 reverses the bytes inside each 32-bit word
+	rev32	l0_msg1_v.16b,l0_msg1_v.16b
+	add	l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s	// sum for rounds 0-3: first four words + first four constants
+	rev32	l0_msg2_v.16b,l0_msg2_v.16b
+	rev32	l0_msg3_v.16b,l0_msg3_v.16b
+
+	rev32	l1_msg0_v.16b,l1_msg0_v.16b
+	rev32	l1_msg1_v.16b,l1_msg1_v.16b
+	add	l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+	rev32	l1_msg2_v.16b,l1_msg2_v.16b
+	rev32	l1_msg3_v.16b,l1_msg3_v.16b
+
+	rev32	l2_msg0_v.16b,l2_msg0_v.16b
+	rev32	l2_msg1_v.16b,l2_msg1_v.16b
+	add	l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s
+	rev32	l2_msg2_v.16b,l2_msg2_v.16b
+	rev32	l2_msg3_v.16b,l2_msg3_v.16b
+
+	rev32	l3_msg0_v.16b,l3_msg0_v.16b
+	rev32	l3_msg1_v.16b,l3_msg1_v.16b
+	add	l3_tmp0_v.4s, l3_msg0_v.4s,key_v.4s
+	rev32	l3_msg2_v.16b,l3_msg2_v.16b
+	rev32	l3_msg3_v.16b,l3_msg3_v.16b
+
+
+
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0 /* rounds 0-3 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp0
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0 /* rounds 16-19 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp0
+	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0 /* rounds 32-35 */
+	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp0
+	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0
+	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp0
+
+
+
+	sha256_4_rounds_high	msg1,tmp0 /* rounds 48-51 */
+
+	/* msg0 msg1 is free , share with digest regs */
+	ldr	l0_abcd_saved_q, [l0_job, 64]	// the job still holds the digest this block started from
+	ldr	l1_abcd_saved_q, [l1_job, 64]
+	ldr	l2_abcd_saved_q, [l2_job, 64]
+	ldr	l3_abcd_saved_q, [l3_job, 64]
+	ldr	l0_efgh_saved_q, [l0_job, 80]
+	ldr	l1_efgh_saved_q, [l1_job, 80]
+	ldr	l2_efgh_saved_q, [l2_job, 80]
+	ldr	l3_efgh_saved_q, [l3_job, 80]
+
+	sha256_4_rounds_high	msg2,tmp0
+	sha256_4_rounds_high	msg3,tmp0
+
+	/* rounds 60-63 */
+	mov	tmp2_v.16b,l0_abcd_v.16b	// last group: the sum is already in lN_tmp0, no further constants are loaded
+	sha256h	l0_abcd_q,l0_efgh_q,l0_tmp0_v.4s
+	sha256h2	l0_efgh_q,tmp2_q,l0_tmp0_v.4s
+
+	mov	tmp2_v.16b,l1_abcd_v.16b
+	sha256h	l1_abcd_q,l1_efgh_q,l1_tmp0_v.4s
+	sha256h2	l1_efgh_q,tmp2_q,l1_tmp0_v.4s
+
+	mov	tmp2_v.16b,l2_abcd_v.16b
+	sha256h	l2_abcd_q,l2_efgh_q,l2_tmp0_v.4s
+	sha256h2	l2_efgh_q,tmp2_q,l2_tmp0_v.4s
+
+	mov	tmp2_v.16b,l3_abcd_v.16b
+	sha256h	l3_abcd_q,l3_efgh_q,l3_tmp0_v.4s
+	sha256h2	l3_efgh_q,tmp2_q,l3_tmp0_v.4s
+
+	/* combine state */
+	add	l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+	add	l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+	add	l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+	add	l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+	add	l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s
+	add	l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s
+	add	l3_abcd_v.4s,l3_abcd_v.4s,l3_abcd_saved_v.4s
+	add	l3_efgh_v.4s,l3_efgh_v.4s,l3_efgh_saved_v.4s
+
+	str	l0_abcd_q, [l0_job, 64]	// stored every iteration so the next reload of *_saved sees the current digest
+	str	l0_efgh_q, [l0_job, 80]
+	str	l1_abcd_q, [l1_job, 64]
+	str	l1_efgh_q, [l1_job, 80]
+	str	l2_abcd_q, [l2_job, 64]
+	str	l2_efgh_q, [l2_job, 80]
+	str	l3_abcd_q, [l3_job, 64]
+	str	l3_efgh_q, [l3_job, 80]
+
+	bgt	start_loop	// signed len > 0 from the cmp near the top of the loop
+
+
+	ldp	d10,d11,[sp,16]
+	ldp	d12,d13,[sp,32]
+	ldp	d14,d15,[sp,48]
+	ldp	d8, d9, [sp], 192	// releases the 192-byte frame
+	ret
+
+	.size	sha256_mb_ce_x4, .-sha256_mb_ce_x4
+	.section	.rol0_data.cst16,"aM",@progbits,16	// NOTE(review): unusual section name; looks like .rodata.cst16 hit by a data to l0_data rename - confirm before touching
+	.align	4
+KEY:	// 64 words, consumed 16 bytes at a time through tmp; values match the SHA-256 round constants
+	.word	0x428A2F98
+	.word	0x71374491
+	.word	0xB5C0FBCF
+	.word	0xE9B5DBA5
+	.word	0x3956C25B
+	.word	0x59F111F1
+	.word	0x923F82A4
+	.word	0xAB1C5ED5
+	.word	0xD807AA98
+	.word	0x12835B01
+	.word	0x243185BE
+	.word	0x550C7DC3
+	.word	0x72BE5D74
+	.word	0x80DEB1FE
+	.word	0x9BDC06A7
+	.word	0xC19BF174
+	.word	0xE49B69C1
+	.word	0xEFBE4786
+	.word	0x0FC19DC6
+	.word	0x240CA1CC
+	.word	0x2DE92C6F
+	.word	0x4A7484AA
+	.word	0x5CB0A9DC
+	.word	0x76F988DA
+	.word	0x983E5152
+	.word	0xA831C66D
+	.word	0xB00327C8
+	.word	0xBF597FC7
+	.word	0xC6E00BF3
+	.word	0xD5A79147
+	.word	0x06CA6351
+	.word	0x14292967
+	.word	0x27B70A85
+	.word	0x2E1B2138
+	.word	0x4D2C6DFC
+	.word	0x53380D13
+	.word	0x650A7354
+	.word	0x766A0ABB
+	.word	0x81C2C92E
+	.word	0x92722C85
+	.word	0xA2BFE8A1
+	.word	0xA81A664B
+	.word	0xC24B8B70
+	.word	0xC76C51A3
+	.word	0xD192E819
+	.word	0xD6990624
+	.word	0xF40E3585
+	.word	0x106AA070
+	.word	0x19A4C116
+	.word	0x1E376C08
+	.word	0x2748774C
+	.word	0x34B0BCB5
+	.word	0x391C0CB3
+	.word	0x4ED8AA4A
+	.word	0x5B9CCA4F
+	.word	0x682E6FF3
+	.word	0x748F82EE
+	.word	0x78A5636F
+	.word	0x84C87814
+	.word	0x8CC70208
+	.word	0x90BEFFFA
+	.word	0xA4506CEB
+	.word	0xBEF9A3F7
+	.word	0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c
new file mode 100644
index 000000000..12441a8e3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c
@@ -0,0 +1,268 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX +#elif (__GNUC__ >= 5) +# pragma GCC target("avx") +#endif + +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include <intrin.h> +# define inline __inline +#endif + +static inline void hash_init_digest(SHA256_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len); +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx); + +void sha256_ctx_mgr_init_avx(SHA256_HASH_CTX_MGR * mgr) +{ + sha256_mb_mgr_init_avx(&mgr->mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. 
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx(SHA256_HASH_CTX_MGR * mgr) +{ + SHA256_HASH_CTX *ctx; + + while (1) { + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. 
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA256_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + *((uint64_t 
*) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_avx_slver_02020154; +struct slver sha256_ctx_mgr_init_avx_slver = { 0x0154, 0x02, 0x02 }; + +struct slver sha256_ctx_mgr_submit_avx_slver_02020155; +struct slver sha256_ctx_mgr_submit_avx_slver = { 0x0155, 0x02, 0x02 }; + +struct slver sha256_ctx_mgr_flush_avx_slver_02020156; +struct slver sha256_ctx_mgr_flush_avx_slver = { 0x0156, 0x02, 0x02 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c new file mode 100644 index 000000000..9c045659e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c @@ -0,0 +1,268 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include <intrin.h> +# define inline __inline +#endif + +static inline void hash_init_digest(SHA256_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len); +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx); + +void sha256_ctx_mgr_init_avx2(SHA256_HASH_CTX_MGR * mgr) +{ + sha256_mb_mgr_init_avx2(&mgr->mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx2(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = 
HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. 
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx2(SHA256_HASH_CTX_MGR * mgr) +{ + SHA256_HASH_CTX *ctx; + + while (1) { + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx2(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. 
+ } +} + +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA256_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_avx2_slver_04020157; +struct slver sha256_ctx_mgr_init_avx2_slver = { 0x0157, 0x02, 0x04 }; + +struct slver sha256_ctx_mgr_submit_avx2_slver_04020158; +struct slver sha256_ctx_mgr_submit_avx2_slver = { 0x0158, 0x02, 0x04 }; + +struct slver sha256_ctx_mgr_flush_avx2_slver_04020159; +struct slver sha256_ctx_mgr_flush_avx2_slver = { 0x0159, 0x02, 0x04 }; + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git 
a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c new file mode 100644 index 000000000..a1f068987 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c @@ -0,0 +1,273 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include <intrin.h> +# define inline __inline +#endif + +#ifdef HAVE_AS_KNOWS_AVX512 + +static inline void hash_init_digest(SHA256_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len); +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx); + +void sha256_ctx_mgr_init_avx512(SHA256_HASH_CTX_MGR * mgr) +{ + sha256_mb_mgr_init_avx512(&mgr->mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. 
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512(SHA256_HASH_CTX_MGR * mgr) +{ + SHA256_HASH_CTX *ctx; + + while (1) { + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. 
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA256_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = + (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + 
*((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_avx512_slver_0600015a; +struct slver sha256_ctx_mgr_init_avx512_slver = { 0x015a, 0x00, 0x06 }; + +struct slver sha256_ctx_mgr_submit_avx512_slver_0600015b; +struct slver sha256_ctx_mgr_submit_avx512_slver = { 0x015b, 0x00, 0x06 }; + +struct slver sha256_ctx_mgr_flush_avx512_slver_0600015c; +struct slver sha256_ctx_mgr_flush_avx512_slver = { 0x015c, 0x00, 0x06 }; + +#endif // HAVE_AS_KNOWS_AVX512 + +#if defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c new file mode 100644 index 000000000..763057f12 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c @@ -0,0 +1,283 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function) +#elif defined(__ICC) +# pragma intel optimization_parameter target_arch=AVX2 +#elif defined(__ICL) +# pragma [intel] optimization_parameter target_arch=AVX2 +#elif (__GNUC__ >= 5) +# pragma GCC target("avx2") +#endif + +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include <intrin.h> +# define inline __inline +#endif + +/** + * sha256_ctx_avx512_ni related functions are aiming to utilize Cannon Lake. + * Since SHANI is still slower than multibuffer for full lanes, + * sha256_ctx_mgr_init_avx512_ni and sha256_ctx_mgr_submit_avx512_ni are + * similar to their avx512 versions. + * sha256_ctx_mgr_flush_avx512_ni is different. It will call + * sha256_mb_mgr_flush_avx512_ni which would use shani when lanes are less + * than a threshold. 
+ * + */
#if defined(HAVE_AS_KNOWS_AVX512) && defined(HAVE_AS_KNOWS_SHANI)

static inline void hash_init_digest(SHA256_WORD_T * digest);
static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
						SHA256_HASH_CTX * ctx);

/**
 * Initialize a context manager by initializing the multi-buffer job
 * manager embedded in it (mgr->mgr) with the plain avx512 initializer.
 */
void sha256_ctx_mgr_init_avx512_ni(SHA256_HASH_CTX_MGR * mgr)
{
	sha256_mb_mgr_init_avx512(&mgr->mgr);
}

/**
 * Submit a buffer to be hashed as part of the job described by ctx.
 * Jobs are handed to sha256_mb_mgr_submit_avx512(); only the flush path of
 * this file uses an _ni job-manager routine.
 *
 * @param mgr    Context manager whose embedded job manager does the hashing.
 * @param ctx    Hash context to update; it is dereferenced unconditionally,
 *               so it must not be NULL.
 * @param buffer Input data for this call.
 * @param len    Number of bytes available at buffer.
 * @param flags  Only bits contained in HASH_ENTIRE are accepted. HASH_FIRST
 *               resets the context, HASH_LAST requests padding/finalization.
 *
 * @return ctx itself, with ctx->error set, when the request is rejected.
 *         Otherwise the result of sha256_ctx_mgr_resubmit(): a context that
 *         needs nothing more from the job manager right now, or NULL when
 *         the job manager handed no context back.
 */
SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512_ni(SHA256_HASH_CTX_MGR * mgr,
						 SHA256_HASH_CTX * ctx, const void *buffer,
						 uint32_t len, HASH_CTX_FLAG flags)
{
	if (flags & (~HASH_ENTIRE)) {
		// User should not pass anything other than FIRST, UPDATE, or LAST
		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
		return ctx;
	}

	if (ctx->status & HASH_CTX_STS_PROCESSING) {
		// Cannot submit to a currently processing job.
		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
		return ctx;
	}

	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
		// Cannot update a finished job.
		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
		return ctx;
	}

	if (flags & HASH_FIRST) {
		// Start a new hash: initial digest, zero byte count, empty partial block.
		hash_init_digest(ctx->job.result_digest);
		ctx->total_length = 0;
		ctx->partial_block_buffer_length = 0;
	}
	// Every validation passed, so this call reports no error.
	ctx->error = HASH_CTX_ERROR_NONE;

	// Remember the caller's buffer; sha256_ctx_mgr_resubmit() consumes it.
	ctx->incoming_buffer = buffer;
	ctx->incoming_buffer_length = len;

	// Mark the context as in flight and record whether this is the final update.
	ctx->status = (flags & HASH_LAST) ?
	    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
	    HASH_CTX_STS_PROCESSING;

	// Advance byte counter
	ctx->total_length += len;

	// Top up the partial block when it already holds data, or when the caller
	// supplied less than one whole block.
	if (ctx->partial_block_buffer_length || len < SHA256_BLOCK_SIZE) {
		// Bytes still missing from the partial block, capped at what was supplied.
		uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
		if (len < copy_len)
			copy_len = len;

		if (copy_len) {
			// Copy and update relevant pointers and counters
			memcpy_varlen(&ctx->partial_block_buffer
				      [ctx->partial_block_buffer_length], buffer, copy_len);

			ctx->partial_block_buffer_length += copy_len;
			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
			ctx->incoming_buffer_length = len - copy_len;
		}
		// The partial block can never hold more than one block here.
		assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);

		// A full partial block is hashed as a one-block job.
		// NOTE(review): the cast assumes the job sits at the start of
		// SHA256_HASH_CTX - confirm against sha256_mb.h.
		if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
			ctx->partial_block_buffer_length = 0;

			ctx->job.buffer = ctx->partial_block_buffer;
			ctx->job.len = 1;
			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
									      &ctx->job);
		}
	}

	return sha256_ctx_mgr_resubmit(mgr, ctx);
}

/**
 * Drive the job manager, through sha256_mb_mgr_flush_avx512_ni(), until some
 * context can be handed back to the caller.
 *
 * @return a context returned by sha256_ctx_mgr_resubmit(), or NULL once
 *         sha256_mb_mgr_flush_avx512_ni() returns no job.
 */
SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512_ni(SHA256_HASH_CTX_MGR * mgr)
{
	SHA256_HASH_CTX *ctx;

	while (1) {
		ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512_ni(&mgr->mgr);

		// If flush returned 0, there are no more jobs in flight.
		if (!ctx)
			return NULL;

		// The flushed context may still have input or padding left to hash;
		// resubmit it and return it only when nothing remains to be done.
		ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
		if (ctx)
			return ctx;

		// Otherwise the context went back into the job manager. Loop.
	}
}

/**
 * Advance ctx (and whichever context the job manager returns in its place)
 * until one of them needs nothing more from the job manager.
 *
 * @return a context whose status was set to HASH_CTX_STS_COMPLETE or
 *         HASH_CTX_STS_IDLE, or NULL when the job manager returned no context.
 */
static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
						SHA256_HASH_CTX * ctx)
{
	while (ctx) {
		if (ctx->status & HASH_CTX_STS_COMPLETE) {
			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
			return ctx;
		}
		// If the partial block is empty, hash straight from the user's buffer.
		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
			const void *buffer = ctx->incoming_buffer;
			uint32_t len = ctx->incoming_buffer_length;

			// Only entire blocks can be hashed. Stash the remainder.
			uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);

			if (copy_len) {
				len -= copy_len;
				memcpy_varlen(ctx->partial_block_buffer,
					      ((const char *)buffer + len), copy_len);
				ctx->partial_block_buffer_length = copy_len;
			}

			ctx->incoming_buffer_length = 0;

			// len should be a multiple of the block size now
			assert((len % SHA256_BLOCK_SIZE) == 0);

			// Convert the byte count into a block count
			len >>= SHA256_LOG2_BLOCK_SIZE;

			if (len) {
				ctx->job.buffer = (uint8_t *) buffer;
				ctx->job.len = len;
				ctx = (SHA256_HASH_CTX *)
				    sha256_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
				continue;
			}
		}
		// No whole block is pending: either pad and finish, or wait for more input.
		if (ctx->status & HASH_CTX_STS_LAST) {
			uint8_t *buf = ctx->partial_block_buffer;
			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);

			ctx->status =
			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
			ctx->job.buffer = buf;
			ctx->job.len = (uint32_t) n_extra_blocks;
			ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
									      &ctx->job);
			continue;
		}

		// ctx is non-NULL inside this loop, so no extra check is needed.
		ctx->status = HASH_CTX_STS_IDLE;
		return ctx;
	}

	return NULL;
}

/** Copy the SHA256_INITIAL_DIGEST words into digest. */
static inline void hash_init_digest(SHA256_WORD_T * digest)
{
	static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
	    { SHA256_INITIAL_DIGEST };
	memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
}

/**
 * Append the SHA-256 padding to the partial block held in padblock.
 *
 * @param padblock  Buffer of two blocks whose first (total_len mod block size)
 *                  bytes are the not-yet-hashed message tail.
 * @param total_len Total message length in bytes.
 *
 * @return number of blocks (1 or 2) that must be hashed from padblock.
 */
static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
{
	uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
	uint64_t bit_len = (uint64_t) total_len << 3;
	uint32_t k;

	memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
	padblock[i] = 0x80;

	// Move i to the end of either 1st or 2nd extra block depending on length
	i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
	    1 + SHA256_PADLENGTHFIELD_SIZE;

#if SHA256_PADLENGTHFIELD_SIZE == 16
	// Upper 64 bits of a 128-bit length field are always zero.
	for (k = 9; k <= 16; k++)
		padblock[i - k] = 0;
#endif

	// Store the bit length big-endian, one byte at a time, so the byte
	// buffer is never accessed through a uint64_t pointer.
	for (k = 1; k <= 8; k++) {
		padblock[i - k] = (uint8_t) bit_len;
		bit_len >>= 8;
	}

	return i >> SHA256_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
}

// Version records exported next to each public entry point.
struct slver {
	uint16_t snum;
	uint8_t ver;
	uint8_t core;
};
struct slver sha256_ctx_mgr_init_avx512_ni_slver_080002ca;
struct slver sha256_ctx_mgr_init_avx512_ni_slver = { 0x02ca, 0x00, 0x08 };

struct slver sha256_ctx_mgr_submit_avx512_ni_slver_080002cb;
struct slver sha256_ctx_mgr_submit_avx512_ni_slver = { 0x02cb, 0x00, 0x08 };

struct slver sha256_ctx_mgr_flush_avx512_ni_slver_080002cc;
struct slver sha256_ctx_mgr_flush_avx512_ni_slver = { 0x02cc, 0x00, 0x08 };

#endif // HAVE_AS_KNOWS_AVX512 and HAVE_AS_KNOWS_SHANI

+#if defined(__clang__) +# pragma 
clang attribute pop +#endif diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c new file mode 100644 index 000000000..58bf024a0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c @@ -0,0 +1,301 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <string.h> +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +#include <intrin.h> +#define inline __inline +#endif + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r)))) + +#define W(x) w[(x) & 15] + +#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3)) +#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10)) + +#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22)) +#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25)) +#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c)) +#define ch(e,f,g) ((e & f) ^ (g & ~e)) + +#define step(i,a,b,c,d,e,f,g,h,k) \ + if (i<16) W(i) = to_be32(ww[i]); \ + else \ + W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \ + t2 = s0(a) + maj(a,b,c); \ + t1 = h + s1(e) + ch(e,f,g) + k + W(i); \ + d += t1; \ + h = t1 + t2; + +static void sha256_init(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len); +static uint32_t sha256_update(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len); +static void sha256_final(SHA256_HASH_CTX * ctx, uint32_t remain_len); +static void OPT_FIX sha256_single(const void *data, uint32_t digest[]); +static inline void hash_init_digest(SHA256_WORD_T * digest); + +void sha256_ctx_mgr_init_base(SHA256_HASH_CTX_MGR * mgr) +{ +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_base(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + uint32_t remain_len; + + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) { + // Cannot submit a new entire job to a currently processing job. 
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags == HASH_FIRST) { + + sha256_init(ctx, buffer, len); + sha256_update(ctx, buffer, len); + } + + if (flags == HASH_UPDATE) { + sha256_update(ctx, buffer, len); + } + + if (flags == HASH_LAST) { + remain_len = sha256_update(ctx, buffer, len); + sha256_final(ctx, remain_len); + } + + if (flags == HASH_ENTIRE) { + sha256_init(ctx, buffer, len); + remain_len = sha256_update(ctx, buffer, len); + sha256_final(ctx, remain_len); + } + + return ctx; +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_base(SHA256_HASH_CTX_MGR * mgr) +{ + return NULL; +} + +static void sha256_init(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Mark it as processing + ctx->status = HASH_CTX_STS_PROCESSING; +} + +static uint32_t sha256_update(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len) +{ + uint32_t remain_len = len; + uint32_t *digest = ctx->job.result_digest; + + while (remain_len >= SHA256_BLOCK_SIZE) { + sha256_single(buffer, digest); + buffer = (void *)((uint8_t *) buffer + SHA256_BLOCK_SIZE); + remain_len -= SHA256_BLOCK_SIZE; + ctx->total_length += SHA256_BLOCK_SIZE; + } + ctx->status = HASH_CTX_STS_IDLE; + ctx->incoming_buffer = buffer; + return remain_len; +} + +static void sha256_final(SHA256_HASH_CTX * ctx, uint32_t remain_len) +{ + const void *buffer = ctx->incoming_buffer; + uint32_t i = remain_len, j; + uint8_t buf[2 * SHA256_BLOCK_SIZE]; + uint32_t *digest = ctx->job.result_digest; + + ctx->total_length += i; + 
memcpy(buf, buffer, i); + buf[i++] = 0x80; + for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - SHA256_PADLENGTHFIELD_SIZE); j++) + buf[j] = 0; + + if (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE) + i = 2 * SHA256_BLOCK_SIZE; + else + i = SHA256_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8); + + sha256_single(buf, digest); + if (i == 2 * SHA256_BLOCK_SIZE) { + sha256_single(buf + SHA256_BLOCK_SIZE, digest); + } + + ctx->status = HASH_CTX_STS_COMPLETE; +} + +void sha256_single(const void *data, uint32_t digest[]) +{ + uint32_t a, b, c, d, e, f, g, h, t1, t2; + uint32_t w[16]; + uint32_t *ww = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + f = digest[5]; + g = digest[6]; + h = digest[7]; + + step(0, a, b, c, d, e, f, g, h, 0x428a2f98); + step(1, h, a, b, c, d, e, f, g, 0x71374491); + step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf); + step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5); + step(4, e, f, g, h, a, b, c, d, 0x3956c25b); + step(5, d, e, f, g, h, a, b, c, 0x59f111f1); + step(6, c, d, e, f, g, h, a, b, 0x923f82a4); + step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5); + step(8, a, b, c, d, e, f, g, h, 0xd807aa98); + step(9, h, a, b, c, d, e, f, g, 0x12835b01); + step(10, g, h, a, b, c, d, e, f, 0x243185be); + step(11, f, g, h, a, b, c, d, e, 0x550c7dc3); + step(12, e, f, g, h, a, b, c, d, 0x72be5d74); + step(13, d, e, f, g, h, a, b, c, 0x80deb1fe); + step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7); + step(15, b, c, d, e, f, g, h, a, 0xc19bf174); + step(16, a, b, c, d, e, f, g, h, 0xe49b69c1); + step(17, h, a, b, c, d, e, f, g, 0xefbe4786); + step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6); + step(19, f, g, h, a, b, c, d, e, 0x240ca1cc); + step(20, e, f, g, h, a, b, c, d, 0x2de92c6f); + step(21, d, e, f, g, h, a, b, c, 0x4a7484aa); + step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc); + step(23, b, c, d, e, f, g, h, a, 0x76f988da); + step(24, a, b, c, d, e, f, g, h, 0x983e5152); + step(25, 
h, a, b, c, d, e, f, g, 0xa831c66d); + step(26, g, h, a, b, c, d, e, f, 0xb00327c8); + step(27, f, g, h, a, b, c, d, e, 0xbf597fc7); + step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3); + step(29, d, e, f, g, h, a, b, c, 0xd5a79147); + step(30, c, d, e, f, g, h, a, b, 0x06ca6351); + step(31, b, c, d, e, f, g, h, a, 0x14292967); + step(32, a, b, c, d, e, f, g, h, 0x27b70a85); + step(33, h, a, b, c, d, e, f, g, 0x2e1b2138); + step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc); + step(35, f, g, h, a, b, c, d, e, 0x53380d13); + step(36, e, f, g, h, a, b, c, d, 0x650a7354); + step(37, d, e, f, g, h, a, b, c, 0x766a0abb); + step(38, c, d, e, f, g, h, a, b, 0x81c2c92e); + step(39, b, c, d, e, f, g, h, a, 0x92722c85); + step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1); + step(41, h, a, b, c, d, e, f, g, 0xa81a664b); + step(42, g, h, a, b, c, d, e, f, 0xc24b8b70); + step(43, f, g, h, a, b, c, d, e, 0xc76c51a3); + step(44, e, f, g, h, a, b, c, d, 0xd192e819); + step(45, d, e, f, g, h, a, b, c, 0xd6990624); + step(46, c, d, e, f, g, h, a, b, 0xf40e3585); + step(47, b, c, d, e, f, g, h, a, 0x106aa070); + step(48, a, b, c, d, e, f, g, h, 0x19a4c116); + step(49, h, a, b, c, d, e, f, g, 0x1e376c08); + step(50, g, h, a, b, c, d, e, f, 0x2748774c); + step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5); + step(52, e, f, g, h, a, b, c, d, 0x391c0cb3); + step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a); + step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f); + step(55, b, c, d, e, f, g, h, a, 0x682e6ff3); + step(56, a, b, c, d, e, f, g, h, 0x748f82ee); + step(57, h, a, b, c, d, e, f, g, 0x78a5636f); + step(58, g, h, a, b, c, d, e, f, 0x84c87814); + step(59, f, g, h, a, b, c, d, e, 0x8cc70208); + step(60, e, f, g, h, a, b, c, d, 0x90befffa); + step(61, d, e, f, g, h, a, b, c, 0xa4506ceb); + step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7); + step(63, b, c, d, e, f, g, h, a, 0xc67178f2); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + 
digest[7] += h; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_base_slver_000002f0; +struct slver sha256_ctx_mgr_init_base_slver = { 0x02f0, 0x00, 0x00 }; + +struct slver sha256_ctx_mgr_submit_base_slver_000002f1; +struct slver sha256_ctx_mgr_submit_base_slver = { 0x02f1, 0x00, 0x00 }; + +struct slver sha256_ctx_mgr_flush_base_slver_000002f2; +struct slver sha256_ctx_mgr_flush_base_slver = { 0x02f2, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c new file mode 100644 index 000000000..1483f631c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c @@ -0,0 +1,54 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <stdint.h> +#include <string.h> +#include "sha256_mb.h" +#include "memcpy_inline.h" + +extern void sha256_ctx_mgr_init_base(SHA256_HASH_CTX_MGR * mgr); +extern SHA256_HASH_CTX *sha256_ctx_mgr_submit_base(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx, const void *buffer, + uint32_t len, HASH_CTX_FLAG flags); +extern SHA256_HASH_CTX *sha256_ctx_mgr_flush_base(SHA256_HASH_CTX_MGR * mgr); + +void sha256_ctx_mgr_init(SHA256_HASH_CTX_MGR * mgr) +{ + return sha256_ctx_mgr_init_base(mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + return sha256_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush(SHA256_HASH_CTX_MGR * mgr) +{ + return sha256_ctx_mgr_flush_base(mgr); +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c new file mode 100644 index 000000000..f85f5c88b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c @@ -0,0 +1,256 @@ 
+/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include <intrin.h> +# define inline __inline +#endif + +static inline void hash_init_digest(SHA256_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len); +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx); + +void sha256_ctx_mgr_init_sse(SHA256_HASH_CTX_MGR * mgr) +{ + sha256_mb_mgr_init_sse(&mgr->mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? 
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr, + &ctx->job); + } + } + + return sha256_ctx_mgr_resubmit(mgr, ctx); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse(SHA256_HASH_CTX_MGR * mgr) +{ + SHA256_HASH_CTX *ctx; + + while (1) { + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. 
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx); + + // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA256_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA256_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + + ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA256_WORD_T * digest) +{ + static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] = + { SHA256_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) +{ + uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1)); + + memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA256_PADLENGTHFIELD_SIZE; + +#if SHA256_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3); + + return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha256_ctx_mgr_init_sse_slver_00020151; +struct slver sha256_ctx_mgr_init_sse_slver = { 0x0151, 0x02, 0x00 }; + +struct slver sha256_ctx_mgr_submit_sse_slver_00020152; +struct slver sha256_ctx_mgr_submit_sse_slver = { 0x0152, 0x02, 0x00 }; + +struct slver sha256_ctx_mgr_flush_sse_slver_00020153; +struct slver sha256_ctx_mgr_flush_sse_slver = { 0x0153, 0x02, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c 
b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c new file mode 100644 index 000000000..e2c7e2738 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c @@ -0,0 +1,262 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "sha256_mb.h" +#include "memcpy_inline.h" +#include "endian_helper.h" + +#ifdef _MSC_VER +# include <intrin.h> +# define inline __inline +#endif + +#ifdef HAVE_AS_KNOWS_SHANI + +static inline void hash_init_digest(SHA256_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len); +static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr, + SHA256_HASH_CTX * ctx); + +void sha256_ctx_mgr_init_sse_ni(SHA256_HASH_CTX_MGR * mgr) +{ + // Same with sse + sha256_mb_mgr_init_sse(&mgr->mgr); +} + +SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse_ni(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? 
+            (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+            HASH_CTX_STS_PROCESSING;
+
+    // Advance byte counter
+    ctx->total_length += len;
+
+    // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+    // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+    if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+        // Compute how many bytes to copy from user buffer into extra block
+        uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+        if (len < copy_len)
+            copy_len = len;
+
+        if (copy_len) {
+            // Copy and update relevant pointers and counters
+            memcpy_varlen(&ctx->partial_block_buffer
+                          [ctx->partial_block_buffer_length], buffer, copy_len);
+
+            ctx->partial_block_buffer_length += copy_len;
+            ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+            ctx->incoming_buffer_length = len - copy_len;
+        }
+        // The extra block should never contain more than 1 block here
+        assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+        // If the extra block buffer contains exactly 1 block, it can be hashed.
+        if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+            ctx->partial_block_buffer_length = 0;
+
+            ctx->job.buffer = ctx->partial_block_buffer;
+            ctx->job.len = 1;
+            ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+                                                                  &ctx->job);
+        }
+    }
+
+    return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+// Drain the manager: keep flushing the low-level job manager until either a
+// context is ready to be handed back to the user or no jobs remain in flight.
+// Returns the finished (or idle) context, or NULL when the manager is empty.
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+    SHA256_HASH_CTX *ctx;
+
+    while (1) {
+        ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse_ni(&mgr->mgr);
+
+        // If flush returned 0, there are no more jobs in flight.
+        if (!ctx)
+            return NULL;
+
+        // If flush returned a job, verify that it is safe to return to the user.
+        // If it is not ready, resubmit the job to finish processing.
+        ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+        // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+        if (ctx)
+            return ctx;
+
+        // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+    }
+}
+
+// Advance a context returned by the low-level job manager as far as possible.
+//
+// For each context handed back by a submit call this either
+//   - returns it when it is marked complete,
+//   - submits the whole blocks still waiting in the user's buffer (stashing the
+//     sub-block remainder in the partial block buffer),
+//   - pads and submits the final block(s) when HASH_CTX_STS_LAST is set, or
+//   - marks it idle and returns it so the caller can supply more data.
+// Each submit may hand back a different context (or NULL), which is then
+// processed in turn. Returns NULL when no context can be returned yet.
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+                                                SHA256_HASH_CTX * ctx)
+{
+    while (ctx) {
+        if (ctx->status & HASH_CTX_STS_COMPLETE) {
+            ctx->status = HASH_CTX_STS_COMPLETE;    // Clear PROCESSING bit
+            return ctx;
+        }
+        // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+        if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+            const void *buffer = ctx->incoming_buffer;
+            uint32_t len = ctx->incoming_buffer_length;
+
+            // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+            uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+            if (copy_len) {
+                len -= copy_len;
+                memcpy_varlen(ctx->partial_block_buffer,
+                              ((const char *)buffer + len), copy_len);
+                ctx->partial_block_buffer_length = copy_len;
+            }
+
+            ctx->incoming_buffer_length = 0;
+
+            // len should be a multiple of the block size now
+            assert((len % SHA256_BLOCK_SIZE) == 0);
+
+            // Set len to the number of blocks to be hashed in the user's buffer
+            len >>= SHA256_LOG2_BLOCK_SIZE;
+
+            if (len) {
+                ctx->job.buffer = (uint8_t *) buffer;
+                ctx->job.len = len;
+                ctx =
+                    (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+                                                                    &ctx->job);
+                continue;
+            }
+        }
+        // If the extra blocks are not empty, then we are either on the last block(s)
+        // or we need more user input before continuing.
+        if (ctx->status & HASH_CTX_STS_LAST) {
+            uint8_t *buf = ctx->partial_block_buffer;
+            uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+            ctx->status =
+                (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+            ctx->job.buffer = buf;
+            ctx->job.len = (uint32_t) n_extra_blocks;
+
+            ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+                                                                  &ctx->job);
+            continue;
+        }
+
+        // Nothing more can be hashed without further input. ctx is known to be
+        // non-NULL here (loop condition), so no extra check is needed.
+        ctx->status = HASH_CTX_STS_IDLE;
+        return ctx;
+    }
+
+    return NULL;
+}
+
+// Load the SHA-256 initial hash value into digest[0..SHA256_DIGEST_NWORDS-1].
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+    static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+        { SHA256_INITIAL_DIGEST };
+    memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+// Append the final padding to the partial block held in padblock.
+//
+// padblock   two-block scratch buffer whose first (total_len % SHA256_BLOCK_SIZE)
+//            bytes hold the not-yet-hashed tail of the message
+// total_len  total message length in bytes
+//
+// Writes the 0x80 marker, zero fill and the big-endian bit length, and returns
+// the number of blocks (1 or 2) that now have to be hashed from padblock.
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+    uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+    // Message length in bits, already in the byte order the padding requires
+    uint64_t be_bit_len = to_be64((uint64_t) total_len << 3);
+
+    memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+    padblock[i] = 0x80;
+
+    // Move i to the end of either 1st or 2nd extra block depending on length
+    i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1)))
+        + 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+    // The length field is written with byte copies rather than through a
+    // uint64_t pointer: padblock is a byte array that may not be 8-byte
+    // aligned, and storing through a cast pointer would also break the
+    // effective-type (strict aliasing) rules.
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+    memclr_fixedlen(&padblock[i - 16], sizeof(uint64_t));
+#endif
+
+    memcpy_fixedlen(&padblock[i - 8], &be_bit_len, sizeof(be_bit_len));
+
+    return i >> SHA256_LOG2_BLOCK_SIZE;    // Number of extra blocks to hash
+}
+
+// Version records, one pair of symbols per exported entry point.
+struct slver {
+    uint16_t snum;
+    uint8_t ver;
+    uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_sse_ni_slver_070002c7;
+struct slver sha256_ctx_mgr_init_sse_ni_slver = { 0x02c7, 0x00, 0x07 };
+
+struct slver sha256_ctx_mgr_submit_sse_ni_slver_070002c8;
+struct slver sha256_ctx_mgr_submit_sse_ni_slver = { 0x02c8, 0x00, 0x07 };
+
+struct slver sha256_ctx_mgr_flush_sse_ni_slver_070002c9;
+struct slver sha256_ctx_mgr_flush_sse_ni_slver = { 0x02c9, 0x00, 0x07 };
+
+#endif // HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm
b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm new file mode 100644 index 000000000..f9fb6d230 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm @@ -0,0 +1,65 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Values stored in the _status field of a job
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Threshold constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; When the number of lanes in use is <= the threshold, the flush routines
+; call the single-buffer (sb) function instead of the multi-buffer one.
+%define SHA256_SB_THRESHOLD_SSE 1
+%define SHA256_SB_THRESHOLD_AVX 1
+%define SHA256_SB_THRESHOLD_AVX2 1
+%define SHA256_SB_THRESHOLD_AVX512 1
+%define SHA256_NI_SB_THRESHOLD_SSE 4 ; shani is faster than sse sha256_mb
+%define SHA256_NI_SB_THRESHOLD_AVX512 6
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA256_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA256_JOB
+
+;;;   name              size    align
+FIELD _buffer,          8,      8       ; pointer to buffer
+FIELD _len,             8,      8       ; length in blocks (the C context code stores a block count in job.len)
+FIELD _result_digest,   8*4,    64      ; Digest (output): 8 words of 4 bytes
+FIELD _status,          4,      4       ; one of the STS_* values above
+FIELD _user_data,       8,      8
+
+; Total size and alignment of the structure as accumulated by the FIELD macros
+%assign _SHA256_JOB_size _FIELD_OFFSET
+%assign _SHA256_JOB_align _STRUCT_ALIGN
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c new file mode 100644 index 000000000..28f1f5118 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c @@ -0,0 +1,146 @@
+/**********************************************************************
+  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS (SHA256_MAX_LANES - 1)
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data: fills buf[0..buffer_size-1] from rand()
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+    long i;
+    for (i = 0; i < buffer_size; i++)
+        buf[i] = (unsigned char)rand();
+}
+
+// Prints the remaining length (in bytes) of every lane in the manager and
+// returns how many lanes still hold the same non-zero length as on the
+// previous call. Keeps the previous lengths in static storage, so it is not
+// reentrant.
+uint8_t lens_print_and_check(SHA256_HASH_CTX_MGR * mgr)
+{
+    static int32_t last_lens[SHA256_MAX_LANES] = { 0 };
+    int32_t len;
+    uint8_t num_unchanged = 0;
+    int i;
+    for (i = 0; i < SHA256_MAX_LANES; i++) {
+        len = (int32_t) mgr->mgr.lens[i];
+        // lens[i] in mgr consists of block_count<<4 | lane_index;
+        // drop the lane index and convert blocks to bytes (64 bytes per block)
+        len = (len >= 16) ? (len >> 4 << 6) : 0;
+        printf("\t%d", len);
+        if (last_lens[i] > 0 && last_lens[i] == len)
+            num_unchanged += 1;
+        last_lens[i] = len;
+    }
+    printf("\n");
+    return num_unchanged;
+}
+
+// Submits TEST_BUFS jobs of different lengths, flushes them one by one while
+// printing the per-lane lengths, and checks every digest against sha256_ref.
+// Returns the number of mismatching digest words (0 on success), or 1 when an
+// allocation fails.
+int main(void)
+{
+    SHA256_HASH_CTX_MGR *mgr = NULL;
+    SHA256_HASH_CTX ctxpool[TEST_BUFS];
+    uint32_t i, j, fail = 0;
+    unsigned char *bufs[TEST_BUFS] = { NULL };
+    uint32_t lens[TEST_BUFS];
+    uint8_t num_ret, num_unchanged = 0;
+    int ret;
+
+    printf("sha256_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN);
+
+    ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+    if ((ret != 0) || (mgr == NULL)) {
+        printf("posix_memalign failed test aborted\n");
+        return 1;
+    }
+
+    sha256_ctx_mgr_init(mgr);
+
+    srand(TEST_SEED);
+
+    for (i = 0; i < TEST_BUFS; i++) {
+        // Allocate and fill buffer
+        lens[i] = TEST_LEN / SHA256_MAX_LANES * (i + 1);
+        bufs[i] = (unsigned char *)malloc(lens[i]);
+        if (bufs[i] == NULL) {
+            printf("malloc failed test aborted\n");
+            fail = 1;
+            goto cleanup;    // release the buffers allocated so far
+        }
+        rand_buffer(bufs[i], lens[i]);
+    }
+
+    for (i = 0; i < TEST_BUFS; i++) {
+        // Init ctx contexts
+        hash_ctx_init(&ctxpool[i]);
+        ctxpool[i].user_data = (void *)((uint64_t) i);
+
+        // Run reference test
+        sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+        // Submit the whole buffer as one job to the multi-buffer manager
+        sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+    }
+
+    printf("Changes of lens inside mgr:\n");
+    lens_print_and_check(mgr);
+    while (sha256_ctx_mgr_flush(mgr)) {
+        num_ret = lens_print_and_check(mgr);
+        num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret;
+    }
+    printf("Info of sha256_mb lens prints over\n");
+
+    for (i = 0; i < TEST_BUFS; i++) {
+        for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+            if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+                fail++;
+                printf("Test%d fixed size, digest%d "
+                       "fail 0x%08X <=> 0x%08X \n",
+                       i, j, ctxpool[i].job.result_digest[j],
+                       digest_ref[i][j]);
+            }
+        }
+    }
+
+    if (fail)
+        printf("Test failed function check %d\n", fail);
+    else if (num_unchanged)
+        printf("SHA-NI is used when %d or %d jobs are uncompleted\n",
+               num_unchanged, num_unchanged + 1);
+    else
+        printf("SHA-NI is not used, or used for last job\n");
+
+cleanup:
+    // Entries never allocated are still NULL, and free(NULL) is a no-op
+    for (i = 0; i < TEST_BUFS; i++)
+        free(bufs[i]);
+    // NOTE(review): mgr is left to process exit on purpose - confirm whether
+    // this platform's posix_memalign pairs with free() before releasing it here.
+
+    return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm new file mode 100644 index 000000000..ebba9ca36 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm @@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA256 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Per-lane bookkeeping: which job currently occupies the lane (0 = lane empty)
+START_FIELDS ; LANE_DATA
+;;;   name              size    align
+FIELD _job_in_lane,     8,      8       ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Arguments handed to the multi-buffer hash routines. The layout is sized for
+; 16 lanes and shared by the X4, X8 and X16 variants (see the %assigns below).
+START_FIELDS ; SHA256_ARGS_X16
+;;;   name              size    align
+FIELD _digest,          4*8*16, 4       ; transposed digest
+FIELD _data_ptr,        8*16,   8       ; array of pointers to data
+END_FIELDS
+
+%assign _SHA256_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA256_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X8_align _STRUCT_ALIGN
+%assign _SHA256_ARGS_X16_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X16_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Job manager state. _args is the first field, so the manager address doubles
+; as the args address in calls to the hash routines.
+START_FIELDS ; MB_MGR
+;;;   name              size    align
+FIELD _args,            _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align
+FIELD _lens,            4*16,   8       ; one dword per lane: (length << 4) | lane index
+FIELD _unused_lanes,    8,      8       ; free lane indices packed 4 bits each (flush pushes with shl 4 / or)
+FIELD _ldata,           _LANE_DATA_size*16, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4,      4       ; number of lanes currently holding a job
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+; Convenience offsets of the args members relative to the manager base
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm new file mode 100644 index 000000000..69f27f42d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm @@ -0,0 +1,253 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_avx
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+; NOTE: several names below alias the same register; they are used in
+; different phases of the routine and must not be live at the same time.
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx(SHA256_MB_JOB_MGR *state)
+; arg 1 : rdi (elf64) / rcx (win64) : state
+;
+; Completes one in-flight job even though not all 4 lanes are occupied:
+; empty lanes are pointed at the data of an occupied lane and given the
+; maximum length, the shortest real job is run to completion, and that job
+; is returned in rax. Returns NULL (rax = 0) when no lane is in use.
+mk_global sha256_mb_mgr_flush_avx, function
+sha256_mb_mgr_flush_avx:
+        endbranch
+
+        ; save the registers this routine preserves for its caller
+        sub rsp, STACK_SPACE
+        mov [rsp + _GPR_SAVE + 8*0], rbx
+        mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+        mov [rsp + _GPR_SAVE + 8*2], rsi
+        vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+        vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+        vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+        vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+        vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+        vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+        vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+        vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+        vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+        vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+        ; use num_lanes_inuse to judge all lanes are empty
+        cmp dword [state + _num_lanes_inuse], 0
+        jz return_null
+
+        ; find a lane with a non-null job
+        ; (idx starts at 0 and is overwritten by every occupied lane 1..3,
+        ;  so it ends up as the highest occupied lane, or 0)
+        xor idx, idx
+        cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+        cmovne idx, [one]
+        cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+        cmovne idx, [two]
+        cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+        cmovne idx, [three]
+
+        ; copy idx to empty lanes
+        ; (each empty lane gets lane idx's data pointer and length 0xFFFFFFFF,
+        ;  so it can never be selected by the min-length search below)
+copy_lane_data:
+        mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+        cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+        jne APPEND(skip_,I)
+        mov [state + _args + _data_ptr + 8*I], tmp
+        mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+        ; Find min length
+        ; (each lens entry is (length << 4) | lane, so the minimum carries
+        ;  the index of the lane that finishes first in its low nibble)
+        mov DWORD(lens0), [state + _lens + 0*4]
+        mov idx, lens0
+        mov DWORD(lens1), [state + _lens + 1*4]
+        cmp lens1, idx
+        cmovb idx, lens1
+        mov DWORD(lens2), [state + _lens + 2*4]
+        cmp lens2, idx
+        cmovb idx, lens2
+        mov DWORD(lens3), [state + _lens + 3*4]
+        cmp lens3, idx
+        cmovb idx, lens3
+        mov len2, idx
+        and idx, 0xF            ; idx  = lane index of the shortest job
+        and len2, ~0xF          ; len2 = its length, still shifted left by 4
+        jz len_is_0             ; nothing left to hash for that lane
+
+        ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+        cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX
+        ja mb_processing
+
+        ; lensN-len2=idx
+        shr len2, 4
+        mov [state + _lens + idx*4], DWORD(idx)
+        mov r10, idx
+        or r10, 0x1000 ; avx has 4 lanes *4, r10b is idx, r10b2 is 16
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2, idx and nlane in r10
+        call sha256_opt_x1
+        ; state and idx are intact
+        jmp len_is_0
+
+mb_processing:
+        ; subtract the shortest length from every lane, then hash that many
+        ; blocks on all lanes at once
+        sub lens0, len2
+        sub lens1, len2
+        sub lens2, len2
+        sub lens3, len2
+        shr len2, 4
+        mov [state + _lens + 0*4], DWORD(lens0)
+        mov [state + _lens + 1*4], DWORD(lens1)
+        mov [state + _lens + 2*4], DWORD(lens2)
+        mov [state + _lens + 3*4], DWORD(lens3)
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call sha256_mb_x4_avx
+        ; state and idx are intact
+
+len_is_0:
+        ; process completed job "idx"
+        imul lane_data, idx, _LANE_DATA_size
+        lea lane_data, [state + _ldata + lane_data]
+
+        ; detach the job from its lane, mark it completed and push the lane
+        ; index back onto the unused-lanes list
+        mov job_rax, [lane_data + _job_in_lane]
+        mov qword [lane_data + _job_in_lane], 0
+        mov dword [job_rax + _status], STS_COMPLETED
+        mov unused_lanes, [state + _unused_lanes]
+        shl unused_lanes, 4
+        or unused_lanes, idx
+        mov [state + _unused_lanes], unused_lanes
+
+        sub dword [state + _num_lanes_inuse], 1
+
+        ; gather the 8 digest words of lane idx from the transposed digest
+        ; (consecutive words are 16 bytes apart) into the job's result_digest
+        vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+        vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+        vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+        vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+        vmovd xmm1, [state + _args_digest + 4*idx + 4*16]
+        vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+        vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+        vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+        vmovdqa [job_rax + _result_digest + 0*16], xmm0
+        vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+        ; restore the saved registers; rax holds the return value
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+        vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+        vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+        vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+        vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+        vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+        vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+        vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+        vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+        vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+        mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+        mov rbx, [rsp + _GPR_SAVE + 8*0]
+        mov r12, [rsp + _GPR_SAVE + 8*1]
+        add rsp, STACK_SPACE
+
+        ret
+
+return_null:
+        ; no job in flight: return NULL
+        xor job_rax, job_rax
+        jmp return
+
+section .data align=16
+
+; lane-index constants used as memory operands by the cmovne instructions above
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm new file mode 100644 index
000000000..0ee0589cf --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm @@ -0,0 +1,274 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x8_avx2
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+; NOTE: several names below alias the same register; they are used in
+; different phases of the routine and must not be live at the same time.
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x8_avx2 and sha256_opt_x1
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx2(SHA256_MB_JOB_MGR *state)
+; arg 1 : rdi (elf64) / rcx (win64) : state
+;
+; Completes one in-flight job even though not all 8 lanes are occupied:
+; empty lanes are pointed at the data of an occupied lane and given the
+; maximum length, the shortest real job is run to completion, and that job
+; is returned in rax. Returns NULL (rax = 0) when no lane is in use.
+mk_global sha256_mb_mgr_flush_avx2, function
+sha256_mb_mgr_flush_avx2:
+        endbranch
+        ; save the registers this routine preserves for its caller
+        sub rsp, STACK_SPACE
+        mov [rsp + _GPR_SAVE + 8*0], rbx
+        mov [rsp + _GPR_SAVE + 8*3], rbp
+        mov [rsp + _GPR_SAVE + 8*4], r12
+        mov [rsp + _GPR_SAVE + 8*5], r13
+        mov [rsp + _GPR_SAVE + 8*6], r14
+        mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+        mov [rsp + _GPR_SAVE + 8*1], rsi
+        mov [rsp + _GPR_SAVE + 8*2], rdi
+        vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+        vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+        vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+        vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+        vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+        vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+        vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+        vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+        vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+        vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+        ; use num_lanes_inuse to judge all lanes are empty
+        cmp dword [state + _num_lanes_inuse], 0
+        jz return_null
+
+        ; find a lane with a non-null job
+        ; (idx starts at 0 and is overwritten by every occupied lane 1..7,
+        ;  so it ends up as the highest occupied lane, or 0)
+        xor idx, idx
+        cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+        cmovne idx, [one]
+        cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+        cmovne idx, [two]
+        cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+        cmovne idx, [three]
+        cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+        cmovne idx, [four]
+        cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+        cmovne idx, [five]
+        cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+        cmovne idx, [six]
+        cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+        cmovne idx, [seven]
+
+        ; copy idx to empty lanes
+        ; (each empty lane gets lane idx's data pointer and length 0xFFFFFFFF,
+        ;  so it can never be selected by the min-length search below)
+copy_lane_data:
+        mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+        cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+        jne APPEND(skip_,I)
+        mov [state + _args + _data_ptr + 8*I], tmp
+        mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+        ; Find min length
+        ; (each lens entry is (length << 4) | lane, so the minimum carries
+        ;  the index of the lane that finishes first in its low nibble)
+        vmovdqa xmm0, [state + _lens + 0*16]
+        vmovdqa xmm1, [state + _lens + 1*16]
+
+        vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+        vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+        vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+        vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+        vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+        vmovd DWORD(idx), xmm2
+        mov len2, idx
+        and idx, 0xF            ; idx  = lane index of the shortest job
+        shr len2, 4             ; len2 = its length
+        jz len_is_0             ; nothing left to hash for that lane
+
+        ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+        cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX2
+        ja mb_processing
+
+        ; lensN-len2=idx
+        mov [state + _lens + idx*4], DWORD(idx)
+        mov r10, idx
+        or r10, 0x2000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2, idx and nlane in r10
+        call sha256_opt_x1
+        ; state and idx are intact
+        jmp len_is_0
+
+mb_processing:
+
+        ; broadcast the shortest length (lane nibble cleared) and subtract it
+        ; from every lane before hashing that many blocks on all lanes at once
+        vpand xmm2, xmm2, [rel clear_low_nibble]
+        vpshufd xmm2, xmm2, 0
+
+        vpsubd xmm0, xmm0, xmm2
+        vpsubd xmm1, xmm1, xmm2
+
+        vmovdqa [state + _lens + 0*16], xmm0
+        vmovdqa [state + _lens + 1*16], xmm1
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call sha256_mb_x8_avx2
+        ; state and idx are intact
+
+len_is_0:
+        ; process completed job "idx"
+        imul lane_data, idx, _LANE_DATA_size
+        lea lane_data, [state + _ldata + lane_data]
+
+        ; detach the job from its lane, mark it completed and push the lane
+        ; index back onto the unused-lanes list
+        mov job_rax, [lane_data + _job_in_lane]
+        mov qword [lane_data + _job_in_lane], 0
+        mov dword [job_rax + _status], STS_COMPLETED
+        mov unused_lanes, [state + _unused_lanes]
+        shl unused_lanes, 4
+        or unused_lanes, idx
+        mov [state + _unused_lanes], unused_lanes
+
+        sub dword [state + _num_lanes_inuse], 1
+
+        ; gather the 8 digest words of lane idx from the transposed digest
+        ; (consecutive words are 4*8 bytes apart) into the job's result_digest
+        vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+        vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+        vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+        vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+        vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+        vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+        vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+        vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+        vmovdqa [job_rax + _result_digest + 0*16], xmm0
+        vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+        ; restore the saved registers; rax holds the return value
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+        vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+        vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+        vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+        vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+        vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+        vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+        vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+        vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+        vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+        mov rsi, [rsp + _GPR_SAVE + 8*1]
+        mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+        mov rbx, [rsp + _GPR_SAVE + 8*0]
+        mov rbp, [rsp + _GPR_SAVE + 8*3]
+        mov r12, [rsp + _GPR_SAVE + 8*4]
+        mov r13, [rsp + _GPR_SAVE + 8*5]
+        mov r14, [rsp + _GPR_SAVE + 8*6]
+        mov r15, [rsp + _GPR_SAVE + 8*7]
+        add rsp, STACK_SPACE
+
+        ret
+
+return_null:
+        ; no job in flight: return NULL
+        xor job_rax, job_rax
+        jmp return
+
+section .data align=16
+
+align 16
+; mask that clears the lane-index nibble of the low dword (and all upper dwords)
+clear_low_nibble:
+        dq 0x00000000FFFFFFF0, 0x0000000000000000
+; lane-index constants used as memory operands by the cmovne instructions above
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm new file mode 100644 index 000000000..201cd42b0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm @@ -0,0 +1,288 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha256_mb_x16_avx512
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x16_avx512 and sha256_opt_x1
+%define idx rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8 (rsp is 8 mod 16 on entry, so this keeps the vmovdqa save slots 16-byte aligned)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx512(SHA256_MB_JOB_MGR *state)
+; arg 1 : rdi (elf64) / rcx (win64) : state; returns the completed job in rax, or NULL when no lane is in use
+mk_global sha256_mb_mgr_flush_avx512, function
+sha256_mb_mgr_flush_avx512:
+	endbranch
+	sub rsp, STACK_SPACE	; reserve the register save area
+	mov [rsp + _GPR_SAVE + 8*0], rbx
+	mov [rsp + _GPR_SAVE + 8*3], rbp
+	mov [rsp + _GPR_SAVE + 8*4], r12
+	mov [rsp + _GPR_SAVE + 8*5], r13
+	mov [rsp + _GPR_SAVE + 8*6], r14
+	mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + _GPR_SAVE + 8*1], rsi	; rsi, rdi and xmm6-xmm15 are saved for win64 builds only
+	mov [rsp + _GPR_SAVE + 8*2], rdi
+	vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+	vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	cmp num_lanes_inuse, 0
+	jz return_null	; no lane in use: nothing to flush, return NULL
+
+	; find a lane with a non-null job
+	xor idx, idx	; default to lane 0
+%assign I 1
+%rep 15
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [APPEND(lane_,I)]	; lane_I holds the constant I; the last occupied lane checked wins
+%assign I (I+1)
+%endrep
+
+
+	; copy idx to empty lanes
+copy_lane_data:
+	mov tmp, [state + _args + _data_ptr + 8*idx]	; data pointer of the occupied lane found above
+
+%assign I 0
+%rep 16
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args + _data_ptr + 8*I], tmp	; empty lane borrows that data pointer
+	mov dword [state + _lens + 4*I], 0xFFFFFFFF	; largest lens value, so empty lanes sort last in the min search
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	vmovdqu ymm0, [state + _lens + 0*32]
+	vmovdqu ymm1, [state + _lens + 1*32]
+
+	vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+	vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+	vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+	vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+	vmovd DWORD(idx), xmm2	; min lens entry: low nibble = lane index, upper bits = length
+	mov len2, idx
+	and idx, 0xF	; idx = lane index
+	shr len2, 4	; len2 = remaining length of that lane
+	jz len_is_0	; length 0: that job is already complete
+
+	; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+	cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX512
+	ja mb_processing	; more lanes in use than the threshold: use the multi-buffer kernel
+
+	; lensN-len2=idx (after the whole length is consumed only the lane index remains)
+	mov [state + _lens + idx*4], DWORD(idx)
+	mov r10, idx
+	or r10, 0x4000 ; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call sha256_opt_x1
+	; state and idx are intact
+	jmp len_is_0
+
+mb_processing:
+
+	vpand ymm2, ymm2, [rel clear_low_nibble]	; drop the lane index, keep the length bits of dword 0 in each 128-bit half
+	vpshufd ymm2, ymm2, 0	; broadcast dword 0 within each 128-bit half
+
+	vpsubd ymm0, ymm0, ymm2	; subtract the min length from all 16 lens entries
+	vpsubd ymm1, ymm1, ymm2
+
+	vmovdqu [state + _lens + 0*32], ymm0
+	vmovdqu [state + _lens + 1*32], ymm1
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha256_mb_x16_avx512
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]	; rax = job to return
+	mov qword [lane_data + _job_in_lane], 0	; mark the lane empty
+	mov dword [job_rax + _status], STS_COMPLETED
+	mov unused_lanes, [state + _unused_lanes]	; rbx is reused here (lane_data is no longer needed)
+	shl unused_lanes, 4
+	or unused_lanes, idx	; push the freed lane index as the new low nibble
+	mov [state + _unused_lanes], unused_lanes
+
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	sub num_lanes_inuse, 1
+	mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]	; digest is stored transposed: word w of lane idx is at 4*idx + w*4*16
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+	vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+	vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+	vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+	vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+	vmovdqa [job_rax + _result_digest + 0*16], xmm0	; aligned store: result_digest must be 16-byte aligned
+	vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+	vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov rsi, [rsp + _GPR_SAVE + 8*1]
+	mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov rbx, [rsp + _GPR_SAVE + 8*0]
+	mov rbp, [rsp + _GPR_SAVE + 8*3]
+	mov r12, [rsp + _GPR_SAVE + 8*4]
+	mov r13, [rsp + _GPR_SAVE + 8*5]
+	mov r14, [rsp + _GPR_SAVE + 8*6]
+	mov r15, [rsp + _GPR_SAVE + 8*7]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax	; rax = NULL, then share the common epilogue
+	jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000	; 32-byte mask: one copy per 128-bit half of ymm2
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_mgr_flush_avx512
+no_sha256_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
new file mode 100644
index 000000000..7bc9d32a4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
@@ -0,0 +1,295 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	%ifdef HAVE_AS_KNOWS_SHANI
+
+extern sha256_mb_x16_avx512
+extern sha256_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x16_avx512 and sha256_ni_x1
+%define idx rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8 (rsp is 8 mod 16 on entry, so this keeps the vmovdqa save slots 16-byte aligned)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx512_ni(SHA256_MB_JOB_MGR *state)
+; arg 1 : rdi (elf64) / rcx (win64) : state; returns the completed job in rax, or NULL when no lane is in use
+mk_global sha256_mb_mgr_flush_avx512_ni, function
+sha256_mb_mgr_flush_avx512_ni:
+	endbranch
+	sub rsp, STACK_SPACE	; reserve the register save area
+	mov [rsp + _GPR_SAVE + 8*0], rbx
+	mov [rsp + _GPR_SAVE + 8*3], rbp
+	mov [rsp + _GPR_SAVE + 8*4], r12
+	mov [rsp + _GPR_SAVE + 8*5], r13
+	mov [rsp + _GPR_SAVE + 8*6], r14
+	mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + _GPR_SAVE + 8*1], rsi	; rsi, rdi and xmm6-xmm15 are saved for win64 builds only
+	mov [rsp + _GPR_SAVE + 8*2], rdi
+	vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+	vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	cmp num_lanes_inuse, 0
+	jz return_null	; no lane in use: nothing to flush, return NULL
+
+	; find a lane with a non-null job
+	xor idx, idx	; default to lane 0
+%assign I 1
+%rep 15
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [APPEND(lane_,I)]	; lane_I holds the constant I; the last occupied lane checked wins
+%assign I (I+1)
+%endrep
+
+
+	; copy idx to empty lanes
+copy_lane_data:
+	mov tmp, [state + _args + _data_ptr + 8*idx]	; data pointer of the occupied lane found above
+
+%assign I 0
+%rep 16
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args + _data_ptr + 8*I], tmp	; empty lane borrows that data pointer
+	mov dword [state + _lens + 4*I], 0xFFFFFFFF	; largest lens value, so empty lanes sort last in the min search
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	vmovdqu ymm0, [state + _lens + 0*32]
+	vmovdqu ymm1, [state + _lens + 1*32]
+
+	vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+	vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+	vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+	vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+	vmovd DWORD(idx), xmm2	; min lens entry: low nibble = lane index, upper bits = length
+	mov len2, idx
+	and idx, 0xF	; idx = lane index
+	shr len2, 4	; len2 = remaining length of that lane
+	jz len_is_0	; length 0: that job is already complete
+
+	; compare with shani-sb threshold, if num_lanes_inuse <= threshold, using shani func
+	cmp dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_AVX512
+	ja mb_processing	; more lanes in use than the threshold: use the multi-buffer kernel
+
+	; lensN-len2=idx (after the whole length is consumed only the lane index remains)
+	mov [state + _lens + idx*4], DWORD(idx)
+	mov r10, idx
+	or r10, 0x4000 ; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call sha256_ni_x1
+	; state and idx are intact
+	jmp len_is_0
+
+mb_processing:
+
+	vpand ymm2, ymm2, [rel clear_low_nibble]	; drop the lane index, keep the length bits of dword 0 in each 128-bit half
+	vpshufd ymm2, ymm2, 0	; broadcast dword 0 within each 128-bit half
+
+	vpsubd ymm0, ymm0, ymm2	; subtract the min length from all 16 lens entries
+	vpsubd ymm1, ymm1, ymm2
+
+	vmovdqu [state + _lens + 0*32], ymm0
+	vmovdqu [state + _lens + 1*32], ymm1
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha256_mb_x16_avx512
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]	; rax = job to return
+	mov qword [lane_data + _job_in_lane], 0	; mark the lane empty
+	mov dword [job_rax + _status], STS_COMPLETED
+	mov unused_lanes, [state + _unused_lanes]	; rbx is reused here (lane_data is no longer needed)
+	shl unused_lanes, 4
+	or unused_lanes, idx	; push the freed lane index as the new low nibble
+	mov [state + _unused_lanes], unused_lanes
+
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	sub num_lanes_inuse, 1
+	mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]	; digest is stored transposed: word w of lane idx is at 4*idx + w*4*16
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+	vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+	vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+	vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+	vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+	vmovdqa [job_rax + _result_digest + 0*16], xmm0	; aligned store: result_digest must be 16-byte aligned
+	vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+	vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov rsi, [rsp + _GPR_SAVE + 8*1]
+	mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov rbx, [rsp + _GPR_SAVE + 8*0]
+	mov rbp, [rsp + _GPR_SAVE + 8*3]
+	mov r12, [rsp + _GPR_SAVE + 8*4]
+	mov r13, [rsp + _GPR_SAVE + 8*5]
+	mov r14, [rsp + _GPR_SAVE + 8*6]
+	mov r15, [rsp + _GPR_SAVE + 8*7]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax	; rax = NULL, then share the common epilogue
+	jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000	; 32-byte mask: one copy per 128-bit half of ymm2
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+	%else
+	%ifidn __OUTPUT_FORMAT__, win64
+	global no_sha256_mb_mgr_flush_avx512_ni
+	no_sha256_mb_mgr_flush_avx512_ni:
+	%endif
+	%endif ; HAVE_AS_KNOWS_SHANI
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+	global no_sha256_mb_mgr_flush_avx512_ni
+	no_sha256_mb_mgr_flush_avx512_ni:
+	%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm
new file mode 100644
index
000000000..69ae4bad5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm
@@ -0,0 +1,254 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_sse
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8 (rsp is 8 mod 16 on entry, so this keeps the movdqa save slots 16-byte aligned)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_sse(SHA256_MB_JOB_MGR *state)
+; arg 1 : rdi (elf64) / rcx (win64) : state; returns the completed job in rax, or NULL when no lane is in use
+mk_global sha256_mb_mgr_flush_sse, function
+sha256_mb_mgr_flush_sse:
+	endbranch
+
+	sub rsp, STACK_SPACE	; reserve the register save area
+	mov [rsp + _GPR_SAVE + 8*0], rbx
+	mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + _GPR_SAVE + 8*2], rsi	; rsi and xmm6-xmm15 are saved for win64 builds only
+	movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+	movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+	movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+	movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+	movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+	movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+	movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+	movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+	movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+	movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to judge all lanes are empty
+	cmp dword [state + _num_lanes_inuse], 0
+	jz return_null	; nothing to flush, return NULL
+
+	; find a lane with a non-null job
+	xor idx, idx	; default to lane 0
+	cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [one]	; later checks overwrite idx, so the last occupied lane wins
+	cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [two]
+	cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [three]
+
+	; copy idx to empty lanes
+copy_lane_data:
+	mov tmp, [state + _args + _data_ptr + 8*idx]	; data pointer of the occupied lane found above
+
+%assign I 0
+%rep 4
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args + _data_ptr + 8*I], tmp	; empty lane borrows that data pointer
+	mov dword [state + _lens + 4*I], 0xFFFFFFFF	; largest lens value, so empty lanes sort last in the min search
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	mov DWORD(lens0), [state + _lens + 0*4]
+	mov idx, lens0	; running minimum starts at lane 0
+	mov DWORD(lens1), [state + _lens + 1*4]
+	cmp lens1, idx
+	cmovb idx, lens1	; unsigned min
+	mov DWORD(lens2), [state + _lens + 2*4]
+	cmp lens2, idx
+	cmovb idx, lens2
+	mov DWORD(lens3), [state + _lens + 3*4]
+	cmp lens3, idx
+	cmovb idx, lens3
+	mov len2, idx	; min lens entry: low nibble = lane index, upper bits = length
+	and idx, 0xF	; idx = lane index
+	and len2, ~0xF	; keep the length bits in place so they can be subtracted from lens directly
+	jz len_is_0	; length 0: that job is already complete
+
+	; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+	cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_SSE
+	ja mb_processing	; more lanes in use than the threshold: use the multi-buffer kernel
+
+	; lensN-len2=idx (after the whole length is consumed only the lane index remains)
+	shr len2, 4
+	mov [state + _lens + idx*4], DWORD(idx)
+	mov r10, idx
+	or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call sha256_opt_x1
+	; state and idx are intact
+	jmp len_is_0
+
+mb_processing:
+
+	sub lens0, len2	; subtract the min length from every lane; the lane-index nibbles are untouched
+	sub lens1, len2
+	sub lens2, len2
+	sub lens3, len2
+	shr len2, 4	; len2 now holds the plain length passed to the kernel
+	mov [state + _lens + 0*4], DWORD(lens0)
+	mov [state + _lens + 1*4], DWORD(lens1)
+	mov [state + _lens + 2*4], DWORD(lens2)
+	mov [state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha256_mb_x4_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]	; rax = job to return
+	mov qword [lane_data + _job_in_lane], 0	; mark the lane empty
+	mov dword [job_rax + _status], STS_COMPLETED
+	mov unused_lanes, [state + _unused_lanes]	; rbx is reused here (lane_data is no longer needed)
+	shl unused_lanes, 4
+	or unused_lanes, idx	; push the freed lane index as the new low nibble
+	mov [state + _unused_lanes], unused_lanes
+
+	sub dword [state + _num_lanes_inuse], 1
+
+	movd xmm0, [state + _args_digest + 4*idx + 0*16]	; digest is stored transposed: word w of lane idx is at 4*idx + w*16
+	pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	movd xmm1, [state + _args_digest + 4*idx + 4*16]
+	pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+	pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+	pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+	movdqa [job_rax + _result_digest + 0*16], xmm0	; aligned store: result_digest must be 16-byte aligned
+	movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+	movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+	movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+	movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+	movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+	movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+	movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+	movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+	movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+	movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov rbx, [rsp + _GPR_SAVE + 8*0]
+	mov r12, [rsp + _GPR_SAVE + 8*1]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax	; rax = NULL, then share the common epilogue
+	jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm
new file mode 100644
index 000000000..43b8fcbe4
--- /dev/null
+++
b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha256_mb_x4_sse
+extern sha256_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8 (rsp is 8 mod 16 on entry, so this keeps the movdqa save slots 16-byte aligned)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_sse_ni(SHA256_MB_JOB_MGR *state)
+; arg 1 : rdi (elf64) / rcx (win64) : state; returns the completed job in rax, or NULL when no lane is in use
+mk_global sha256_mb_mgr_flush_sse_ni, function
+sha256_mb_mgr_flush_sse_ni:
+	endbranch
+
+	sub rsp, STACK_SPACE	; reserve the register save area
+	mov [rsp + _GPR_SAVE + 8*0], rbx
+	mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + _GPR_SAVE + 8*2], rsi	; rsi and xmm6-xmm15 are saved for win64 builds only
+	movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+	movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+	movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+	movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+	movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+	movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+	movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+	movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+	movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+	movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to judge all lanes are empty
+	cmp dword [state + _num_lanes_inuse], 0
+	jz return_null	; nothing to flush, return NULL
+
+	; find a lane with a non-null job
+	xor idx, idx	; default to lane 0
+	cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [one]	; later checks overwrite idx, so the last occupied lane wins
+	cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [two]
+	cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [three]
+
+	; copy idx to empty lanes
+copy_lane_data:
+	mov tmp, [state + _args + _data_ptr + 8*idx]	; data pointer of the occupied lane found above
+
+%assign I 0
+%rep 4
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args + _data_ptr + 8*I], tmp	; empty lane borrows that data pointer
+	mov dword [state + _lens + 4*I], 0xFFFFFFFF	; largest lens value, so empty lanes sort last in the min search
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	mov DWORD(lens0), [state + _lens + 0*4]
+	mov idx, lens0	; running minimum starts at lane 0
+	mov DWORD(lens1), [state + _lens + 1*4]
+	cmp lens1, idx
+	cmovb idx, lens1	; unsigned min
+	mov DWORD(lens2), [state + _lens + 2*4]
+	cmp lens2, idx
+	cmovb idx, lens2
+	mov DWORD(lens3), [state + _lens + 3*4]
+	cmp lens3, idx
+	cmovb idx, lens3
+	mov len2, idx	; min lens entry: low nibble = lane index, upper bits = length
+	and idx, 0xF	; idx = lane index
+	and len2, ~0xF	; keep the length bits in place so they can be subtracted from lens directly
+	jz len_is_0	; length 0: that job is already complete
+
+	; compare with shani-sb threshold, if num_lanes_inuse <= threshold, using shani func
+	cmp dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_SSE
+	ja mb_processing	; more lanes in use than the threshold: use the multi-buffer kernel
+
+	; lensN-len2=idx (after the whole length is consumed only the lane index remains)
+	shr len2, 4
+	mov [state + _lens + idx*4], DWORD(idx)
+	mov r10, idx
+	or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+	; "state" and "args" are the same address, arg1
+	; len is arg2, idx and nlane in r10
+	call sha256_ni_x1
+	; state and idx are intact
+	jmp len_is_0
+
+mb_processing:
+
+	sub lens0, len2	; subtract the min length from every lane; the lane-index nibbles are untouched
+	sub lens1, len2
+	sub lens2, len2
+	sub lens3, len2
+	shr len2, 4	; len2 now holds the plain length passed to the kernel
+	mov [state + _lens + 0*4], DWORD(lens0)
+	mov [state + _lens + 1*4], DWORD(lens1)
+	mov [state + _lens + 2*4], DWORD(lens2)
+	mov [state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha256_mb_x4_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]	; rax = job to return
+	mov qword [lane_data + _job_in_lane], 0	; mark the lane empty
+	mov dword [job_rax + _status], STS_COMPLETED
+	mov unused_lanes, [state + _unused_lanes]	; rbx is reused here (lane_data is no longer needed)
+	shl unused_lanes, 4
+	or unused_lanes, idx	; push the freed lane index as the new low nibble
+	mov [state + _unused_lanes], unused_lanes
+
+	sub dword [state + _num_lanes_inuse], 1
+
+	movd xmm0, [state + _args_digest + 4*idx + 0*16]	; digest is stored transposed: word w of lane idx is at 4*idx + w*16
+	pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	movd xmm1, [state + _args_digest + 4*idx + 4*16]
+	pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+	pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+	pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+	movdqa [job_rax + _result_digest + 0*16], xmm0	; aligned store: result_digest must be 16-byte aligned
+	movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+	movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+	movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+	movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+	movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+	movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+	movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+	movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+	movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+	movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov rbx, [rsp + _GPR_SAVE + 8*0]
+	mov r12, [rsp + _GPR_SAVE + 8*1]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax	; rax = NULL, then share the common epilogue
+	jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
+%else
+	%ifidn __OUTPUT_FORMAT__, win64
+	global no_sha256_mb_mgr_flush_sse_ni
+	no_sha256_mb_mgr_flush_sse_ni:
+	%endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git
a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c new file mode 100644 index 000000000..903fb733b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "sha256_mb.h" + +/* Reset an SHA256 multi-buffer job manager for the AVX2 (x8) code path: no lanes in use, every per-lane length and job pointer cleared. */ +void sha256_mb_mgr_init_avx2(SHA256_MB_JOB_MGR * state) +{ + unsigned int j; + /* Free-lane stack, one lane id per nibble (ids 0..7, lane 0 in the lowest nibble); the top nibble 0xF is a sentinel, not a lane id. */ + state->unused_lanes = 0xF76543210; + state->num_lanes_inuse = 0; + /* SHA256_X8_LANES is defined outside this file -- presumably 8; confirm in sha256_mb.h. */ + for (j = 0; j < SHA256_X8_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; /* no job attached to this lane */ + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c new file mode 100644 index 000000000..b875735f9 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha256_mb.h" + +/* Reset an SHA256 multi-buffer job manager for the AVX512 (x16) code path: no lanes in use, every per-lane length and job pointer cleared. */ +void sha256_mb_mgr_init_avx512(SHA256_MB_JOB_MGR * state) +{ + unsigned int j; + /* Free-lane stack, one lane id per nibble (ids 0..15, lane 0 in the lowest nibble). All 16 nibbles are lane ids, so there is no sentinel nibble here; the AVX512 submit routine compares num_lanes_inuse against 16 instead. */ + state->unused_lanes = 0xfedcba9876543210; + state->num_lanes_inuse = 0; + /* SHA256_MAX_LANES is defined outside this file -- presumably 16; confirm in sha256_mb.h. */ + for (j = 0; j < SHA256_MAX_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; /* no job attached to this lane */ + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c new file mode 100644 index 000000000..cf22c4aee --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution.
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha256_mb.h" + +/* Reset an SHA256 multi-buffer job manager for the SSE (x4) code path: no lanes in use, every per-lane length and job pointer cleared. */ +void sha256_mb_mgr_init_sse(SHA256_MB_JOB_MGR * state) +{ + unsigned int j; + /* Free-lane stack, one lane id per nibble (ids 0..3, lane 0 in the lowest nibble); the top nibble 0xF is a sentinel, not a lane id. */ + state->unused_lanes = 0xF3210; + state->num_lanes_inuse = 0; + /* SHA256_MIN_LANES is defined outside this file -- presumably 4; confirm in sha256_mb.h. */ + for (j = 0; j < SHA256_MIN_LANES; j++) { + state->lens[j] = 0; + state->ldata[j].job_in_lane = 0; /* no job attached to this lane */ + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm new file mode 100644 index 000000000..cb7d5790a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm @@ -0,0 +1,260 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "sha256_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha256_mb_x4_avx + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + + +; STACK_SPACE needs to be an odd multiple of 8 +; (rsp is 8 mod 16 on entry because of the pushed return address, so an odd multiple of 8 leaves rsp 16-byte aligned for the vmovdqa saves below) +; _XMM_SAVE and _GPR_SAVE are area sizes: the XMM slots live at [rsp, rsp + _XMM_SAVE) and the GPR slots start at rsp + _XMM_SAVE +%define _XMM_SAVE 16*10 +%define _GPR_SAVE 8*5 +%define STACK_SPACE _GPR_SAVE + _XMM_SAVE + +; SHA256_JOB* sha256_mb_mgr_submit_avx(SHA256_MB_JOB_MGR *state, SHA256_JOB *job) +; arg 1 : rdi (elf64) / rcx (win64) : state +; arg 2 : rsi (elf64) / rdx (win64) : job +; Queues job in a free lane. Returns NULL in rax while free lanes remain; once all four lanes are occupied it processes the shortest job and returns that completed job. +mk_global sha256_mb_mgr_submit_avx, function +sha256_mb_mgr_submit_avx: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + _XMM_SAVE + 8*0], rbx + mov [rsp + _XMM_SAVE + 8*1], rbp + mov [rsp + _XMM_SAVE + 8*2], r12 ; NOTE(review): r12 is not used in the code visible here -- presumably clobbered by sha256_mb_x4_avx; confirm +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _XMM_SAVE + 8*3], rsi + mov [rsp + _XMM_SAVE + 8*4], rdi + vmovdqa [rsp + 16*0], xmm6 + vmovdqa [rsp + 16*1], xmm7 + vmovdqa [rsp + 16*2], xmm8 + vmovdqa [rsp + 16*3], xmm9 + vmovdqa [rsp + 16*4], xmm10 + vmovdqa [rsp + 16*5], xmm11 + vmovdqa [rsp + 16*6], xmm12 + vmovdqa [rsp + 16*7], xmm13 + vmovdqa [rsp + 16*8], xmm14 + vmovdqa [rsp
+ 16*9], xmm15 +%endif + + ; Pop a free lane id from the low nibble of unused_lanes + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + ; lens[lane] = (len << 4) | lane, so the minimum over lens also identifies its lane + shl len, 4 + or len, lane + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4*lane], DWORD(len) + + ; Load digest words from result_digest + ; and scatter them into args_digest: word k of every lane is stored in row k (16 bytes = 4 lanes per row) + vmovdqa xmm0, [job + _result_digest + 0*16] + vmovdqa xmm1, [job + _result_digest + 1*16] + vmovd [state + _args_digest + 4*lane + 0*16], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3 + vmovd [state + _args_digest + 4*lane + 4*16], xmm1 + vpextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1 + vpextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2 + vpextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3 + + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + ; After the pop only the 0xF sentinel is left when every lane is occupied; otherwise return NULL without hashing + cmp unused_lanes, 0xF + jne return_null + +start_loop: + ; Find min length + mov DWORD(lens0), [state + _lens + 0*4] + mov idx, lens0 + mov DWORD(lens1), [state + _lens + 1*4] + cmp lens1, idx + cmovb idx, lens1 + mov DWORD(lens2), [state + _lens + 2*4] + cmp lens2, idx + cmovb idx, lens2 + mov DWORD(lens3), [state + _lens + 3*4] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF ; idx = lane holding the shortest job + and len2, ~0xF ; len2 = shortest length, still shifted left by 4 + jz len_is_0 ; nothing left to hash for that lane: complete it directly + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 4 ; unshifted length handed to the hash routine in arg2 + mov [state + _lens + 0*4], DWORD(lens0) + mov [state + _lens + 1*4], DWORD(lens1) + mov [state + _lens + 2*4], DWORD(lens2) + mov [state + _lens + 3*4], DWORD(lens3) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x4_avx + ; state and idx are intact + +len_is_0: + ;
process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + ; Push the freed lane id back onto the unused_lanes nibble stack + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + ; Gather the eight digest words of lane idx (one word per 16-byte row) into xmm0/xmm1 + vmovd xmm0, [state + _args_digest + 4*idx + 0*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*16] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 16*0] + vmovdqa xmm7, [rsp + 16*1] + vmovdqa xmm8, [rsp + 16*2] + vmovdqa xmm9, [rsp + 16*3] + vmovdqa xmm10, [rsp + 16*4] + vmovdqa xmm11, [rsp + 16*5] + vmovdqa xmm12, [rsp + 16*6] + vmovdqa xmm13, [rsp + 16*7] + vmovdqa xmm14, [rsp + 16*8] + vmovdqa xmm15, [rsp + 16*9] + mov rsi, [rsp + _XMM_SAVE + 8*3] + mov rdi, [rsp + _XMM_SAVE + 8*4] +%endif + mov rbx, [rsp + _XMM_SAVE + 8*0] + mov rbp, [rsp + _XMM_SAVE + 8*1] + mov r12, [rsp + _XMM_SAVE + 8*2] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax ; return NULL: job queued, nothing completed + jmp return + +section .data align=16 + +align 16 +; SHA-256 initial hash values. NOTE(review): not referenced by the code visible in this file. +H0: dd 0x6a09e667 +H1: dd 0xbb67ae85 +H2: dd 0x3c6ef372 +H3: dd 0xa54ff53a +H4: dd 0x510e527f +H5: dd 0x9b05688c +H6: dd 0x1f83d9ab +H7: dd 0x5be0cd19 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm new file mode 100644 index 000000000..af2fc89ea --- /dev/null +++
b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm @@ -0,0 +1,246 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "memcpy.asm" +%include "sha256_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha256_mb_x8_avx2 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define idx r8 +%define last_len r8 +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp + +%define tmp r9 + +%define lane_data r10 + + +; STACK_SPACE needs to be an odd multiple of 8 +; (rsp is 8 mod 16 on entry because of the pushed return address, so an odd multiple of 8 leaves rsp 16-byte aligned for the vmovdqa saves below) +; Layout: 8 GPR slots at rsp + 8*0..7, 10 XMM slots at rsp + 8*8 + 16*0..9, then 8 bytes of padding +%define STACK_SPACE 8*8 + 16*10 + 8 + +; SHA256_JOB* sha256_mb_mgr_submit_avx2(SHA256_MB_JOB_MGR *state, SHA256_JOB *job) +; arg 1 : rdi (elf64) / rcx (win64) : state +; arg 2 : rsi (elf64) / rdx (win64) : job +; Queues job in a free lane. Returns NULL in rax while free lanes remain; once all eight lanes are occupied it processes the shortest job and returns that completed job. +mk_global sha256_mb_mgr_submit_avx2, function +sha256_mb_mgr_submit_avx2: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp + mov [rsp + 8*4], r12 ; NOTE(review): r12-r15 are not used in the code visible here -- presumably clobbered by sha256_mb_x8_avx2; confirm + mov [rsp + 8*5], r13 + mov [rsp + 8*6], r14 + mov [rsp + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + ; Pop a free lane id from the low nibble of unused_lanes + mov unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status],
STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + ; lens[lane] = (len << 4) | lane, so the minimum over lens also identifies its lane + shl len, 4 + or len, lane + mov [state + _lens + 4*lane], DWORD(len) + + mov [lane_data + _job_in_lane], job + + ; Load digest words from result_digest + ; and scatter them into args_digest: word k of every lane is stored in row k (4*8 bytes = 8 lanes per row) + vmovdqu xmm0, [job + _result_digest + 0*16] + vmovdqu xmm1, [job + _result_digest + 1*16] + vmovd [state + _args_digest + 4*lane + 0*4*8], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*4*8], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*4*8], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*4*8], xmm0, 3 + vmovd [state + _args_digest + 4*lane + 4*4*8], xmm1 + vpextrd [state + _args_digest + 4*lane + 5*4*8], xmm1, 1 + vpextrd [state + _args_digest + 4*lane + 6*4*8], xmm1, 2 + vpextrd [state + _args_digest + 4*lane + 7*4*8], xmm1, 3 + + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + add dword [state + _num_lanes_inuse], 1 + ; After the pop only the 0xF sentinel is left when every lane is occupied; otherwise return NULL without hashing + cmp unused_lanes, 0xf + jne return_null + +start_loop: + ; Find min length + ; (x in the lane diagrams below marks a dword whose value does not matter) + vmovdqa xmm0, [state + _lens + 0*16] + vmovdqa xmm1, [state + _lens + 1*16] + + vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A} + vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C} + vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F} + vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E} + vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF ; idx = lane holding the shortest job + shr len2, 4 ; len2 = shortest length, handed to the hash routine in arg2 + jz len_is_0 ; nothing left to hash for that lane: complete it directly + + ; Broadcast the minimum with its lane nibble cleared and subtract it from every lane length + vpand xmm2, xmm2, [rel clear_low_nibble] + vpshufd xmm2, xmm2, 0 + + vpsubd xmm0, xmm0, xmm2 + vpsubd xmm1, xmm1, xmm2 + + vmovdqa [state + _lens + 0*16], xmm0 + vmovdqa [state + _lens + 1*16], xmm1 + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x8_avx2 + + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes,
[state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + ; Push the freed lane id back onto the unused_lanes nibble stack + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + sub dword [state + _num_lanes_inuse], 1 + + ; Gather the eight digest words of lane idx (one word per 32-byte row) into xmm0/xmm1 + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 8*8 + 16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + mov r12, [rsp + 8*4] + mov r13, [rsp + 8*5] + mov r14, [rsp + 8*6] + mov r15, [rsp + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax ; return NULL: job queued, nothing completed + jmp return + +section .data align=16 + +align 16 +; Mask that keeps only the low dword and clears its low (lane id) nibble +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm new file mode 100644 index 000000000..cdc477370 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm @@ -0,0 +1,261 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c)
2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "memcpy.asm" +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +extern sha256_mb_x16_avx512 + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define idx r8 +%define last_len r8 +%define p r11 +%define start_offset r11 +%define num_lanes_inuse r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp + +%define tmp r9 + +%define lane_data r10 + + +; STACK_SPACE needs to be an odd multiple of 8 +; (rsp is 8 mod 16 on entry because of the pushed return address, so an odd multiple of 8 leaves rsp 16-byte aligned for the vmovdqa saves below) +; Layout: 8 GPR slots at rsp + 8*0..7, 10 XMM slots at rsp + 8*8 + 16*0..9, then 8 bytes of padding +%define STACK_SPACE 8*8 + 16*10 + 8 + +; SHA256_JOB* sha256_mb_mgr_submit_avx512(SHA256_MB_JOB_MGR *state, SHA256_JOB *job) +; arg 1 : rdi (elf64) / rcx (win64) : state +; arg 2 : rsi (elf64) / rdx (win64) : job +; Queues job in a free lane. Returns NULL in rax until num_lanes_inuse reaches 16; then it processes the shortest job and returns that completed job. +mk_global sha256_mb_mgr_submit_avx512, function +sha256_mb_mgr_submit_avx512: + endbranch + + sub rsp, STACK_SPACE + mov [rsp + 8*0], rbx + mov [rsp + 8*3], rbp + mov [rsp + 8*4], r12 ; NOTE(review): r12-r15 are not used in the code visible here -- presumably clobbered by sha256_mb_x16_avx512; confirm + mov [rsp + 8*5], r13 + mov [rsp + 8*6], r14 + mov [rsp + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 8*1], rsi + mov [rsp + 8*2], rdi + vmovdqa [rsp + 8*8 + 16*0], xmm6 + vmovdqa [rsp + 8*8 + 16*1], xmm7 + vmovdqa [rsp + 8*8 + 16*2], xmm8 + vmovdqa [rsp + 8*8 + 16*3], xmm9 + vmovdqa [rsp + 8*8 + 16*4], xmm10 + vmovdqa [rsp + 8*8 + 16*5], xmm11 + vmovdqa [rsp + 8*8 + 16*6], xmm12 + vmovdqa [rsp + 8*8 + 16*7], xmm13 + vmovdqa [rsp + 8*8 + 16*8], xmm14 + vmovdqa [rsp + 8*8 + 16*9], xmm15 +%endif + ; Pop a free lane id from the low nibble of unused_lanes + mov unused_lanes, [state + _unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data,
lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + ; lens[lane] = (len << 4) | lane, so the minimum over lens also identifies its lane + shl len, 4 + or len, lane + mov [state + _lens + 4*lane], DWORD(len) + + mov [lane_data + _job_in_lane], job + + ; Load digest words from result_digest + ; and scatter them into args_digest: word k of every lane is stored in row k (4*16 bytes = 16 lanes per row) + vmovdqu xmm0, [job + _result_digest + 0*16] + vmovdqu xmm1, [job + _result_digest + 1*16] + vmovd [state + _args_digest + 4*lane + 0*4*16], xmm0 + vpextrd [state + _args_digest + 4*lane + 1*4*16], xmm0, 1 + vpextrd [state + _args_digest + 4*lane + 2*4*16], xmm0, 2 + vpextrd [state + _args_digest + 4*lane + 3*4*16], xmm0, 3 + vmovd [state + _args_digest + 4*lane + 4*4*16], xmm1 + vpextrd [state + _args_digest + 4*lane + 5*4*16], xmm1, 1 + vpextrd [state + _args_digest + 4*lane + 6*4*16], xmm1, 2 + vpextrd [state + _args_digest + 4*lane + 7*4*16], xmm1, 3 + + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + ; r11 is reused from here on: p is dead after the store above and the register now holds num_lanes_inuse + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + add num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + cmp num_lanes_inuse, 16 ; all 16 nibbles of unused_lanes are lane ids (no sentinel), so fullness is tested on the counter + jne return_null + +start_loop: + ; Find min length: ymm0 holds the lengths of the first 8 lanes, ymm1 those of the last 8 + ; (x in the lane diagrams below marks a dword whose value does not matter) + vmovdqu ymm0, [state + _lens + 0*32] + vmovdqu ymm1, [state + _lens + 1*32] + + vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2} + vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2} + vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x,C3,x,x, x,G3} (128-bit halves of ymm2 swapped) + vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword + + vmovd DWORD(idx), xmm2 + mov len2, idx + and idx, 0xF ; idx = lane holding the shortest job + shr len2, 4 ; len2 = shortest length, handed to the hash routine in arg2 + jz len_is_0 ; nothing left to hash for that lane: complete it directly + + ; Broadcast the minimum with its lane nibble cleared and subtract it from every lane length + vpand ymm2, ymm2, [rel clear_low_nibble] + vpshufd ymm2, ymm2, 0 + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens +
0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_mb_x16_avx512 + + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + ; Push the freed lane id back onto the unused_lanes nibble stack + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + ; Gather the eight digest words of lane idx (one word per 64-byte row) into xmm0/xmm1 + vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16] + vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1 + vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2 + vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3 + vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16] + vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1 + vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2 + vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 8*8 + 16*0] + vmovdqa xmm7, [rsp + 8*8 + 16*1] + vmovdqa xmm8, [rsp + 8*8 + 16*2] + vmovdqa xmm9, [rsp + 8*8 + 16*3] + vmovdqa xmm10, [rsp + 8*8 + 16*4] + vmovdqa xmm11, [rsp + 8*8 + 16*5] + vmovdqa xmm12, [rsp + 8*8 + 16*6] + vmovdqa xmm13, [rsp + 8*8 + 16*7] + vmovdqa xmm14, [rsp + 8*8 + 16*8] + vmovdqa xmm15, [rsp + 8*8 + 16*9] + mov rsi, [rsp + 8*1] + mov rdi, [rsp + 8*2] +%endif + mov rbx, [rsp + 8*0] + mov rbp, [rsp + 8*3] + mov r12, [rsp + 8*4] + mov r13, [rsp + 8*5] + mov r14, [rsp + 8*6] + mov r15, [rsp + 8*7] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax ; return NULL: job queued, nothing completed + jmp return + +section .data align=32 + +align 32
+; Mask that keeps only the low dword of each 128-bit half and clears its low (lane id) nibble +clear_low_nibble: + dq 0x00000000FFFFFFF0, 0x0000000000000000 + dq 0x00000000FFFFFFF0, 0x0000000000000000 + +%else +; Assembler without AVX512 support: on win64 only a placeholder global symbol is emitted (presumably so the object file is not empty -- confirm) +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha256_mb_mgr_submit_avx512 +no_sha256_mb_mgr_submit_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm new file mode 100644 index 000000000..b1bbc7002 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm @@ -0,0 +1,261 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_job.asm" +%include "sha256_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha256_mb_x4_sse + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE 16*10 +%define _GPR_SAVE 8*5 +%define STACK_SPACE _GPR_SAVE + _XMM_SAVE + +; SHA256_JOB* sha256_mb_mgr_submit_sse(SHA256_MB_JOB_MGR *state, SHA256_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +mk_global sha256_mb_mgr_submit_sse, function 
+sha256_mb_mgr_submit_sse:	; put `job` into a free lane of `state`; job_rax ends up holding a completed job or 0
+	endbranch
+	; Prologue: reserve STACK_SPACE and spill rbx, rbp and r12 into the save area
+	sub	rsp, STACK_SPACE
+	mov	[rsp + _XMM_SAVE + 8*0], rbx
+	mov	[rsp + _XMM_SAVE + 8*1], rbp
+	mov	[rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	[rsp + _XMM_SAVE + 8*3], rsi	; win64 output only: rsi, rdi and xmm6-xmm15 are spilled as well
+	mov	[rsp + _XMM_SAVE + 8*4], rdi
+	movdqa	[rsp + 16*0], xmm6
+	movdqa	[rsp + 16*1], xmm7
+	movdqa	[rsp + 16*2], xmm8
+	movdqa	[rsp + 16*3], xmm9
+	movdqa	[rsp + 16*4], xmm10
+	movdqa	[rsp + 16*5], xmm11
+	movdqa	[rsp + 16*6], xmm12
+	movdqa	[rsp + 16*7], xmm13
+	movdqa	[rsp + 16*8], xmm14
+	movdqa	[rsp + 16*9], xmm15
+%endif
+	; Take the next free lane: its index is the low nibble of unused_lanes
+	mov	unused_lanes, [state + _unused_lanes]
+	movzx	lane, BYTE(unused_lanes)
+	and	lane, 0xF
+	shr	unused_lanes, 4	; remove that nibble from the free list
+	imul	lane_data, lane, _LANE_DATA_size
+	mov	dword [job + _status], STS_BEING_PROCESSED
+	lea	lane_data, [state + _ldata + lane_data]	; lane_data = address of this lane's entry inside _ldata
+	mov	[state + _unused_lanes], unused_lanes
+	mov	DWORD(len), [job + _len]
+	; Pack the lane index into the low 4 bits so the min search below yields length and lane together
+	shl	len, 4
+	or	len, lane
+	; Record the job pointer and the packed length for this lane
+	mov	[lane_data + _job_in_lane], job
+	mov	[state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest
+	movdqa	xmm0, [job + _result_digest + 0*16]
+	movdqa	xmm1, [job + _result_digest + 1*16]
+	movd	[state + _args_digest + 4*lane + 0*16], xmm0	; digest word k is stored at row k (16-byte stride), 4-byte column `lane`
+	pextrd	[state + _args_digest + 4*lane + 1*16], xmm0, 1
+	pextrd	[state + _args_digest + 4*lane + 2*16], xmm0, 2
+	pextrd	[state + _args_digest + 4*lane + 3*16], xmm0, 3
+	movd	[state + _args_digest + 4*lane + 4*16], xmm1
+	pextrd	[state + _args_digest + 4*lane + 5*16], xmm1, 1
+	pextrd	[state + _args_digest + 4*lane + 6*16], xmm1, 2
+	pextrd	[state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+	; Store the job's input buffer pointer in this lane's data-pointer slot
+	mov	p, [job + _buffer]
+	mov	[state + _args_data_ptr + 8*lane], p
+
+	add	dword [state + _num_lanes_inuse], 1
+	cmp	unused_lanes, 0xF	; NOTE(review): 0xF looks like the free-list terminator (no free lane left) - confirm against the manager init code
+	jne	return_null	; otherwise return 0 without hashing anything
+
+start_loop:
+	; Find min length
+	mov	DWORD(lens0), [state + _lens + 0*4]
+	mov	idx, lens0
+	mov	DWORD(lens1), [state + _lens + 1*4]
+	cmp	lens1, idx
+	cmovb	idx, lens1
+	mov	DWORD(lens2), [state + _lens + 2*4]
+	cmp	lens2, idx
+	cmovb	idx, lens2
+	mov	DWORD(lens3), [state + _lens + 3*4]
+	cmp	lens3, idx
+	cmovb	idx, lens3
+	mov	len2, idx
+	and	idx, 0xF	; idx = lane index carried in the smallest packed entry
+	and	len2, ~0xF	; len2 = that entry's length part, still shifted left by 4
+	jz	len_is_0	; zero length: skip the hashing call
+
+	sub	lens0, len2	; take the common minimum off every lane's packed length
+	sub	lens1, len2
+	sub	lens2, len2
+	sub	lens3, len2
+	shr	len2, 4	; undo the packing shift before handing the length on
+	mov	[state + _lens + 0*4], DWORD(lens0)
+	mov	[state + _lens + 1*4], DWORD(lens1)
+	mov	[state + _lens + 2*4], DWORD(lens2)
+	mov	[state + _lens + 3*4], DWORD(lens3)
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha256_mb_x4_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	qword [lane_data + _job_in_lane], 0	; the lane no longer owns a job
+	mov	dword [job_rax + _status], STS_COMPLETED
+	shl	unused_lanes, 4	; push lane idx back onto the free list
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes], unused_lanes
+
+	sub	dword [state + _num_lanes_inuse], 1
+	; Gather the lane's eight digest words back into the job's result_digest
+	movd	xmm0, [state + _args_digest + 4*idx + 0*16]
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	movd	xmm1, [state + _args_digest + 4*idx + 4*16]
+	pinsrd	xmm1, [state + _args_digest + 4*idx + 5*16], 1
+	pinsrd	xmm1, [state + _args_digest + 4*idx + 6*16], 2
+	pinsrd	xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+	movdqa	[job_rax + _result_digest + 0*16], xmm0
+	movdqa	[job_rax + _result_digest + 1*16], xmm1
+
+return:
+	; Epilogue: restore everything spilled in the prologue and release the stack area
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa	xmm6, [rsp + 16*0]
+	movdqa	xmm7, [rsp + 16*1]
+	movdqa	xmm8, [rsp + 16*2]
+	movdqa	xmm9, [rsp + 16*3]
+	movdqa	xmm10, [rsp + 16*4]
+	movdqa	xmm11, [rsp + 16*5]
+	movdqa	xmm12, [rsp + 16*6]
+	movdqa	xmm13, [rsp + 16*7]
+	movdqa	xmm14, [rsp + 16*8]
+	movdqa	xmm15, [rsp + 16*9]
+	mov	rsi, [rsp + _XMM_SAVE + 8*3]
+	mov	rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+	mov	rbx, [rsp + _XMM_SAVE + 8*0]
+	mov	rbp, [rsp + _XMM_SAVE + 8*1]
+	mov	r12, [rsp + _XMM_SAVE + 8*2]
+	add	rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax	; no job completed on this call: result is 0
+	jmp	return
+
+
+section .data align=16
+	; H0..H7 hold the standard SHA-256 initial hash values; nothing in this function references them
+align 16
+H0:	dd 0x6a09e667
+H1:	dd 0xbb67ae85
+H2:	dd 0x3c6ef372
+H3:	dd 0xa54ff53a
+H4:	dd 0x510e527f
+H5:	dd 0x9b05688c
+H6:	dd 0x1f83d9ab
+H7:	dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm
new file mode 100644
index 000000000..cb1dce641
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm
@@ -0,0 +1,301 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha256_mb_x4_sse
+extern sha256_ni_x2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1	rdi ; rcx
+%define arg2	rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx		rdx ; rsi
+%define last_len	rdx ; rsi
+
+%define size_offset	rcx ; rdi
+%define tmp2		rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1	rcx
+%define arg2	rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len	rsi
+%define idx		rsi
+
+%define size_offset	rdi
+%define tmp2		rdi
+
+%endif
+
+; Common definitions
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+%define p2	arg2
+
+%define p		r11
+%define start_offset	r11
+
+%define unused_lanes	rbx
+
+%define job_rax		rax
+%define len		rax
+
+%define lane		rbp
+%define tmp3		rbp
+%define lens3		rbp
+
+%define extra_blocks	r8
+%define lens0		r8
+
+%define tmp		r9
+%define lens1		r9
+
+%define lane_data	r10
+%define lens2		r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE	16*10
+%define _GPR_SAVE	8*7
+%define STACK_SPACE	_GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_sse_ni(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : state (rdi for elf64 output, rcx otherwise - see the arg1 defines above)
+; arg 2 : job   (rsi for elf64 output, rdx otherwise - see the arg2 defines above)
+;
+; Places `job` in the next free lane of `state`.  Returns in rax (job_rax)
+; the job that completed during this call, or 0 when the call only queued
+; the job and nothing was hashed.
+mk_global sha256_mb_mgr_submit_sse_ni, function
+sha256_mb_mgr_submit_sse_ni:
+	endbranch
+
+	; Prologue: reserve the save area and spill the registers restored at `return`
+	sub	rsp, STACK_SPACE
+	mov	[rsp + _XMM_SAVE + 8*0], rbx
+	mov	[rsp + _XMM_SAVE + 8*1], rbp
+	mov	[rsp + _XMM_SAVE + 8*2], r12
+	mov	[rsp + _XMM_SAVE + 8*5], r13
+	mov	[rsp + _XMM_SAVE + 8*6], r14
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	[rsp + _XMM_SAVE + 8*3], rsi
+	mov	[rsp + _XMM_SAVE + 8*4], rdi
+	movdqa	[rsp + 16*0], xmm6
+	movdqa	[rsp + 16*1], xmm7
+	movdqa	[rsp + 16*2], xmm8
+	movdqa	[rsp + 16*3], xmm9
+	movdqa	[rsp + 16*4], xmm10
+	movdqa	[rsp + 16*5], xmm11
+	movdqa	[rsp + 16*6], xmm12
+	movdqa	[rsp + 16*7], xmm13
+	movdqa	[rsp + 16*8], xmm14
+	movdqa	[rsp + 16*9], xmm15
+%endif
+
+	; Take the next free lane: its index is the low nibble of unused_lanes
+	mov	unused_lanes, [state + _unused_lanes]
+	movzx	lane, BYTE(unused_lanes)
+	and	lane, 0xF
+	shr	unused_lanes, 4
+	imul	lane_data, lane, _LANE_DATA_size
+	mov	dword [job + _status], STS_BEING_PROCESSED
+	lea	lane_data, [state + _ldata + lane_data]
+	mov	[state + _unused_lanes], unused_lanes
+	mov	DWORD(len), [job + _len]
+
+	; Pack the lane index into the low 4 bits of the length so that the
+	; min search below yields the shortest length and its lane together
+	shl	len, 4
+	or	len, lane
+
+	mov	[lane_data + _job_in_lane], job
+	mov	[state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest
+	; (word k of this lane is stored at row k, 16-byte stride, 4-byte column `lane`)
+	movdqa	xmm0, [job + _result_digest + 0*16]
+	movdqa	xmm1, [job + _result_digest + 1*16]
+	movd	[state + _args_digest + 4*lane + 0*16], xmm0
+	pextrd	[state + _args_digest + 4*lane + 1*16], xmm0, 1
+	pextrd	[state + _args_digest + 4*lane + 2*16], xmm0, 2
+	pextrd	[state + _args_digest + 4*lane + 3*16], xmm0, 3
+	movd	[state + _args_digest + 4*lane + 4*16], xmm1
+	pextrd	[state + _args_digest + 4*lane + 5*16], xmm1, 1
+	pextrd	[state + _args_digest + 4*lane + 6*16], xmm1, 2
+	pextrd	[state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+	mov	p, [job + _buffer]
+	mov	[state + _args_data_ptr + 8*lane], p
+
+	add	dword [state + _num_lanes_inuse], 1
+
+	; compare with shani-sb threshold, if num_lanes_sse <= threshold, using shani func
+	%if SHA256_NI_SB_THRESHOLD_SSE >= 4	; there are 4 lanes in sse mb
+		; The two-lane gate belongs to the SHA-NI path only.  It used to sit
+		; in front of the %if, where it also guarded the %else branch: there
+		; unused_lanes had to equal 0xF32 and 0xF at once, so that branch
+		; could never get past its own check and no job was ever hashed.
+		cmp	unused_lanes, 0xF32	; we will process two jobs at the same time
+		jne	return_null		; wait for another sha_ni job
+
+		; shani glue code
+		mov	DWORD(lens0), [state + _lens + 0*4]
+		mov	idx, lens0
+		mov	DWORD(lens1), [state + _lens + 1*4]
+		cmp	lens1, idx
+		cmovb	idx, lens1
+		mov	len2, idx
+		and	idx, 0xF
+		and	len2, ~0xF
+		jz	len_is_0
+		; lensN-len2=idx
+		sub	lens0, len2
+		sub	lens1, len2
+
+		shr	len2, 4
+		mov	[state + _lens + 0*4], DWORD(lens0)
+		mov	[state + _lens + 1*4], DWORD(lens1)
+		mov	r10, idx
+		or	r10, 0x1000	; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+		; "state" and "args" are the same address, arg1
+		; len is arg2, idx and nlane in r10
+		call	sha256_ni_x2
+		; state and idx are intact
+	%else
+		; original mb code
+		cmp	unused_lanes, 0xF
+		jne	return_null
+
+	start_loop:
+		; Find min length
+		mov	DWORD(lens0), [state + _lens + 0*4]
+		mov	idx, lens0
+		mov	DWORD(lens1), [state + _lens + 1*4]
+		cmp	lens1, idx
+		cmovb	idx, lens1
+		mov	DWORD(lens2), [state + _lens + 2*4]
+		cmp	lens2, idx
+		cmovb	idx, lens2
+		mov	DWORD(lens3), [state + _lens + 3*4]
+		cmp	lens3, idx
+		cmovb	idx, lens3
+		mov	len2, idx
+		and	idx, 0xF
+		and	len2, ~0xF
+		jz	len_is_0
+
+		sub	lens0, len2
+		sub	lens1, len2
+		sub	lens2, len2
+		sub	lens3, len2
+		shr	len2, 4
+		mov	[state + _lens + 0*4], DWORD(lens0)
+		mov	[state + _lens + 1*4], DWORD(lens1)
+		mov	[state + _lens + 2*4], DWORD(lens2)
+		mov	[state + _lens + 3*4], DWORD(lens3)
+
+		; "state" and "args" are the same address, arg1
+		; len is arg2
+		call	sha256_mb_x4_sse
+		; state and idx are intact
+	%endif
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+	; push lane idx back onto the free list
+	shl	unused_lanes, 4
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes], unused_lanes
+
+	sub	dword [state + _num_lanes_inuse], 1
+
+	; Gather the lane's eight digest words back into the job's result_digest
+	movd	xmm0, [state + _args_digest + 4*idx + 0*16]
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 1*16], 1
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 2*16], 2
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 3*16], 3
+	movd	xmm1, [state + _args_digest + 4*idx + 4*16]
+	pinsrd	xmm1, [state + _args_digest + 4*idx + 5*16], 1
+	pinsrd	xmm1, [state + _args_digest + 4*idx + 6*16], 2
+	pinsrd	xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+	movdqa	[job_rax + _result_digest + 0*16], xmm0
+	movdqa	[job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa	xmm6, [rsp + 16*0]
+	movdqa	xmm7, [rsp + 16*1]
+	movdqa	xmm8, [rsp + 16*2]
+	movdqa	xmm9, [rsp + 16*3]
+	movdqa	xmm10, [rsp + 16*4]
+	movdqa	xmm11, [rsp + 16*5]
+	movdqa	xmm12, [rsp + 16*6]
+	movdqa	xmm13, [rsp + 16*7]
+	movdqa	xmm14, [rsp + 16*8]
+	movdqa	xmm15, [rsp + 16*9]
+	mov	rsi, [rsp + _XMM_SAVE + 8*3]
+	mov	rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+	mov	rbx, [rsp + _XMM_SAVE + 8*0]
+	mov	rbp, [rsp + _XMM_SAVE + 8*1]
+	mov	r12, [rsp + _XMM_SAVE + 8*2]
+	mov	r13, [rsp + _XMM_SAVE + 8*5]
+	mov	r14, [rsp + _XMM_SAVE + 8*6]
+	add	rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	; nothing completed on this call: return 0
+	xor	job_rax, job_rax
+	jmp	return
+
+section .data align=16
+
+align 16
+H0:	dd 0x6a09e667
+H1:	dd 0xbb67ae85
+H2:	dd 0x3c6ef372
+H3:	dd 0xa54ff53a
+H4:	dd 0x510e527f
+H5:	dd 0x9b05688c
+H6:	dd 0x1f83d9ab
+H7:	dd 0x5be0cd19
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_submit_sse_ni
+ no_sha256_mb_mgr_submit_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c
new file mode 100644
index 000000000..768bfca78
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c
@@ -0,0 +1,160 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN  (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+// Fills buf[0..buffer_size) with pseudo-random bytes taken from rand()
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+
+/*
+ * Hashes random buffers with both OpenSSL SHA256() and the multi-buffer
+ * manager and compares the digests word by word: first TEST_BUFS buffers of
+ * TEST_LEN bytes, then RANDOMS rounds with a random job count and random
+ * lengths.
+ *
+ * Returns 0 when every digest matches and 1 otherwise.  The failure count
+ * itself is not returned: only the low 8 bits of an exit status reach the
+ * parent, so a count that is a multiple of 256 would look like success.
+ */
+int main(void)
+{
+	SHA256_HASH_CTX_MGR *mgr = NULL;
+	SHA256_HASH_CTX ctxpool[TEST_BUFS];
+	unsigned char *bufs[TEST_BUFS] = { NULL };
+	void *mgr_mem = NULL;
+	uint32_t i, j, fail = 0;
+	uint32_t lens[TEST_BUFS];
+	unsigned int jobs, t;
+	int ret;
+
+	printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+	       TEST_LEN);
+
+	srand(TEST_SEED);
+
+	// posix_memalign() wants a void **; hand it a real void * instead of
+	// the casted address of a typed pointer.
+	ret = posix_memalign(&mgr_mem, 16, sizeof(SHA256_HASH_CTX_MGR));
+	if ((ret != 0) || (mgr_mem == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+	mgr = (SHA256_HASH_CTX_MGR *) mgr_mem;
+
+	sha256_ctx_mgr_init(mgr);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			fail = 1;
+			goto cleanup;
+		}
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// SSL test
+		SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+
+		// sb_sha256 test
+		sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+	}
+
+	while (sha256_ctx_mgr_flush(mgr)) ;
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] !=
+			    to_be32(((uint32_t *) digest_ssl[i])[j])) {
+				fail++;
+				printf("Test%u, digest%u fail %08X <=> %08X\n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       to_be32(((uint32_t *) digest_ssl[i])[j]));
+			}
+		}
+	}
+	putchar('.');
+
+	// Run tests with random size and number of jobs
+	for (t = 0; t < RANDOMS; t++) {
+		jobs = rand() % (TEST_BUFS);
+
+		sha256_ctx_mgr_init(mgr);
+
+		for (i = 0; i < jobs; i++) {
+			// Random buffer with random len and contents
+			lens[i] = rand() % (TEST_LEN);
+			rand_buffer(bufs[i], lens[i]);
+
+			// Run SSL test
+			SHA256(bufs[i], lens[i], digest_ssl[i]);
+
+			// Run sb_sha256 test
+			sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+		}
+
+		while (sha256_ctx_mgr_flush(mgr)) ;
+
+		for (i = 0; i < jobs; i++) {
+			for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+				if (ctxpool[i].job.result_digest[j] !=
+				    to_be32(((uint32_t *) digest_ssl[i])[j])) {
+					fail++;
+					printf("Test%u, digest%u fail %08X <=> %08X\n",
+					       i, j, ctxpool[i].job.result_digest[j],
+					       to_be32(((uint32_t *) digest_ssl[i])[j]));
+				}
+			}
+		}
+		if (fail) {
+			printf("Test failed function check %u\n", fail);
+			goto cleanup;
+		}
+
+		putchar('.');
+		fflush(0);
+	}			// random test t
+
+	if (fail)
+		printf("Test failed function check %u\n", fail);
+	else
+		printf(" multibinary_sha256_ssl rand: Pass\n");
+
+cleanup:
+	// free(NULL) is a no-op, so slots that were never allocated are fine
+	for (i = 0; i < TEST_BUFS; i++)
+		free(bufs[i]);
+	free(mgr);
+
+	return fail ? 1 : 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c
new file mode 100644
index 000000000..adba77f3d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c
@@ -0,0 +1,203 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN  (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Fills buf[0..buffer_size) with pseudo-random bytes taken from rand()
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+
+/*
+ * Compares the multi-buffer manager against sha256_ref() in three phases:
+ * TEST_BUFS buffers of TEST_LEN bytes, RANDOMS rounds with a random job
+ * count and random lengths, and a set of jobs that each run up to the last
+ * byte of one exactly-sized allocation.
+ *
+ * Returns 0 when every digest matches and 1 otherwise.  The failure count
+ * itself is not returned: only the low 8 bits of an exit status reach the
+ * parent, so a count that is a multiple of 256 would look like success.
+ */
+int main(void)
+{
+	SHA256_HASH_CTX_MGR *mgr = NULL;
+	SHA256_HASH_CTX ctxpool[TEST_BUFS];
+	uint32_t i, j, fail = 0;
+	unsigned char *bufs[TEST_BUFS] = { NULL };
+	uint32_t lens[TEST_BUFS];
+	unsigned int jobs, t;
+	uint8_t *tmp_buf = NULL;
+	void *mgr_mem = NULL;
+	int ret;
+
+	printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+	       TEST_LEN);
+
+	// posix_memalign() wants a void **; hand it a real void * instead of
+	// the casted address of a typed pointer.
+	ret = posix_memalign(&mgr_mem, 16, sizeof(SHA256_HASH_CTX_MGR));
+	if ((ret != 0) || (mgr_mem == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+	mgr = (SHA256_HASH_CTX_MGR *) mgr_mem;
+
+	sha256_ctx_mgr_init(mgr);
+
+	srand(TEST_SEED);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			fail = 1;
+			goto cleanup;
+		}
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contexts
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Run reference test
+		sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+		// Run sb_sha256 test
+		sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+	}
+
+	while (sha256_ctx_mgr_flush(mgr)) ;
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				printf("Test%u fixed size, digest%u "
+				       "fail 0x%08X <=> 0x%08X \n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       digest_ref[i][j]);
+			}
+		}
+	}
+
+	if (fail) {
+		printf("Test failed function check %u\n", fail);
+		goto cleanup;
+	}
+	// Run tests with random size and number of jobs
+	for (t = 0; t < RANDOMS; t++) {
+		jobs = rand() % (TEST_BUFS);
+
+		sha256_ctx_mgr_init(mgr);
+
+		for (i = 0; i < jobs; i++) {
+			// Use buffer with random len and contents
+			lens[i] = rand() % (TEST_LEN);
+			rand_buffer(bufs[i], lens[i]);
+
+			// Run reference test
+			sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+			// Run sha256_mb test
+			sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+		}
+
+		while (sha256_ctx_mgr_flush(mgr)) ;
+
+		for (i = 0; i < jobs; i++) {
+			for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+				if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+					fail++;
+					printf("Test%u, digest%u fail "
+					       "0x%08X <=> 0x%08X\n",
+					       i, j, ctxpool[i].job.result_digest[j],
+					       digest_ref[i][j]);
+				}
+			}
+		}
+		if (fail) {
+			printf("Test failed function check %u\n", fail);
+			goto cleanup;
+		}
+
+		putchar('.');
+		fflush(0);
+	}			// random test t
+
+	// Test at the end of buffer
+	jobs = rand() % TEST_BUFS;
+	// jobs may be 0, and malloc(0) is allowed to return NULL; ask for at
+	// least one byte so that NULL always means a real allocation failure.
+	tmp_buf = (uint8_t *) malloc(jobs ? sizeof(uint8_t) * jobs : 1);
+	if (!tmp_buf) {
+		printf("malloc failed, end test aborted.\n");
+		fail = 1;
+		goto cleanup;
+	}
+
+	rand_buffer(tmp_buf, jobs);
+
+	sha256_ctx_mgr_init(mgr);
+
+	// Extend to the end of allocated buffer to construct jobs
+	for (i = 0; i < jobs; i++) {
+		// Use a local pointer: bufs[] still owns the big buffers and
+		// must keep pointing at them so they can be freed below.
+		uint8_t *end_buf = &tmp_buf[i];
+
+		lens[i] = jobs - i;
+
+		// Reference test
+		sha256_ref(end_buf, digest_ref[i], lens[i]);
+
+		// sb_sha256 test
+		sha256_ctx_mgr_submit(mgr, &ctxpool[i], end_buf, lens[i], HASH_ENTIRE);
+	}
+
+	while (sha256_ctx_mgr_flush(mgr)) ;
+
+	for (i = 0; i < jobs; i++) {
+		for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				printf("End test failed at offset %u - result: 0x%08X"
+				       ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+				       digest_ref[i][j]);
+			}
+		}
+	}
+
+	putchar('.');
+
+	if (fail)
+		printf("Test failed function check %u\n", fail);
+	else
+		printf(" multibinary_sha256 rand: Pass\n");
+
+cleanup:
+	// free(NULL) is a no-op, so pointers that were never allocated are fine
+	free(tmp_buf);
+	for (i = 0; i < TEST_BUFS; i++)
+		free(bufs[i]);
+	free(mgr);
+
+	return fail ? 1 : 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c
new file mode 100644
index 000000000..9535d80df
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c
@@ -0,0 +1,300 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN  (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Parenthesised so the macro expands safely inside any larger expression
+#define UPDATE_SIZE (13*SHA256_BLOCK_SIZE)
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA256_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Fills buf[0..buffer_size) with pseudo-random bytes taken from rand()
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+
+/*
+ * Exercises the FIRST/UPDATE/LAST submission path of the multi-buffer
+ * manager and compares every digest with sha256_ref(): first TEST_BUFS
+ * buffers of TEST_LEN bytes fed in UPDATE_SIZE pieces, then RANDOMS rounds
+ * with a random job count, random lengths and random update sizes.
+ *
+ * Returns 0 when every digest matches and 1 otherwise.  The failure count
+ * itself is not returned: only the low 8 bits of an exit status reach the
+ * parent, so a count that is a multiple of 256 would look like success.
+ */
+int main(void)
+{
+	SHA256_HASH_CTX_MGR *mgr = NULL;
+	SHA256_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+	uint32_t i, j, fail = 0;
+	int len_done, len_rem, len_rand;
+	unsigned char *bufs[TEST_BUFS] = { NULL };
+	unsigned char *buf_ptr[TEST_BUFS];
+	uint32_t lens[TEST_BUFS];
+	unsigned int joblen, jobs, t;
+	void *mgr_mem = NULL;
+	int ret;
+
+	printf("multibinary_sha256_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+	       TEST_LEN);
+
+	srand(TEST_SEED);
+
+	// posix_memalign() wants a void **; hand it a real void * instead of
+	// the casted address of a typed pointer.
+	ret = posix_memalign(&mgr_mem, 16, sizeof(SHA256_HASH_CTX_MGR));
+	if ((ret != 0) || (mgr_mem == NULL)) {
+		printf("posix_memalign failed test aborted\n");
+		return 1;
+	}
+	mgr = (SHA256_HASH_CTX_MGR *) mgr_mem;
+
+	sha256_ctx_mgr_init(mgr);
+
+	for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+		bufs[i] = (unsigned char *)malloc(TEST_LEN);
+		if (bufs[i] == NULL) {
+			printf("malloc failed test aborted\n");
+			fail = 1;
+			goto cleanup;
+		}
+		buf_ptr[i] = bufs[i];
+		rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+		hash_ctx_init(&ctxpool[i]);
+		ctxpool[i].user_data = (void *)((uint64_t) i);
+
+		// Run reference test
+		sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+	}
+
+	// Run sb_sha256 tests
+	for (i = 0; i < TEST_BUFS;) {
+		len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+		len_rem = TEST_LEN - len_done;
+
+		if (len_done == 0)
+			ctx = sha256_ctx_mgr_submit(mgr,
+						    &ctxpool[i],
+						    buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+		else if (len_rem <= UPDATE_SIZE)
+			ctx = sha256_ctx_mgr_submit(mgr,
+						    &ctxpool[i],
+						    buf_ptr[i], len_rem, HASH_LAST);
+		else
+			ctx = sha256_ctx_mgr_submit(mgr,
+						    &ctxpool[i],
+						    buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		// Add jobs while available or finished
+		if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+			i++;
+			continue;
+		}
+		// Resubmit unfinished job
+		i = (unsigned long)(ctx->user_data);
+		buf_ptr[i] += UPDATE_SIZE;
+	}
+
+	// Start flushing finished jobs, end on last flushed
+	ctx = sha256_ctx_mgr_flush(mgr);
+	while (ctx) {
+		if (hash_ctx_complete(ctx)) {
+			debug_char('-');
+			ctx = sha256_ctx_mgr_flush(mgr);
+			continue;
+		}
+		// Resubmit unfinished job
+		i = (unsigned long)(ctx->user_data);
+		buf_ptr[i] += UPDATE_SIZE;
+
+		len_done = (int)((unsigned long)buf_ptr[i]
+				 - (unsigned long)bufs[i]);
+		len_rem = TEST_LEN - len_done;
+
+		if (len_rem <= UPDATE_SIZE)
+			ctx = sha256_ctx_mgr_submit(mgr,
+						    &ctxpool[i],
+						    buf_ptr[i], len_rem, HASH_LAST);
+		else
+			ctx = sha256_ctx_mgr_submit(mgr,
+						    &ctxpool[i],
+						    buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		if (ctx == NULL)
+			ctx = sha256_ctx_mgr_flush(mgr);
+	}
+
+	// Check digests
+	for (i = 0; i < TEST_BUFS; i++) {
+		for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+			if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+				fail++;
+				// The trailing newline was missing, so successive
+				// failure reports ran together on one line.
+				printf("Test%u fixed size, digest%u fail %8X <=> %8X\n",
+				       i, j, ctxpool[i].job.result_digest[j],
+				       digest_ref[i][j]);
+			}
+		}
+	}
+	putchar('.');
+
+	// Run tests with random size and number of jobs
+	for (t = 0; t < RANDOMS; t++) {
+		jobs = rand() % (TEST_BUFS);
+
+		for (i = 0; i < jobs; i++) {
+			joblen = rand() % (TEST_LEN);
+			rand_buffer(bufs[i], joblen);
+			lens[i] = joblen;
+			buf_ptr[i] = bufs[i];
+			sha256_ref(bufs[i], digest_ref[i], lens[i]);
+		}
+
+		sha256_ctx_mgr_init(mgr);
+
+		// Run sha256_sb jobs
+		i = 0;
+		while (i < jobs) {
+			// Submit a new job
+			len_rand = SHA256_BLOCK_SIZE +
+			    SHA256_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+			// len_rand is at least one block here, so the cast only
+			// makes the existing unsigned comparison explicit
+			if (lens[i] > (uint32_t) len_rand)
+				ctx = sha256_ctx_mgr_submit(mgr,
+							    &ctxpool[i],
+							    buf_ptr[i], len_rand, HASH_FIRST);
+			else
+				ctx = sha256_ctx_mgr_submit(mgr,
+							    &ctxpool[i],
+							    buf_ptr[i], lens[i], HASH_ENTIRE);
+
+			// Returned ctx could be:
+			//  - null context (we are just getting started and lanes aren't full yet), or
+			//  - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+			//  - an unfinished ctx, we will resubmit
+
+			if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+				i++;
+				continue;
+			} else {
+				// unfinished ctx returned, choose another random update length and submit either
+				// UPDATE or LAST depending on the amount of buffer remaining
+				while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+					j = (unsigned long)(ctx->user_data);	// Get index of the returned ctx
+					buf_ptr[j] = bufs[j] + ctx->total_length;
+					len_rand = (rand() % SHA256_BLOCK_SIZE)
+					    * (rand() % MAX_RAND_UPDATE_BLOCKS);
+					len_rem = lens[j] - ctx->total_length;
+
+					if (len_rem <= len_rand)	// submit the rest of the job as LAST
+						ctx = sha256_ctx_mgr_submit(mgr,
+									    &ctxpool[j],
+									    buf_ptr[j],
+									    len_rem,
+									    HASH_LAST);
+					else	// submit the random update length as UPDATE
+						ctx = sha256_ctx_mgr_submit(mgr,
+									    &ctxpool[j],
+									    buf_ptr[j],
+									    len_rand,
+									    HASH_UPDATE);
+				}	// Either continue submitting any contexts returned here as UPDATE/LAST, or
+				// go back to submitting new jobs using the index i.
+
+				i++;
+			}
+		}
+
+		// Start flushing finished jobs, end on last flushed
+		ctx = sha256_ctx_mgr_flush(mgr);
+		while (ctx) {
+			if (hash_ctx_complete(ctx)) {
+				debug_char('-');
+				ctx = sha256_ctx_mgr_flush(mgr);
+				continue;
+			}
+			// Resubmit unfinished job
+			i = (unsigned long)(ctx->user_data);
+			buf_ptr[i] = bufs[i] + ctx->total_length;	// update buffer pointer
+			len_rem = lens[i] - ctx->total_length;
+			len_rand = (rand() % SHA256_BLOCK_SIZE)
+			    * (rand() % MAX_RAND_UPDATE_BLOCKS);
+			debug_char('+');
+			if (len_rem <= len_rand)
+				ctx = sha256_ctx_mgr_submit(mgr,
+							    &ctxpool[i],
+							    buf_ptr[i], len_rem, HASH_LAST);
+			else
+				ctx = sha256_ctx_mgr_submit(mgr,
+							    &ctxpool[i],
+							    buf_ptr[i], len_rand, HASH_UPDATE);
+
+			if (ctx == NULL)
+				ctx = sha256_ctx_mgr_flush(mgr);
+		}
+
+		// Check result digest
+		for (i = 0; i < jobs; i++) {
+			for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+				if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+					fail++;
+					printf("Test%u, digest%u fail %8X <=> %8X\n",
+					       i, j, ctxpool[i].job.result_digest[j],
+					       digest_ref[i][j]);
+				}
+			}
+		}
+		if (fail) {
+			printf("Test failed function check %u\n", fail);
+			goto cleanup;
+		}
+
+		putchar('.');
+		fflush(0);
+	}			// random test t
+
+	if (fail)
+		printf("Test failed function check %u\n", fail);
+	else
+		printf(" multibinary_sha256_update rand: Pass\n");
+
+cleanup:
+	// free(NULL) is a no-op, so slots that were never allocated are fine
+	for (i = 0; i < TEST_BUFS; i++)
+		free(bufs[i]);
+	free(mgr);
+
+	return fail ? 1 : 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c
new file mode 100644
index 000000000..8a5b5a9b2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c
@@ -0,0 +1,241 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "sha256_mb.h" + +typedef uint32_t DigestSHA256[SHA256_DIGEST_NWORDS]; + +#define MSGS 7 +#define NUM_JOBS 1000 + +// Maps a job index to a message index in [0, MSGS). The expansion is fully +// parenthesized so it stays correct when used inside a larger expression. +#define PSEUDO_RANDOM_NUM(seed) (((seed) * 5 + ((seed) * (seed)) / 64) % MSGS) + +static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"; +static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO"; +static uint8_t msg3[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<"; +static uint8_t msg4[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR"; +static uint8_t msg5[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?"; +static uint8_t msg6[] = + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU"; +static uint8_t msg7[] = ""; + +static DigestSHA256 expResultDigest1 = { 0x248D6A61, 0xD20638B8, 0xE5C02693, 0x0C3E6039, + 0xA33CE459, 0x64FF2167, 0xF6ECEDD4, 0x19DB06C1 +}; + +static DigestSHA256 expResultDigest2 = { 0xD9C2E699, 0x586B948F, 0x4022C799, 0x4FFE14C6, + 0x3A4E8E31, 0x2EE2AEE1, 0xEBE51BED, 0x85705CFD +}; + +static DigestSHA256 expResultDigest3 = { 0xE3057651, 0x81295681, 0x7ECF1791, 0xFF9A1619, + 0xB2BC5CAD, 0x2AC00018, 0x92AE489C, 0x48DD10B3 +}; + +static DigestSHA256 expResultDigest4 = { 0x0307DAA3, 0x7130A140, 0x270790F9, 0x95B71407, + 0x8EC752A6, 
0x084EC1F3, 0xBD873D79, 0x3FF78383 +}; + +static DigestSHA256 expResultDigest5 = { 0x679312F7, 0x2E18D599, 0x5F51BDC6, 0x4ED56AFD, + 0x9B5704D3, 0x4387E11C, 0xC2331089, 0x2CD45DAA +}; + +static DigestSHA256 expResultDigest6 = { 0x8B1767E9, 0x7BA7BBE5, 0xF9A6E8D9, 0x9996904F, + 0x3AF6562E, 0xA58AF438, 0x5D8D584B, 0x81C808CE +}; + +static DigestSHA256 expResultDigest7 = { 0xE3B0C442, 0x98FC1C14, 0x9AFBF4C8, 0x996FB924, + 0x27AE41E4, 0x649B934C, 0xA495991B, 0x7852B855 +}; + +static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 }; + +static uint32_t *expResultDigest[MSGS] = { + expResultDigest1, expResultDigest2, expResultDigest3, + expResultDigest4, expResultDigest5, expResultDigest6, + expResultDigest7 +}; + +// Functional test: hashes the MSGS known-answer messages once each, then NUM_JOBS +// jobs chosen with PSEUDO_RANDOM_NUM, checking every digest handed back by submit +// or flush. Returns 0 on success, 1 if the manager allocation fails, and -1 on a +// digest mismatch, a context error, or a wrong number of completed jobs. +int main(void) +{ + SHA256_HASH_CTX_MGR *mgr = NULL; + SHA256_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL; + uint32_t i, j, k, t, checked = 0; + uint32_t *good; + int ret; + + ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR)); + if ((ret != 0) || (mgr == NULL)) { + printf("posix_memalign failed test aborted\n"); + return 1; + } + + sha256_ctx_mgr_init(mgr); + + // Init contexts before first use + for (i = 0; i < MSGS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + for (i = 0; i < MSGS; i++) { + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." 
+ " Error code: %d\n", ctx->error); + return -1; + } + + } + } + + while (1) { + ctx = sha256_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d\n", ctx->error); + return -1; + } + } else { + break; + } + } + + // Every known-answer message must have come back from submit or flush + if (checked != MSGS) { + printf("only tested %d rather than %d\n", checked, MSGS); + return -1; + } + + // do larger test in pseudo-random order + + // Init contexts before first use + for (i = 0; i < NUM_JOBS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + checked = 0; + for (i = 0; i < NUM_JOBS; i++) { + j = PSEUDO_RANDOM_NUM(i); + ctx = sha256_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE); + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. Error code: %d\n", ctx->error); + return -1; + } + } + } + while (1) { + ctx = sha256_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." 
+ " Error code: %d\n", ctx->error); + return -1; + } + } else { + break; + } + } + + if (checked != NUM_JOBS) { + printf("only tested %d rather than %d\n", checked, NUM_JOBS); + return -1; + } + + printf(" multibinary_sha256 test: Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c new file mode 100644 index 000000000..51759d7a8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c @@ -0,0 +1,129 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <openssl/sha.h> +#include "sha256_mb.h" +#include "test.h" + +// Set number of outstanding jobs +#define TEST_BUFS 32 + +#ifdef CACHED_TEST +// Loop many times over same data +# define TEST_LEN 4*1024 +# define TEST_LOOPS 4000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 20 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS]; + +int main(void) +{ + SHA256_HASH_CTX_MGR *mgr = NULL; + SHA256_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail"); + return -1; + } + sha256_ctx_mgr_init(mgr); + + // Start OpenSSL tests + 
perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + SHA256(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("sha256_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + sha256_ctx_mgr_submit(mgr, + &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + + while (sha256_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_sha256" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + to_be32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + + printf("Multi-buffer sha256 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha256_ossl_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c new file mode 100644 index 000000000..235ec74a8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c @@ -0,0 +1,132 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <openssl/sha.h> +#include "sha256_mb.h" +#include "test.h" + +// Set number of outstanding jobs +#define TEST_BUFS SHA256_MAX_LANES + +// Size macros are parenthesized so they expand safely inside any expression +#ifdef CACHED_TEST +// Loop many times over same data +# define TEST_LEN (4*1024) +# define TEST_LOOPS 10000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. 
+# define GT_L3_CACHE (32*1024*1024) /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM (TEST_LEN * TEST_BUFS * TEST_LOOPS) + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS]; + +// Times OpenSSL SHA256() once, then times the multi-buffer manager repeatedly with +// nlanes = TEST_BUFS down to 1 jobs submitted per iteration, comparing the digests +// after each pass. Returns the number of mismatching digest words (0 on success), +// 1 if a buffer allocation fails, and -1 if the manager allocation fails. +int main(void) +{ + SHA256_HASH_CTX_MGR *mgr = NULL; + SHA256_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + uint32_t nlanes; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR)); + if (ret) { + printf("alloc error: Fail\n"); + return -1; + } + sha256_ctx_mgr_init(mgr); + + // Start OpenSSL tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + SHA256(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("sha256_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb shortage tests + for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) { + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < nlanes; i++) + sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, + HASH_ENTIRE); + + while (sha256_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_sha256" TEST_TYPE_STR " with %d lanes: ", nlanes); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < nlanes; i++) { + for (j = 0; j < SHA256_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + to_be32(((uint32_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, 
ctxpool[i].job.result_digest[j], + to_be32(((uint32_t *) digest_ssl[i])[j])); + } + } + } + } + + printf("Multi-buffer sha256 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha256_ossl_shortage_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm new file mode 100644 index 000000000..f45669c6e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm @@ -0,0 +1,930 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +[bits 64] +default rel +section .text + +;; code to compute oct SHA256 using SSE-256 / AVX512 +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31 +;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp r8 +;; +;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp r8 +;; +;; clobbers zmm0-31 + +%define APPEND(a,b) a %+ b + +; Define Stack Layout +START_FIELDS +;;; name size align +FIELD _DIGEST_SAVE, 8*64, 64 +FIELD _rsp, 8, 8 +%assign STACK_SPACE _FIELD_OFFSET + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg1 rcx ; arg0 preserved + %define arg2 rdx ; arg1 + %define reg3 r8 ; arg2 preserved + %define reg4 r9 ; arg3 + %define var1 rdi + %define var2 rsi + %define local_func_decl(func_name) global func_name + %else + %define arg1 rdi ; arg0 + %define arg2 rsi ; arg1 + %define var1 rdx ; arg2 + %define var2 rcx ; arg3 + %define local_func_decl(func_name) mk_global func_name, function, internal +%endif + +%define state arg1 +%define num_blks arg2 + +%define IN (state + _data_ptr) +%define DIGEST state +%define SIZE num_blks + +%define IDX var1 +%define TBL var2 + 
+%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define E zmm4 +%define F zmm5 +%define G zmm6 +%define H zmm7 +%define T1 zmm8 +%define TMP0 zmm9 +%define TMP1 zmm10 +%define TMP2 zmm11 +%define TMP3 zmm12 +%define TMP4 zmm13 +%define TMP5 zmm14 +%define TMP6 zmm15 + +%define W0 zmm16 +%define W1 zmm17 +%define W2 zmm18 +%define W3 zmm19 +%define W4 zmm20 +%define W5 zmm21 +%define W6 zmm22 +%define W7 zmm23 +%define W8 zmm24 +%define W9 zmm25 +%define W10 zmm26 +%define W11 zmm27 +%define W12 zmm28 +%define W13 zmm29 +%define W14 zmm30 +%define W15 zmm31 + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 rax + +;; TRANSPOSE16 r0..r15, t0, t1 +;; Transposes the 16x16 dword matrix held in r0-r15, leaving the result in r0-r15. +;; t0 and t1 are scratch registers. Input and output layouts are listed below. +%macro TRANSPOSE16 18 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define %%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%t0 %17 +%define %%t1 %18 + +; Input layout: +; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0} +; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0} +; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0} +; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0} +; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0} +; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0} +; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0} +; r14 = {o15 o14 o13 o12 
o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0} +; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0} + +; Output layout: +; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} +; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} +; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} +; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} +; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} +; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} +; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} +; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} +; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + + + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2} + + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0} + + ; use r2 in place of t0 + vshufps %%r2, %%r4, 
%%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2} + + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0} + + ; use r6 in place of t0 + vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0} + vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2} + vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0} + vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2} + + vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1} + vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2} + vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3} + vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0} + + ; use r10 in place of t0 + vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0} + vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2} + vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0} + vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2} + + vshufps 
%%r15, %%r10, %%t1, 0xDD ; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1} + vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2} + vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3} + vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0} + +;; At this point, the registers that contain interesting data are: +;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12 +;; Can use t1 and r14 as scratch registers + + vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0} + vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4} + + vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1} + vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5} + + vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6} + + vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3} + vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7} + + vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0} + vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4} + + vmovdqa32 %%r6, 
[PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1} + vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5} + + vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2} + vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6} + + vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1] + vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3} + vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2] + vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7} + +;; At this point r8 and r12 can be used as scratch registers + + vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} + vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} + + vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} + vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} + + vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} + vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + + vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} + + vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} + vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} + + vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 
i10 h10 g10 f10 e10 d10 c10 b10 a10} + vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + + vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} + vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} + + vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} + vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} + + vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + +%endmacro + +;; ROTATE_ARGS: rotates the symbolic names A-H by one register +;; (new A = old H, new B = old A, ..., new H = old G); emits no instructions +%macro ROTATE_ARGS 0 +%xdefine TMP_ H +%xdefine H G +%xdefine G F +%xdefine F E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +;; CH(A, B, C) = (A&B) ^ (~A&C) +;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G) +;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22 +;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25 +;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3 +;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10 +;; vpternlogd immediates used below: 0xCA = CH, 0xE8 = MAJ, 0x96 = three-way XOR + +; Main processing loop per round +; PROCESS_LOOP WT, ROUND: one round using message word WT, no message scheduling +; Expects Kt for this round in TMP3 and leaves the next round's Kt in TMP3 +%macro PROCESS_LOOP 2 +%define %%WT %1 +%define %%ROUND %2 + ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt + ;; T2 = SIGMA0(A) + MAJ(A, B, C) + ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2 + + ;; H becomes T2, then add T1 for A + ;; D becomes D + T1 for E + + vpaddd T1, H, TMP3 ; T1 = H + Kt + vmovdqa32 TMP0, E + vprord TMP1, E, 6 ; ROR_6(E) + vprord TMP2, E, 11 ; ROR_11(E) + vprord TMP3, E, 25 ; ROR_25(E) + vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G) + vpaddd T1, T1, %%WT ; T1 = T1 + Wt + vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E) + vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G) + vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E) + vpaddd D, D, T1 ; D = D + T1 + + vprord H, A, 2 ; ROR_2(A) + vprord TMP2, A, 13 ; ROR_13(A) + vprord TMP3, A, 22 ; ROR_22(A) + vmovdqa32 TMP0, A + vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C) + vpternlogd H, TMP2, TMP3, 0x96 ; 
H(T2) = SIGMA0(A) + vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C) + vpaddd H, H, T1 ; H(A) = H(T2) + T1 + + vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt + + ;; Rotate the args A-H (rotation of names associated with regs) + ROTATE_ARGS +%endmacro + +; This is supposed to be SKL optimized assuming: +; vpternlog, vpaddd ports 5,8 +; vprord ports 1,8 +; However, vprord is only working on port 8 +; +; Main processing loop per round +; Get the msg schedule word 16 from the current, now unnecessary word +; Expects Kt for this round in TMP6 and leaves the next round's Kt in TMP6; +; WT is overwritten in place with the schedule word for round+16 +%macro PROCESS_LOOP_00_47 5 +%define %%WT %1 +%define %%ROUND %2 +%define %%WTp1 %3 +%define %%WTp9 %4 +%define %%WTp14 %5 + ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt + ;; T2 = SIGMA0(A) + MAJ(A, B, C) + ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2 + + ;; H becomes T2, then add T1 for A + ;; D becomes D + T1 for E + + ;; For next value in msg schedule + ;; Wt+16 = sigma1(Wt+14) + Wt+9 + sigma0(Wt+1) + Wt + + vmovdqa32 TMP0, E + vprord TMP1, E, 6 ; ROR_6(E) + vprord TMP2, E, 11 ; ROR_11(E) + vprord TMP3, E, 25 ; ROR_25(E) + vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G) + vpaddd T1, H, %%WT ; T1 = H + Wt + vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E) + vpaddd T1, T1, TMP6 ; T1 = T1 + Kt + vprord H, A, 2 ; ROR_2(A) + vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G) + vprord TMP2, A, 13 ; ROR_13(A) + vmovdqa32 TMP0, A + vprord TMP3, A, 22 ; ROR_22(A) + vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E) + vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C) + vpaddd D, D, T1 ; D = D + T1 + vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A) + vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2) + vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C) + vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2) + vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2) + vpaddd H, H, T1 ; H(A) = H(T2) + T1 + vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2) + vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15) + vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15) + vpaddd %%WT, %%WT, 
%%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7 + vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15) + vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15) + vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + + ; Wt-7 + sigma0(Wt-15) + + + vmovdqa32 TMP6, [TBL + ((%%ROUND+1)*64)] ; Next Kt + + ;; Rotate the args A-H (rotation of names associated with regs) + ROTATE_ARGS +%endmacro + +;; MSG_SCHED_ROUND_16_63 WT, WTp1, WTp9, WTp14: message schedule step only, +;; WT = WT + sigma1(WTp14) + WTp9 + sigma0(WTp1); clobbers TMP4, TMP5 and TMP6 +%macro MSG_SCHED_ROUND_16_63 4 +%define %%WT %1 +%define %%WTp1 %2 +%define %%WTp9 %3 +%define %%WTp14 %4 + vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2) + vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2) + vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2) + vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2) + + vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7 + + vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15) + vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15) + vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15) + vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15) + + vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + + ; Wt-7 + sigma0(Wt-15) + +%endmacro + +; Note this is reading in a block of data for one lane +; When all 16 are read, the data must be transposed to build msg schedule +; OFFSET selects the lane's pointer at [IN + OFFSET*8]; 64 bytes are loaded from +; that pointer plus IDX into WT, using inp0 as the temporary pointer register +%macro MSG_SCHED_ROUND_00_15 2 +%define %%WT %1 +%define %%OFFSET %2 + mov inp0, [IN + (%%OFFSET*8)] + vmovups %%WT, [inp0+IDX] +%endmacro + +align 64 + +;; void sha256_mb_x16_avx512(SHA256_MB_ARGS_X16, uint32_t size) +; arg 1 : pointer to input data +; arg 2 : size (in blocks) ;; assumed to be >= 1 +local_func_decl(sha256_mb_x16_avx512) +sha256_mb_x16_avx512: + endbranch + mov rax, rsp + sub rsp, STACK_SPACE + and rsp, ~63 ; align stack to multiple of 64 + mov [rsp + _rsp], rax + lea TBL, [TABLE] + + ;; Initialize digests + vmovups A, [DIGEST + 0*64] + vmovups B, [DIGEST + 1*64] + vmovups C, [DIGEST + 2*64] + vmovups D, [DIGEST + 3*64] + vmovups E, [DIGEST + 4*64] + vmovups F, [DIGEST + 5*64] + vmovups G, [DIGEST + 6*64] + vmovups H, [DIGEST + 7*64] + + ; Do we need to transpose digests???
+ ; SHA1 does not, but SHA256 has been + + xor IDX, IDX + + ;; Read in first block of input data + ;; Transpose input data + mov inp0, [IN + 0*8] + mov inp1, [IN + 1*8] + mov inp2, [IN + 2*8] + mov inp3, [IN + 3*8] + mov inp4, [IN + 4*8] + mov inp5, [IN + 5*8] + mov inp6, [IN + 6*8] + mov inp7, [IN + 7*8] + + vmovups W0,[inp0+IDX] + vmovups W1,[inp1+IDX] + vmovups W2,[inp2+IDX] + vmovups W3,[inp3+IDX] + vmovups W4,[inp4+IDX] + vmovups W5,[inp5+IDX] + vmovups W6,[inp6+IDX] + vmovups W7,[inp7+IDX] + + mov inp0, [IN + 8*8] + mov inp1, [IN + 9*8] + mov inp2, [IN +10*8] + mov inp3, [IN +11*8] + mov inp4, [IN +12*8] + mov inp5, [IN +13*8] + mov inp6, [IN +14*8] + mov inp7, [IN +15*8] + + vmovups W8, [inp0+IDX] + vmovups W9, [inp1+IDX] + vmovups W10,[inp2+IDX] + vmovups W11,[inp3+IDX] + vmovups W12,[inp4+IDX] + vmovups W13,[inp5+IDX] + vmovups W14,[inp6+IDX] + vmovups W15,[inp7+IDX] + + +lloop: + vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK] + + vmovdqa32 TMP3, [TBL] ; First K + + ; Save digests for later addition + vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A + vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B + vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C + vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D + vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E + vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F + vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G + vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H + + add IDX, 64 + + TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 + +%assign I 0 +%rep 16 + vpshufb APPEND(W,I), APPEND(W,I), TMP2 +%assign I (I+1) +%endrep + + ; MSG Schedule for W0-W15 is now complete in registers + ; Process first 48 rounds + ; Calculate next Wt+16 after processing is complete and Wt is unneeded + + ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M) + +%assign I 0 +%assign J 0 +%assign K 1 +%assign L 9 +%assign M 14 +%rep 48 + PROCESS_LOOP APPEND(W,J), I + MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) +%assign I 
(I+1) +%assign J ((J+1)% 16) +%assign K ((K+1)% 16) +%assign L ((L+1)% 16) +%assign M ((M+1)% 16) +%endrep + + ; Check is this is the last block + sub SIZE, 1 + je lastLoop + + ; Process last 16 rounds + ; Read in next block msg data for use in first 16 words of msg sched +%assign I 48 +%assign J 0 +%rep 16 + PROCESS_LOOP APPEND(W,J), I + MSG_SCHED_ROUND_00_15 APPEND(W,J), J +%assign I (I+1) +%assign J (J+1) +%endrep + + ; Add old digest + vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0] + vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1] + vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2] + vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3] + vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4] + vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5] + vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6] + vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7] + + jmp lloop + +lastLoop: + ; Process last 16 rounds +%assign I 48 +%assign J 0 +%rep 16 + PROCESS_LOOP APPEND(W,J), I +%assign I (I+1) +%assign J (J+1) +%endrep + + ; Add old digest + vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0] + vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1] + vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2] + vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3] + vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4] + vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5] + vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6] + vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7] + + ;; update into data pointers +%assign I 0 +%rep 8 + mov inp0, [IN + (2*I)*8] + mov inp1, [IN + (2*I +1)*8] + add inp0, IDX + add inp1, IDX + mov [IN + (2*I)*8], inp0 + mov [IN + (2*I+1)*8], inp1 +%assign I (I+1) +%endrep + + ; Write out digest + ; Do we need to untranspose digests??? 
+ vmovups [DIGEST + 0*64], A + vmovups [DIGEST + 1*64], B + vmovups [DIGEST + 2*64], C + vmovups [DIGEST + 3*64], D + vmovups [DIGEST + 4*64], E + vmovups [DIGEST + 5*64], F + vmovups [DIGEST + 6*64], G + vmovups [DIGEST + 7*64], H + + + mov rsp, [rsp + _rsp] + ret + + section .data +align 64 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 
0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f 
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 
0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 
0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f 
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + + +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000 + dq 0x0000000000000001 + dq 0x0000000000000008 + dq 0x0000000000000009 + dq 0x0000000000000004 + dq 0x0000000000000005 + dq 0x000000000000000C + dq 
0x000000000000000D + +PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002 + dq 0x0000000000000003 + dq 0x000000000000000A + dq 0x000000000000000B + dq 0x0000000000000006 + dq 0x0000000000000007 + dq 0x000000000000000E + dq 0x000000000000000F + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha256_mb_x16_avx512 +no_sha256_mb_x16_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm new file mode 100644 index 000000000..7f8f8829b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm @@ -0,0 +1,431 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute quad SHA256 using AVX +;; Logic designed/laid out by JDG + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + + +%define TABLE K256_4_MB +%define SZ 4 +%define SZ4 4*SZ +%define ROUNDS 64*SZ4 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 
xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpslld %%tmp, %%reg, (32-(%%imm)) + vpsrld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORD_nd reg, imm, tmp, src +%macro PRORD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpslld %%tmp, %%src, (32-(%%imm)) + vpsrld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORD dst/src, amt +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +; PRORD_nd dst, src, amt +%macro PRORD_nd 3 + PRORD_nd %1, %3, TMP, %2 +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + + + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, e ; ch: a2 = (f^g)&e + vpxor a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa [SZ4*(%%i&0xf) + rsp], %%T1 + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W 
+ K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + + vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp] + vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp] + vmovdqa a0, %%T1 + PRORD %%T1, 18-7 + vmovdqa a2, a1 + PRORD a1, 19-17 + vpxor %%T1, %%T1, a0 + PRORD %%T1, 7 + vpxor a1, a1, a2 + PRORD a1, 17 + vpsrld a0, a0, 3 + vpxor %%T1, %%T1, a0 + vpsrld a2, a2, 10 + vpxor a1, a1, a2 + vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp] + vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp] + vpaddd %%T1, %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + +%define DIGEST_SIZE 8*SZ4 +%define DATA 16*SZ4 +%define ALIGNMENT 1*8 +; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8 +%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT) +%define _DIGEST (DATA) + +%define VMOVPS vmovups + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux definitions + %define arg1 rdi + %define arg2 rsi +%else + ; Windows definitions + %define arg1 rcx + %define arg2 rdx +%endif + +; Common definitions +%define IDX rax +%define ROUND rbx +%define TBL r12 + +;; void sha256_mb_x4_avx(SHA256_MB_ARGS_X8 *args, uint64_t len); +;; arg 1 : arg1 : pointer args (only 4 of the 8 lanes used) +;; arg 2 : arg2 : size of data in blocks (assumed >= 1) +;; +;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15 +;; +mk_global sha256_mb_x4_avx, function, internal +align 32 +sha256_mb_x4_avx: + endbranch + sub rsp, FRAMESZ + + ;; Initialize digests + vmovdqa a,[arg1+0*SZ4] + vmovdqa b,[arg1+1*SZ4] + vmovdqa c,[arg1+2*SZ4] + vmovdqa d,[arg1+3*SZ4] + vmovdqa e,[arg1+4*SZ4] + vmovdqa f,[arg1+5*SZ4] + vmovdqa g,[arg1+6*SZ4] + vmovdqa h,[arg1+7*SZ4] + + lea TBL,[TABLE] + + ;; transpose input onto stack + mov inp0,[arg1 + _data_ptr + 0*8] + mov inp1,[arg1 + _data_ptr + 1*8] + mov inp2,[arg1 + _data_ptr + 2*8] + mov inp3,[arg1 + _data_ptr + 3*8] + + xor IDX, IDX +lloop: + xor ROUND, 
ROUND + + ;; save old digest + vmovdqa [rsp + _DIGEST + 0*SZ4], a + vmovdqa [rsp + _DIGEST + 1*SZ4], b + vmovdqa [rsp + _DIGEST + 2*SZ4], c + vmovdqa [rsp + _DIGEST + 3*SZ4], d + vmovdqa [rsp + _DIGEST + 4*SZ4], e + vmovdqa [rsp + _DIGEST + 5*SZ4], f + vmovdqa [rsp + _DIGEST + 6*SZ4], g + vmovdqa [rsp + _DIGEST + 7*SZ4], h + +%assign i 0 +%rep 4 + vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + VMOVPS TT2,[inp0+IDX+i*16] + VMOVPS TT1,[inp1+IDX+i*16] + VMOVPS TT4,[inp2+IDX+i*16] + VMOVPS TT3,[inp3+IDX+i*16] + TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 + vpshufb TT0, TT0, TMP + vpshufb TT1, TT1, TMP + vpshufb TT2, TT2, TMP + vpshufb TT3, TT3, TMP + ROUND_00_15 TT0,(i*4+0) + ROUND_00_15 TT1,(i*4+1) + ROUND_00_15 TT2,(i*4+2) + ROUND_00_15 TT3,(i*4+3) +%assign i (i+1) +%endrep + add IDX, 4*4*4 + + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + vpaddd a, a, [rsp + _DIGEST + 0*SZ4] + vpaddd b, b, [rsp + _DIGEST + 1*SZ4] + vpaddd c, c, [rsp + _DIGEST + 2*SZ4] + vpaddd d, d, [rsp + _DIGEST + 3*SZ4] + vpaddd e, e, [rsp + _DIGEST + 4*SZ4] + vpaddd f, f, [rsp + _DIGEST + 5*SZ4] + vpaddd g, g, [rsp + _DIGEST + 6*SZ4] + vpaddd h, h, [rsp + _DIGEST + 7*SZ4] + + + sub arg2, 1 + jne lloop + + ; write digests out + vmovdqa [arg1+0*SZ4],a + vmovdqa [arg1+1*SZ4],b + vmovdqa [arg1+2*SZ4],c + vmovdqa [arg1+3*SZ4],d + vmovdqa [arg1+4*SZ4],e + vmovdqa [arg1+5*SZ4],f + vmovdqa [arg1+6*SZ4],g + vmovdqa [arg1+7*SZ4],h + + ; update input pointers + add inp0, IDX + mov [arg1 + _data_ptr + 0*8], inp0 + add inp1, IDX + mov [arg1 + _data_ptr + 1*8], inp1 + add inp2, IDX + mov [arg1 + _data_ptr + 2*8], inp2 + add inp3, IDX + mov [arg1 + _data_ptr + 3*8], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, FRAMESZ + ret + +section .data align=64 + +align 64 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 
0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 
0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm new file mode 100644 index 000000000..2d349abbc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm @@ -0,0 +1,426 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute quad SHA256 using SSE +;; Logic designed/laid out by JDG + +; TRANSPOSE r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t1} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; (r2 and t1 are left holding intermediate values) +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0} + shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0} + shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0} + shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2} + shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + shufps %%r0, %%r2, 0x88
; r0 = {d2 c2 b2 a2} + shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + + +; SZ = bytes per 32-bit word, SZ4 = bytes per xmm row (one word for each of 4 lanes). +; ROUND is kept as a byte offset into TABLE and advances by SZ4 per round, so +; ROUNDS is the offset reached once all 64 rounds are done. +%define TABLE K256_4_MB +%define SZ 4 +%define SZ4 4*SZ +%define ROUNDS 64*SZ4 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +; NOTE: TT4/TT5 share xmm10/xmm9 with a2/a1, and T1 shares xmm14 with TT0. +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + + +; Rename the working variables for the next round: a takes over h's register, +; b takes a's, ..., h takes g's. Only preprocessor symbols change; no data moves. +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +; rotate every 32-bit lane of reg right by imm bits; tmp is clobbered +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psrld %%reg, %%imm + pslld %%tmp, (32-(%%imm)) + por %%reg, %%tmp +%endmacro + +; PRORD reg, imm : same rotate, using TMP as the scratch register +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +;; ROUND_00_15 T1, i +;; One SHA-256 round. T1 holds the message word W for this round (it is stored to +;; schedule slot i&0xf on the stack and then clobbered); i is the round index. +;; The working variables a...h are passed implicitly as preprocessor symbols, +;; TBL + ROUND addresses the round constant K, and ROUND is advanced by SZ4. +;; In the comments below '>>' denotes the 32-bit rotate right done by PRORD. +;; Clobbers a0, a1, a2 and TMP. +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + + + movdqa a0, e ; sig1: a0 = e + movdqa a1, e ; sig1: a1 = e + PRORD a0, (11-6) ; sig1: a0 = (e >> 5) + + movdqa a2, f ; ch: a2 = f + pxor a2, g ; ch: a2 = f^g + pand a2, e ; ch: a2 = (f^g)&e + pxor a2, g ; a2 = ch + + PRORD a1, 25 ; sig1: a1 = (e >> 25) + movdqa [SZ4*(%%i&0xf) + rsp],%%T1 + paddd %%T1,[TBL + ROUND] ; T1 = W + K + pxor a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + paddd h, a2 ; h = h + ch + movdqa a2, a ; sig0: a2 = a + PRORD a2, (13-2) ; sig0: a2 = (a >> 11) + paddd h, %%T1 ; h = h + ch + W + K + pxor a0, a1 ; a0 = sigma1 + movdqa a1, a ; sig0: a1 = a + movdqa %%T1, a ; maj: T1 = a + PRORD a1, 22 ; sig0: a1 = (a >> 22) + pxor %%T1, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + pand %%T1, b ; maj: T1 = (a^c)&b + paddd h, a0 ; h = h + ch + W + K + sigma1 + + paddd d, h ; d = d + h (d becomes e after ROTATE_ARGS) + + pxor a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + pxor a2, a1 ; a2 = sig0 + movdqa a1, a ; maj: a1 = a + pand a1, c ;
maj: a1 = a&c + por a1, %%T1 ; a1 = maj + paddd h, a1 ; h = h + ch + W + K + maj + paddd h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; ROUND_16_XX T1, i +;; Message schedule for round i >= 16, followed by the round itself: +;; s0 = (W[i-15] ror 7) ^ (W[i-15] ror 18) ^ (W[i-15] shr 3) +;; s1 = (W[i-2] ror 17) ^ (W[i-2] ror 19) ^ (W[i-2] shr 10) +;; T1 = W[i-16] + s0 + W[i-7] + s1 +;; W[] is the 16-slot window on the stack, indexed modulo 16. T1 is a scratch +;; register that receives the new word; a...h are passed implicitly as +;; preprocessor symbols. +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + + movdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp] + movdqa a1, [SZ4*((%%i-2)&0xf) + rsp] + movdqa a0, %%T1 + PRORD %%T1, 18-7 + movdqa a2, a1 + PRORD a1, 19-17 + pxor %%T1, a0 + PRORD %%T1, 7 + pxor a1, a2 + PRORD a1, 17 + psrld a0, 3 + pxor %%T1, a0 + psrld a2, 10 + pxor a1, a2 + paddd %%T1, [SZ4*((%%i-16)&0xf) + rsp] + paddd a1, [SZ4*((%%i-7)&0xf) + rsp] + paddd %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + +%define DIGEST_SIZE 8*SZ4 +%define DATA 16*SZ4 +%define ALIGNMENT 1*8 +; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8 +%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT) +%define _DIGEST (DATA) + +%define MOVPS movups + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux definitions + %define arg1 rdi + %define arg2 rsi +%else + ; Windows definitions + %define arg1 rcx + %define arg2 rdx +%endif + +; Common definitions +%define IDX rax +%define ROUND rbx +%define TBL r12 + +;; void sha256_mb_x4_sse(SHA256_MB_ARGS_X8 *args, uint64_t len); +;; arg 1 : pointer args (only 4 of the 8 lanes used) +;; arg 2 : size of data in blocks (assumed >= 1) +;; +;; Reads the transposed digest rows from args, processes arg2 64-byte blocks from +;; each of the 4 lane data pointers, then writes the digest rows and the advanced +;; data pointers back to args. +;; +;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15 +;; + +mk_global sha256_mb_x4_sse, function, internal +align 32 +sha256_mb_x4_sse: + endbranch + sub rsp, FRAMESZ ; room for the 16-row schedule window and the saved digest + + ;; Initialize digests + movdqa a,[arg1+0*SZ4] + movdqa b,[arg1+1*SZ4] + movdqa c,[arg1+2*SZ4] + movdqa d,[arg1+3*SZ4] + movdqa e,[arg1+4*SZ4] + movdqa f,[arg1+5*SZ4] + movdqa g,[arg1+6*SZ4] + movdqa h,[arg1+7*SZ4] + + lea TBL,[TABLE] + + ;; load the 4 lane data pointers (input is transposed onto the stack in the loop) + mov inp0,[arg1 + _data_ptr + 0*8] + mov inp1,[arg1 + _data_ptr + 1*8] + mov inp2,[arg1 + _data_ptr + 2*8] + mov inp3,[arg1 + _data_ptr + 3*8] + +
xor IDX, IDX ; IDX = byte offset into every lane's input +lloop: + xor ROUND, ROUND ; restart at the first round constant + + ;; save old digest + movdqa [rsp + _DIGEST + 0*SZ4], a + movdqa [rsp + _DIGEST + 1*SZ4], b + movdqa [rsp + _DIGEST + 2*SZ4], c + movdqa [rsp + _DIGEST + 3*SZ4], d + movdqa [rsp + _DIGEST + 4*SZ4], e + movdqa [rsp + _DIGEST + 5*SZ4], f + movdqa [rsp + _DIGEST + 6*SZ4], g + movdqa [rsp + _DIGEST + 7*SZ4], h + + ;; rounds 0..15: each pass loads 16 bytes per lane, transposes them into 4 rows + ;; of message words, byte-swaps every 32-bit word and runs 4 rounds +%assign i 0 +%rep 4 + movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] ; reloaded each pass: PRORD uses TMP as scratch + MOVPS TT2,[inp0+IDX+i*16] + MOVPS TT1,[inp1+IDX+i*16] + MOVPS TT4,[inp2+IDX+i*16] + MOVPS TT3,[inp3+IDX+i*16] + TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 + pshufb TT0, TMP + pshufb TT1, TMP + pshufb TT2, TMP + pshufb TT3, TMP + ROUND_00_15 TT0,(i*4+0) + ROUND_00_15 TT1,(i*4+1) + ROUND_00_15 TT2,(i*4+2) + ROUND_00_15 TT3,(i*4+3) +%assign i (i+1) +%endrep + add IDX, 4*4*4 ; 64 bytes (one block) consumed per lane + + +; i = 16 from here on (4 passes x 4 rounds above) +%assign i (i*4) + + ;; rounds 16..63: the 16 unrolled rounds below run three times; schedule slots are + ;; indexed modulo 16 and ROUND selects the constant, so the same code serves every pass + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + paddd a, [rsp + _DIGEST + 0*SZ4] + paddd b, [rsp + _DIGEST + 1*SZ4] + paddd c, [rsp + _DIGEST + 2*SZ4] + paddd d, [rsp + _DIGEST + 3*SZ4] + paddd e, [rsp + _DIGEST + 4*SZ4] + paddd f, [rsp + _DIGEST + 5*SZ4] + paddd g, [rsp + _DIGEST + 6*SZ4] + paddd h, [rsp + _DIGEST + 7*SZ4] + + + sub arg2, 1 ; one block done + jne lloop + + ; write digests out + movdqa [arg1+0*SZ4],a + movdqa [arg1+1*SZ4],b + movdqa [arg1+2*SZ4],c + movdqa [arg1+3*SZ4],d + movdqa [arg1+4*SZ4],e + movdqa [arg1+5*SZ4],f + movdqa [arg1+6*SZ4],g + movdqa [arg1+7*SZ4],h + + ; update input pointers + add inp0, IDX + mov [arg1 + _data_ptr + 0*8], inp0 + add inp1, IDX + mov [arg1 + _data_ptr + 1*8], inp1 + add inp2, IDX + mov [arg1 + _data_ptr + 2*8], inp2 + add inp3, IDX + mov [arg1 + _data_ptr + 3*8], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, FRAMESZ + ret + +section .data align=64 + +; SHA-256 round constants, each replicated across the 4 lanes (16 bytes per round) +align 64 +TABLE: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq
0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 
0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm new file mode 100644 index 000000000..dbd9db1b8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm @@ -0,0 +1,620 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +;; code to compute oct SHA256 using SSE-256 / AVX2 +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp r8 +;; +;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp r8 +;; +;; clobbers ymm0-15 + +; arg1/arg2 follow the output format's calling convention; reg3/reg4 are two extra +; scratch registers picked so they never collide with arg1/arg2 in either case. +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux definitions + %define arg1 rdi + %define arg2 rsi + %define reg3 rcx + %define reg4 rdx +%else + ; Windows definitions + %define arg1 rcx + %define arg2 rdx + %define reg3 rsi + %define reg4 rdi +%endif + +; Common definitions +; STATE : args structure (digest rows at offset 0, lane data pointers at _args_data_ptr) +; INP_SIZE : number of blocks still to process (decremented once per block) +%define STATE arg1 +%define INP_SIZE arg2 + +; IDX : byte offset into every lane's input buffer +; ROUND : byte offset of the current round's constants from TBL +; TBL : address of the K256_8_MB constant table +%define IDX rax +%define ROUND rbx +%define TBL reg3 + +; inp0..inp7 : per-lane input data pointers +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 reg4 + +; ymm0 a +; ymm1 b +; ymm2 c +; ymm3 d
+; ymm4 e +; ymm5 f +; ymm6 g TMP0 +; ymm7 h TMP1 +; ymm8 T1 TT0 +; ymm9 TT1 +; ymm10 TT2 +; ymm11 TT3 +; ymm12 a0 TT4 +; ymm13 a1 TT5 +; ymm14 a2 TT6 +; ymm15 TMP TT7 + +%define a ymm0 +%define b ymm1 +%define c ymm2 +%define d ymm3 +%define e ymm4 +%define f ymm5 +%define g ymm6 +%define h ymm7 + +%define T1 ymm8 + +%define a0 ymm12 +%define a1 ymm13 +%define a2 ymm14 +%define TMP ymm15 + +; TMP0/TMP1 are the same registers as g/h (ymm6/ymm7) +%define TMP0 ymm6 +%define TMP1 ymm7 + +; TT0 is the same register as T1; TT4..TT7 are the same registers as a0, a1, a2, TMP +%define TT0 ymm8 +%define TT1 ymm9 +%define TT2 ymm10 +%define TT3 ymm11 +%define TT4 ymm12 +%define TT5 ymm13 +%define TT6 ymm14 +%define TT7 ymm15 + +; SZ8 refers to SHA256_DIGEST_WORD_SIZE, which is defined a few lines below; +; %define bodies are expanded when the macro is used, so the order is harmless. +%define SZ8 8*SHA256_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 64*SZ8 +%define PTR_SZ 8 +%define SHA256_DIGEST_WORD_SIZE 4 +%define MAX_SHA256_LANES 8 +%define NUM_SHA256_DIGEST_WORDS 8 +%define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE) + +; Define stack usage + +;; Assume stack aligned to 32 bytes before call +;; Therefore FRAMESZ mod 32 must be 32-8 = 24 +;; NOTE(review): the struc below adds up to 28*SZ8 + 8 bytes, i.e. FRAMESZ mod 32 = 8, +;; not 24. The function prologue realigns with "and rsp, ~31" after subtracting +;; FRAMESZ, so the statement above looks stale -- confirm before relying on it. +struc stack_frame + .data resb 16*SZ8 ; 16-row message schedule window, addressed as [SZ8*(i&0xf) + rsp] + .digest resb 8*SZ8 ; digest saved at the top of each block iteration + .ytmp resb 4*SZ8 ; spill slots YTMP0..YTMP3 + .rsp resb 8 ; caller's rsp, restored in the postamble +endstruc +%define FRAMESZ stack_frame_size +%define _DIGEST stack_frame.digest +%define _YTMP stack_frame.ytmp +%define _RSP_SAVE stack_frame.rsp + +%define YTMP0 rsp + _YTMP + 0*SZ8 +%define YTMP1 rsp + _YTMP + 1*SZ8 +%define YTMP2 rsp + _YTMP + 2*SZ8 +%define YTMP3 rsp + _YTMP + 3*SZ8 + +%define VMOVPS vmovups + +; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 +; "transpose" data in {r0...r7} using temps {t0...t1} +; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {h7 h6 h5 h4 h3 h2 h1 h0} +; +; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {h3 g3
f3 e3 d3 c3 b3 a3} +; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} +; +%macro TRANSPOSE8 10 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%t0 %9 +%define %%t1 %10 + ; process top half (r0..r3) {a...d} + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0} + + ; use r2 in place of t0 + ; process bottom half (r4..r7) {e...h} + vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2} + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0} + + ; join 128-bit halves: imm 0x02 pairs the two low halves, imm 0x13 the two high + ; halves; in both cases the second source supplies the low 128 bits of the result + vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6 + vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2 + vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5 + vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1 + vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7 + vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3 + vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4 + vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0 +%endmacro + + + +; Rename the working variables for the next round: a takes over h's register, +; b takes a's, ..., h takes g's. Only preprocessor symbols change; no data moves. +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d
+%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +; rotate every 32-bit lane of reg right by imm bits, in place; tmp is clobbered +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpslld %%tmp, %%reg, (32-(%%imm)) + vpsrld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORD_nd reg, imm, tmp, src +; reg = src rotated right by imm bits per 32-bit lane; src is left intact, tmp is clobbered +%macro PRORD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpslld %%tmp, %%src, (32-(%%imm)) + vpsrld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORD dst/src, amt (TMP is the scratch register) +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +; PRORD_nd dst, src, amt (TMP is the scratch register) +%macro PRORD_nd 3 + PRORD_nd %1, %3, TMP, %2 +%endmacro + +;; ROUND_00_15 T1, i +;; One SHA-256 round. T1 holds the message word W for this round (it is stored to +;; schedule slot i&0xf on the stack and then clobbered); i is the round index. +;; The working variables a...h are passed implicitly as preprocessor symbols, +;; TBL + ROUND addresses the round constant K, and ROUND is advanced by SZ8. +;; In the comments below '>>' denotes the 32-bit rotate right done by PRORD. +;; Clobbers a0, a1, a2 and TMP. +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1 + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ8 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 ; h = h + ch + W + K + sigma1 + + vpaddd d, d, h ; d = d + h (d becomes e after ROTATE_ARGS) + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; ROUND_16_XX T1, i +;; Message schedule for round i >= 16, followed by the round itself: +;; s0 = (W[i-15] ror 7) ^ (W[i-15] ror 18) ^ (W[i-15] shr 3) +;; s1 = (W[i-2] ror 17) ^ (W[i-2] ror 19) ^ (W[i-2] shr 10) +;; T1 = W[i-16] + s0 + W[i-7] + s1 +;; W[] is the 16-slot window on the stack, indexed modulo 16. T1 is a scratch +;; register that receives the new word; a...h are passed implicitly as +;; preprocessor symbols. +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp] + vmovdqa a1,
[SZ8*((%%i-2)&0xf) + rsp] + vmovdqa a0, %%T1 + PRORD %%T1, 18-7 + vmovdqa a2, a1 + PRORD a1, 19-17 + vpxor %%T1, %%T1, a0 + PRORD %%T1, 7 + vpxor a1, a1, a2 + PRORD a1, 17 + vpsrld a0, a0, 3 + vpxor %%T1, %%T1, a0 + vpsrld a2, a2, 10 + vpxor a1, a1, a2 + vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp] + vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + rsp] + vpaddd %%T1, %%T1, a1 + + ROUND_00_15 %%T1, %%i + +%endm + + +;; void sha256_mb_x8_avx2(SHA256_ARGS *args, uint64_t num_blocks); +;; arg 1 : STATE : pointer to args (transposed digest rows + per-lane data pointers) +;; arg 2 : INP_SIZE : size of input in blocks +;; Processes INP_SIZE 64-byte blocks from each of the 8 lanes, then writes the +;; digest rows and the advanced data pointers back to STATE. +mk_global sha256_mb_x8_avx2, function, internal +align 16 +sha256_mb_x8_avx2: + endbranch + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + + ; save rsp, allocate 32-byte aligned for local variables + mov IDX, rsp + sub rsp, FRAMESZ + and rsp, ~31 + mov [rsp + _RSP_SAVE], IDX + + + ;; Load the pre-transposed incoming digest. + vmovdqu a,[STATE + 0*SHA256_DIGEST_ROW_SIZE] + vmovdqu b,[STATE + 1*SHA256_DIGEST_ROW_SIZE] + vmovdqu c,[STATE + 2*SHA256_DIGEST_ROW_SIZE] + vmovdqu d,[STATE + 3*SHA256_DIGEST_ROW_SIZE] + vmovdqu e,[STATE + 4*SHA256_DIGEST_ROW_SIZE] + vmovdqu f,[STATE + 5*SHA256_DIGEST_ROW_SIZE] + vmovdqu g,[STATE + 6*SHA256_DIGEST_ROW_SIZE] + vmovdqu h,[STATE + 7*SHA256_DIGEST_ROW_SIZE] + + lea TBL,[K256_8_MB] + + ;; load the address of each of the 8 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ] + mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ] + mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ] + mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ] + mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ] + mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ] + mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ] + mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ] + + xor IDX, IDX ; IDX = byte offset into every lane's input +lloop: + xor ROUND, ROUND ; restart at the first round constant + + ;; save old digest + vmovdqa [rsp + _DIGEST + 0*SZ8], a + vmovdqa [rsp + _DIGEST + 1*SZ8], b + vmovdqa [rsp + _DIGEST +
2*SZ8], c + vmovdqa [rsp + _DIGEST + 3*SZ8], d + vmovdqa [rsp + _DIGEST + 4*SZ8], e + vmovdqa [rsp + _DIGEST + 5*SZ8], f + vmovdqa [rsp + _DIGEST + 6*SZ8], g + vmovdqa [rsp + _DIGEST + 7*SZ8], h + ;; rounds 0..15: each pass loads 32 bytes per lane, transposes them into 8 rows + ;; of message words, byte-swaps every 32-bit word and runs 8 rounds +%assign i 0 +%rep 2 + VMOVPS TT0,[inp0+IDX+i*32] + VMOVPS TT1,[inp1+IDX+i*32] + VMOVPS TT2,[inp2+IDX+i*32] + VMOVPS TT3,[inp3+IDX+i*32] + VMOVPS TT4,[inp4+IDX+i*32] + VMOVPS TT5,[inp5+IDX+i*32] + VMOVPS TT6,[inp6+IDX+i*32] + VMOVPS TT7,[inp7+IDX+i*32] + vmovdqa [YTMP0], g ; g/h sit in ymm6/ymm7, which TRANSPOSE8 uses as temps TMP0/TMP1 + vmovdqa [YTMP1], h + TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1 + vmovdqa TMP1, [PSHUFFLE_BYTE_FLIP_MASK] + vmovdqa g, [YTMP0] + vpshufb TT0, TT0, TMP1 + vpshufb TT1, TT1, TMP1 + vpshufb TT2, TT2, TMP1 + vpshufb TT3, TT3, TMP1 + vpshufb TT4, TT4, TMP1 + vpshufb TT5, TT5, TMP1 + vpshufb TT6, TT6, TMP1 + vpshufb TT7, TT7, TMP1 + vmovdqa h, [YTMP1] + vmovdqa [YTMP0], TT4 ; TT4..TT7 share registers with a0/a1/a2/TMP, which the rounds clobber + vmovdqa [YTMP1], TT5 + vmovdqa [YTMP2], TT6 + vmovdqa [YTMP3], TT7 + ROUND_00_15 TT0,(i*8+0) + vmovdqa TT0, [YTMP0] ; reload the parked rows into registers that are already consumed + ROUND_00_15 TT1,(i*8+1) + vmovdqa TT1, [YTMP1] + ROUND_00_15 TT2,(i*8+2) + vmovdqa TT2, [YTMP2] + ROUND_00_15 TT3,(i*8+3) + vmovdqa TT3, [YTMP3] + ROUND_00_15 TT0,(i*8+4) + ROUND_00_15 TT1,(i*8+5) + ROUND_00_15 TT2,(i*8+6) + ROUND_00_15 TT3,(i*8+7) +%assign i (i+1) +%endrep + add IDX, 4*4*4 ; 64 bytes (one block) consumed per lane + +; i = 16 from here on (2 passes x 8 rounds above) +%assign i (i*8) + + ;; rounds 16..63: the 16 unrolled rounds below run three times; schedule slots are + ;; indexed modulo 16 and ROUND selects the constant, so the same code serves every pass + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + vpaddd a, a, [rsp + _DIGEST + 0*SZ8] + vpaddd b, b, [rsp + _DIGEST + 1*SZ8] + vpaddd c, c, [rsp + _DIGEST + 2*SZ8] + vpaddd d, d, [rsp + _DIGEST + 3*SZ8] + vpaddd e, e, [rsp + _DIGEST + 4*SZ8] + vpaddd f, f, [rsp + _DIGEST + 5*SZ8] + vpaddd g, g, [rsp + _DIGEST + 6*SZ8] + vpaddd h, h, [rsp + _DIGEST + 7*SZ8] + + sub INP_SIZE, 1 ;; unit is blocks + jne lloop + + ; write back to memory (state object) the transposed digest + vmovdqu [STATE + 0*SHA256_DIGEST_ROW_SIZE],a + vmovdqu [STATE + 1*SHA256_DIGEST_ROW_SIZE],b + vmovdqu [STATE +
2*SHA256_DIGEST_ROW_SIZE],c + vmovdqu [STATE + 3*SHA256_DIGEST_ROW_SIZE],d + vmovdqu [STATE + 4*SHA256_DIGEST_ROW_SIZE],e + vmovdqu [STATE + 5*SHA256_DIGEST_ROW_SIZE],f + vmovdqu [STATE + 6*SHA256_DIGEST_ROW_SIZE],g + vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE],h + + ; update input pointers + add inp0, IDX + mov [STATE + _args_data_ptr + 0*8], inp0 + add inp1, IDX + mov [STATE + _args_data_ptr + 1*8], inp1 + add inp2, IDX + mov [STATE + _args_data_ptr + 2*8], inp2 + add inp3, IDX + mov [STATE + _args_data_ptr + 3*8], inp3 + add inp4, IDX + mov [STATE + _args_data_ptr + 4*8], inp4 + add inp5, IDX + mov [STATE + _args_data_ptr + 5*8], inp5 + add inp6, IDX + mov [STATE + _args_data_ptr + 6*8], inp6 + add inp7, IDX + mov [STATE + _args_data_ptr + 7*8], inp7 + + ;;;;;;;;;;;;;;;; + ;; Postamble + mov rsp, [rsp + _RSP_SAVE] ; restore the caller's rsp saved in the prologue + ret + +section .data +; SHA-256 round constants, each replicated across the 8 lanes (32 bytes per round) +align 64 +K256_8_MB: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x72be5d7472be5d74,
0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 
+ dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 
0x84c8781484c87814, 0x84c8781484c87814 + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b + dq 0x0405060700010203, 0x0c0d0e0f08090a0b diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm new file mode 100644 index 000000000..af54f7cc3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm @@ -0,0 +1,125 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" +%include "multibinary.asm" +default rel +[bits 64] + +; declare the L3 ctx level symbols (these will then call the appropriate +; L2 symbols) +extern sha256_ctx_mgr_init_sse +extern sha256_ctx_mgr_submit_sse +extern sha256_ctx_mgr_flush_sse + +extern sha256_ctx_mgr_init_avx +extern sha256_ctx_mgr_submit_avx +extern sha256_ctx_mgr_flush_avx + +extern sha256_ctx_mgr_init_avx2 +extern sha256_ctx_mgr_submit_avx2 +extern sha256_ctx_mgr_flush_avx2 + +; the *_base symbols are referenced only by the HAVE_AS_KNOWS_AVX512 dispatch paths below +extern sha256_ctx_mgr_init_base +extern sha256_ctx_mgr_submit_base +extern sha256_ctx_mgr_flush_base + +; Optional variants: declared only when the matching HAVE_AS_KNOWS_* symbol is defined. +; NOTE(review): presumably the build defines these when the assembler supports the +; extension -- confirm against the build system. +%ifdef HAVE_AS_KNOWS_AVX512 + extern sha256_ctx_mgr_init_avx512 + extern sha256_ctx_mgr_submit_avx512 + extern sha256_ctx_mgr_flush_avx512 +%endif + +%ifdef HAVE_AS_KNOWS_SHANI + extern sha256_ctx_mgr_init_sse_ni + extern sha256_ctx_mgr_submit_sse_ni + extern sha256_ctx_mgr_flush_sse_ni +%endif + +; the combined AVX-512 + SHA-NI variants need both symbols +%ifdef HAVE_AS_KNOWS_AVX512 + %ifdef HAVE_AS_KNOWS_SHANI + extern sha256_ctx_mgr_init_avx512_ni + extern sha256_ctx_mgr_submit_avx512_ni + extern sha256_ctx_mgr_flush_avx512_ni + %endif +%endif + +;;; *_mbinit are initial values for *_dispatched; is
updated on first call. +;;; Therefore, *_dispatch_init is only executed on first call. + +; Initialise symbols +mbin_interface sha256_ctx_mgr_init +mbin_interface sha256_ctx_mgr_submit +mbin_interface sha256_ctx_mgr_flush + +; Exactly one dispatcher macro is instantiated for each of init/submit/flush: +; AVX512 and SHANI : mbin_dispatch_base_to_avx512_shani (base, sse, avx, avx2, avx512, sse_ni, avx512_ni) +; AVX512 only : mbin_dispatch_init6 (base, sse, avx, avx2, avx512) +; SHANI only : mbin_dispatch_sse_to_avx2_shani (sse, avx, avx2, sse_ni) +; neither : mbin_dispatch_init (sse, avx, avx2) +%ifdef HAVE_AS_KNOWS_AVX512 + ; Reuse mbin_dispatch_init6's extension through replacing base by sse version + %ifdef HAVE_AS_KNOWS_SHANI + mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_init, sha256_ctx_mgr_init_base, \ + sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, \ + sha256_ctx_mgr_init_avx512, sha256_ctx_mgr_init_sse_ni, sha256_ctx_mgr_init_avx512_ni + mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_base, \ + sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, \ + sha256_ctx_mgr_submit_avx512, sha256_ctx_mgr_submit_sse_ni, sha256_ctx_mgr_submit_avx512_ni + mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_base, \ + sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, \ + sha256_ctx_mgr_flush_avx512, sha256_ctx_mgr_flush_sse_ni, sha256_ctx_mgr_flush_avx512_ni + %else + mbin_dispatch_init6 sha256_ctx_mgr_init, sha256_ctx_mgr_init_base, \ + sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, \ + sha256_ctx_mgr_init_avx512 + mbin_dispatch_init6 sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_base, \ + sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, \ + sha256_ctx_mgr_submit_avx512 + mbin_dispatch_init6 sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_base, \ + sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, \ + sha256_ctx_mgr_flush_avx512 + %endif +%else + %ifdef HAVE_AS_KNOWS_SHANI + mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, \ + sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, sha256_ctx_mgr_init_sse_ni + mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_submit,
sha256_ctx_mgr_submit_sse, \ + sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, sha256_ctx_mgr_submit_sse_ni + mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, \ + sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, sha256_ctx_mgr_flush_sse_ni + %else + mbin_dispatch_init sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, \ + sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2 + mbin_dispatch_init sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, \ + sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2 + mbin_dispatch_init sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, \ + sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2 + %endif +%endif + +;;; func core, ver, snum +slversion sha256_ctx_mgr_init, 00, 04, 0160 +slversion sha256_ctx_mgr_submit, 00, 04, 0161 +slversion sha256_ctx_mgr_flush, 00, 04, 0162 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm new file mode 100644 index 000000000..25fc9ce16 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm @@ -0,0 +1,361 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission.
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_SHANI + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi +%else + ; Windows + %define arg0 rcx + %define arg1 rdx +%endif + +%define MSG xmm0 +%define STATE0 xmm1 +%define STATE1 xmm2 +%define MSGTMP0 xmm3 +%define MSGTMP1 xmm4 +%define MSGTMP2 xmm5 +%define MSGTMP3 xmm6 +%define MSGTMP4 xmm7 + +%define SHUF_MASK xmm8 + +%define ABEF_SAVE xmm9 +%define CDGH_SAVE xmm10 + +; arg indices start from 0 here, while mgr_flush/submit number theirs from 1 +%define MGR arg0 +%define NBLK arg1 +%define NLANX4 r10 ; consistent with caller +%define IDX r8 ; local variable -- consistent with caller +%define DPTR r11 ; local variable -- input buffer pointer +%define TMP r9 ; local variable -- assistant to address digest +%define TBL rax +;%define TMP2 r8 ; local
variable -- assistant to address digest +align 32 + +; void sha256_ni_x1(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks); +; arg 0 : MGR : pointer to args (only lane IDX is hashed) +; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 (0 returns immediately) +; invisible arg 2 : IDX : lane to hash, taken from the low 8 bits of r10 on entry +; invisible arg 3 : NLANX4 : r10 >> 8 on entry, max lanes*4 for this arch (digest is placed by it) +; (sse/avx is 4, avx2 is 8, avx512 is 16) +; +; Clobbers registers: rax, r8~r11, arg1 (NBLK), xmm0-xmm10 +; +mk_global sha256_ni_x1, function, internal +sha256_ni_x1: + endbranch + shl NBLK, 6 ; transform blk amount into bytes + jz backto_mgr + + ; detach idx from nlanx4 + mov IDX, NLANX4 + shr NLANX4, 8 + and IDX, 0xff + + lea TMP, [MGR + 4*IDX] + ;; Initialize digest + ;; digests -> ABEF(state0), CDGH(state1) + pinsrd STATE0, [TMP + 0*NLANX4], 3 ; A + pinsrd STATE0, [TMP + 1*NLANX4], 2 ; B + pinsrd STATE1, [TMP + 2*NLANX4], 3 ; C + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pinsrd STATE1, [TMP + 1*NLANX4], 2 ; D + pinsrd STATE0, [TMP + 2*NLANX4], 1 ; E + pinsrd STATE1, [TMP + 4*NLANX4], 1 ; G + lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 3*NLANX4 + pinsrd STATE0, [TMP + 2*NLANX4], 0 ; F + pinsrd STATE1, [TMP + 4*NLANX4], 0 ; H + + movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK] + lea TBL, [TABLE] + + ;; Load input pointers + mov DPTR, [MGR + _data_ptr + IDX*8] + ;; nblk is used to indicate data end + add NBLK, DPTR + +lloop: + ; /* Save hash values for addition after rounds */ + movdqa ABEF_SAVE, STATE0 + movdqa CDGH_SAVE, STATE1 + + ; /* Rounds 0-3 */ + movdqu MSG, [DPTR + 0*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP0, MSG + paddd MSG, [TBL + 0*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + ; /* Rounds 4-7 */ + movdqu MSG, [DPTR + 1*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP1, MSG + paddd MSG, [TBL + 1*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP0, MSGTMP1 + + ; /* Rounds 8-11 */ +
movdqu MSG, [DPTR + 2*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP2, MSG + paddd MSG, [TBL + 2*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP1, MSGTMP2 + + ; /* Rounds 12-15 */ + movdqu MSG, [DPTR + 3*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP3, MSG + paddd MSG, [TBL + 3*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP2, MSGTMP3 + + ; /* Rounds 16-19 */ + movdqa MSG, MSGTMP0 + paddd MSG, [TBL + 4*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP3, MSGTMP0 + + ; /* Rounds 20-23 */ + movdqa MSG, MSGTMP1 + paddd MSG, [TBL + 5*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP0, MSGTMP1 + + ; /* Rounds 24-27 */ + movdqa MSG, MSGTMP2 + paddd MSG, [TBL + 6*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP1, MSGTMP2 + + ; /* Rounds 28-31 */ + movdqa MSG, MSGTMP3 + paddd MSG, [TBL + 7*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP2, MSGTMP3 + + ; /* Rounds 32-35 */ + movdqa MSG, MSGTMP0 + paddd MSG, [TBL + 8*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 +
sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP3, MSGTMP0 + + ; /* Rounds 36-39 */ + movdqa MSG, MSGTMP1 + paddd MSG, [TBL + 9*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP0, MSGTMP1 + + ; /* Rounds 40-43 */ + movdqa MSG, MSGTMP2 + paddd MSG, [TBL + 10*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP1, MSGTMP2 + + ; /* Rounds 44-47 */ + movdqa MSG, MSGTMP3 + paddd MSG, [TBL + 11*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP2, MSGTMP3 + + ; /* Rounds 48-51 */ + movdqa MSG, MSGTMP0 + paddd MSG, [TBL + 12*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP3, MSGTMP0 + + ; /* Rounds 52-55 */ + movdqa MSG, MSGTMP1 + paddd MSG, [TBL + 13*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + ; /* Rounds 56-59 */ + movdqa MSG, MSGTMP2 + paddd MSG, [TBL + 14*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + ; /* Rounds 60-63 */ + movdqa MSG, MSGTMP3 + paddd MSG, [TBL + 15*16] + sha256rnds2 STATE1,
STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + ; /* Add current hash values with previously saved */ + paddd STATE0, ABEF_SAVE + paddd STATE1, CDGH_SAVE + + ; Increment data pointer and loop if more to process + add DPTR, 64 + cmp DPTR, NBLK + jne lloop + + ; write out digests + lea TMP, [MGR + 4*IDX] + ;; ABEF(state0), CDGH(state1) -> digests + pextrd [TMP + 0*NLANX4], STATE0, 3 ; A + pextrd [TMP + 1*NLANX4], STATE0, 2 ; B + pextrd [TMP + 2*NLANX4], STATE1, 3 ; C + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 + pextrd [TMP + 1*NLANX4], STATE1, 2 ; D + pextrd [TMP + 2*NLANX4], STATE0, 1 ; E + pextrd [TMP + 4*NLANX4], STATE1, 1 ; G + lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 3*NLANX4 + pextrd [TMP + 2*NLANX4], STATE0, 0 ; F + pextrd [TMP + 4*NLANX4], STATE1, 0 ; H + + ; update input pointers + mov [MGR + _data_ptr + IDX*8], DPTR + +backto_mgr: + ;;;;;;;;;;;;;;;; + ;; Postamble + + ret + + +section .data align=16 +PSHUFFLE_SHANI_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b +TABLE: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha256_ni_x1 +no_sha256_ni_x1: +%endif +%endif ; HAVE_AS_KNOWS_SHANI diff --git
a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm new file mode 100644 index 000000000..74cfc93b6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm @@ -0,0 +1,574 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha256_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_SHANI + +[bits 64] +default rel +section .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 + ; Linux + %define arg0 rdi + %define arg1 rsi +%else + ; Windows + %define arg0 rcx + %define arg1 rdx +%endif + +;; no pushes are made here; rsp is realigned to 16 bytes after reserving FRAMESZ +%define FRAMESZ 64 ; 4 x 16 bytes: saved STATE0/STATE1/STATE0b/STATE1b +%define RSPSAVE rax + +%define MSG xmm0 +%define STATE0 xmm1 +%define STATE1 xmm2 +%define MSGTMP0 xmm3 +%define MSGTMP1 xmm4 +%define MSGTMP2 xmm5 +%define MSGTMP3 xmm6 +%define MSGTMP4 xmm7 + +%define STATE0b xmm8 +%define STATE1b xmm9 +%define MSGTMP0b xmm10 +%define MSGTMP1b xmm11 +%define MSGTMP2b xmm12 +%define MSGTMP3b xmm13 +%define MSGTMP4b xmm14 + +%define SHUF_MASK xmm15 + +; arg indices start from 0 here, while mgr_flush/submit number theirs from 1 +%define MGR arg0 +%define NBLK arg1 +%define NLANX4 r10 ; consistent with caller +%define IDX r8 ; local variable -- consistent with caller +%define DPTR r11 ; local variable -- input buffer pointer +%define DPTRb r12 +%define TMP r9 ; local variable -- assistant to address digest +%define TBL r13 +%define TMPb r14 ; local variable -- assistant to address digest +align 32 + +; void sha256_ni_x2(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks); +; arg 0 : MGR : pointer to args (lanes 0 and 1 are hashed) +; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 (0 returns immediately) +; invisible arg 2 : IDX : low 8 bits of r10 on entry; extracted but not used, lanes 0 and 1 are fixed +; invisible arg 3 : NLANX4 : r10 >> 8 on entry, max lanes*4 for this arch (digest is placed by it) +; (sse/avx is 4, avx2 is 8, avx512 is 16) +; +; Clobbers registers: rax, r8~r14, arg1 (NBLK), xmm0-xmm15 +; +mk_global sha256_ni_x2, function, internal +sha256_ni_x2: + endbranch +
mov RSPSAVE, rsp + sub rsp, FRAMESZ + and rsp, ~0xF ; Align 16Bytes downward + + shl NBLK, 6 ; transform blk amount into bytes + jz backto_mgr + + ; detach idx from nlanx4 + mov IDX, NLANX4 + shr NLANX4, 8 + and IDX, 0xff + + lea TMP, [MGR + 4*0] + lea TMPb, [MGR + 4*1] + + ;; Initialize digest + ;; digests -> ABEF(state0), CDGH(state1) + pinsrd STATE0, [TMP + 0*NLANX4], 3 ; A + pinsrd STATE0, [TMP + 1*NLANX4], 2 ; B + pinsrd STATE1, [TMP + 2*NLANX4], 3 ; C + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*0 + 2*NLANX4 + pinsrd STATE1, [TMP + 1*NLANX4], 2 ; D + pinsrd STATE0, [TMP + 2*NLANX4], 1 ; E + pinsrd STATE1, [TMP + 4*NLANX4], 1 ; G + lea TMP, [TMP + 1*NLANX4] ; MGR + 4*0 + 3*NLANX4 + pinsrd STATE0, [TMP + 2*NLANX4], 0 ; F + pinsrd STATE1, [TMP + 4*NLANX4], 0 ; H + + pinsrd STATE0b, [TMPb + 0*NLANX4], 3 ; A + pinsrd STATE0b, [TMPb + 1*NLANX4], 2 ; B + pinsrd STATE1b, [TMPb + 2*NLANX4], 3 ; C + lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*1 + 2*NLANX4 + pinsrd STATE1b, [TMPb + 1*NLANX4], 2 ; D + pinsrd STATE0b, [TMPb + 2*NLANX4], 1 ; E + pinsrd STATE1b, [TMPb + 4*NLANX4], 1 ; G + lea TMPb, [TMPb + 1*NLANX4] ; MGR + 4*1 + 3*NLANX4 + pinsrd STATE0b, [TMPb + 2*NLANX4], 0 ; F + pinsrd STATE1b, [TMPb + 4*NLANX4], 0 ; H + + movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK] + lea TBL, [TABLE] + + ;; Load input pointers + mov DPTR, [MGR + _data_ptr + 8*0] + mov DPTRb,[MGR + _data_ptr + 8*1] + ;; nblk is used to indicate data end (of lane 0; lane 1 advances in lockstep) + add NBLK, DPTR + +lloop: + ; /* Save hash values for addition after rounds */ + movdqa [rsp + 0*16], STATE0 + movdqa [rsp + 1*16], STATE1 + + movdqa [rsp + 2*16], STATE0b + movdqa [rsp + 3*16], STATE1b + + ; /* Rounds 0-3 */ + movdqu MSG, [DPTR + 0*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP0, MSG + paddd MSG, [TBL + 0*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + movdqu MSG, [DPTRb + 0*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP0b, MSG + paddd MSG, [TBL + 0*16] + sha256rnds2 STATE1b, STATE0b, MSG + pshufd
MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + + ; /* Rounds 4-7 */ + movdqu MSG, [DPTR + 1*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP1, MSG + paddd MSG, [TBL + 1*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP0, MSGTMP1 + + movdqu MSG, [DPTRb + 1*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP1b, MSG + paddd MSG, [TBL + 1*16] + sha256rnds2 STATE1b, STATE0b, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP0b, MSGTMP1b + + ; /* Rounds 8-11 */ + movdqu MSG, [DPTR + 2*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP2, MSG + paddd MSG, [TBL + 2*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP1, MSGTMP2 + + movdqu MSG, [DPTRb + 2*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP2b, MSG + paddd MSG, [TBL + 2*16] + sha256rnds2 STATE1b, STATE0b, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP1b, MSGTMP2b + + ; /* Rounds 12-15 */ + movdqu MSG, [DPTR + 3*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP3, MSG + paddd MSG, [TBL + 3*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP2, MSGTMP3 + + movdqu MSG, [DPTRb + 3*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP3b, MSG + paddd MSG, [TBL + 3*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP3b + palignr MSGTMP4b, MSGTMP2b, 4 + paddd MSGTMP0b, MSGTMP4b + sha256msg2 MSGTMP0b, MSGTMP3b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP2b, MSGTMP3b + + ; /* Rounds 16-19 */ + movdqa MSG, MSGTMP0 + paddd MSG, [TBL + 4*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG +
sha256msg1 MSGTMP3, MSGTMP0 + + movdqa MSG, MSGTMP0b + paddd MSG, [TBL + 4*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP0b + palignr MSGTMP4b, MSGTMP3b, 4 + paddd MSGTMP1b, MSGTMP4b + sha256msg2 MSGTMP1b, MSGTMP0b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP3b, MSGTMP0b + + ; /* Rounds 20-23 */ + movdqa MSG, MSGTMP1 + paddd MSG, [TBL + 5*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP0, MSGTMP1 + + movdqa MSG, MSGTMP1b + paddd MSG, [TBL + 5*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP1b + palignr MSGTMP4b, MSGTMP0b, 4 + paddd MSGTMP2b, MSGTMP4b + sha256msg2 MSGTMP2b, MSGTMP1b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP0b, MSGTMP1b + + ; /* Rounds 24-27 */ + movdqa MSG, MSGTMP2 + paddd MSG, [TBL + 6*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP1, MSGTMP2 + + movdqa MSG, MSGTMP2b + paddd MSG, [TBL + 6*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP2b + palignr MSGTMP4b, MSGTMP1b, 4 + paddd MSGTMP3b, MSGTMP4b + sha256msg2 MSGTMP3b, MSGTMP2b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP1b, MSGTMP2b + + ; /* Rounds 28-31 */ + movdqa MSG, MSGTMP3 + paddd MSG, [TBL + 7*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP2, MSGTMP3 + + movdqa MSG, MSGTMP3b + paddd MSG, [TBL + 7*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP3b + palignr MSGTMP4b, MSGTMP2b, 4 + paddd
MSGTMP0b, MSGTMP4b + sha256msg2 MSGTMP0b, MSGTMP3b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP2b, MSGTMP3b + + ; /* Rounds 32-35 */ + movdqa MSG, MSGTMP0 + paddd MSG, [TBL + 8*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP3, MSGTMP0 + + movdqa MSG, MSGTMP0b + paddd MSG, [TBL + 8*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP0b + palignr MSGTMP4b, MSGTMP3b, 4 + paddd MSGTMP1b, MSGTMP4b + sha256msg2 MSGTMP1b, MSGTMP0b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP3b, MSGTMP0b + + ; /* Rounds 36-39 */ + movdqa MSG, MSGTMP1 + paddd MSG, [TBL + 9*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP0, MSGTMP1 + + movdqa MSG, MSGTMP1b + paddd MSG, [TBL + 9*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP1b + palignr MSGTMP4b, MSGTMP0b, 4 + paddd MSGTMP2b, MSGTMP4b + sha256msg2 MSGTMP2b, MSGTMP1b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP0b, MSGTMP1b + + ; /* Rounds 40-43 */ + movdqa MSG, MSGTMP2 + paddd MSG, [TBL + 10*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP1, MSGTMP2 + + movdqa MSG, MSGTMP2b + paddd MSG, [TBL + 10*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP2b + palignr MSGTMP4b, MSGTMP1b, 4 + paddd MSGTMP3b, MSGTMP4b + sha256msg2 MSGTMP3b, MSGTMP2b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP1b, MSGTMP2b + + ; /* Rounds 44-47 */ + movdqa MSG,
MSGTMP3 + paddd MSG, [TBL + 11*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP3 + palignr MSGTMP4, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP4 + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP2, MSGTMP3 + + movdqa MSG, MSGTMP3b + paddd MSG, [TBL + 11*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP3b + palignr MSGTMP4b, MSGTMP2b, 4 + paddd MSGTMP0b, MSGTMP4b + sha256msg2 MSGTMP0b, MSGTMP3b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP2b, MSGTMP3b + + ; /* Rounds 48-51 */ + movdqa MSG, MSGTMP0 + paddd MSG, [TBL + 12*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP0 + palignr MSGTMP4, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP4 + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + sha256msg1 MSGTMP3, MSGTMP0 + + movdqa MSG, MSGTMP0b + paddd MSG, [TBL + 12*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP0b + palignr MSGTMP4b, MSGTMP3b, 4 + paddd MSGTMP1b, MSGTMP4b + sha256msg2 MSGTMP1b, MSGTMP0b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + sha256msg1 MSGTMP3b, MSGTMP0b + + ; /* Rounds 52-55 */ + movdqa MSG, MSGTMP1 + paddd MSG, [TBL + 13*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP1 + palignr MSGTMP4, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP4 + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + movdqa MSG, MSGTMP1b + paddd MSG, [TBL + 13*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP1b + palignr MSGTMP4b, MSGTMP0b, 4 + paddd MSGTMP2b, MSGTMP4b + sha256msg2 MSGTMP2b, MSGTMP1b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + + ; /* Rounds 56-59 */ + movdqa MSG, MSGTMP2 + paddd MSG, [TBL + 14*16] + sha256rnds2 STATE1, STATE0, MSG + movdqa MSGTMP4, MSGTMP2 + palignr MSGTMP4, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP4 + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + +
movdqa MSG, MSGTMP2b + paddd MSG, [TBL + 14*16] + sha256rnds2 STATE1b, STATE0b, MSG + movdqa MSGTMP4b, MSGTMP2b + palignr MSGTMP4b, MSGTMP1b, 4 + paddd MSGTMP3b, MSGTMP4b + sha256msg2 MSGTMP3b, MSGTMP2b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + + ; /* Rounds 60-63 */ + movdqa MSG, MSGTMP3 + paddd MSG, [TBL + 15*16] + sha256rnds2 STATE1, STATE0, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG + + movdqa MSG, MSGTMP3b + paddd MSG, [TBL + 15*16] + sha256rnds2 STATE1b, STATE0b, MSG + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG + + ; /* Add current hash values with previously saved */ + paddd STATE0, [rsp + 0*16] + paddd STATE1, [rsp + 1*16] + + paddd STATE0b, [rsp + 2*16] + paddd STATE1b, [rsp + 3*16] + + ; Increment data pointer and loop if more to process + add DPTR, 64 + add DPTRb, 64 + cmp DPTR, NBLK + jne lloop + + ; write out digests + lea TMP, [MGR + 4*0] + ;; ABEF(state0), CDGH(state1) -> digests + pextrd [TMP + 0*NLANX4], STATE0, 3 ; A + pextrd [TMP + 1*NLANX4], STATE0, 2 ; B + pextrd [TMP + 2*NLANX4], STATE1, 3 ; C + lea TMP, [TMP + 2*NLANX4] ; MGR + 4*0 + 2*NLANX4 + pextrd [TMP + 1*NLANX4], STATE1, 2 ; D + pextrd [TMP + 2*NLANX4], STATE0, 1 ; E + pextrd [TMP + 4*NLANX4], STATE1, 1 ; G + lea TMP, [TMP + 1*NLANX4] ; MGR + 4*0 + 3*NLANX4 + pextrd [TMP + 2*NLANX4], STATE0, 0 ; F + pextrd [TMP + 4*NLANX4], STATE1, 0 ; H + + lea TMPb, [MGR + 4*1] + ;; ABEF(state0), CDGH(state1) -> digests + pextrd [TMPb + 0*NLANX4], STATE0b, 3 ; A + pextrd [TMPb + 1*NLANX4], STATE0b, 2 ; B + pextrd [TMPb + 2*NLANX4], STATE1b, 3 ; C + lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*1 + 2*NLANX4 + pextrd [TMPb + 1*NLANX4], STATE1b, 2 ; D + pextrd [TMPb + 2*NLANX4], STATE0b, 1 ; E + pextrd [TMPb + 4*NLANX4], STATE1b, 1 ; G + lea TMPb, [TMPb + 1*NLANX4] ; MGR + 4*1 + 3*NLANX4 + pextrd [TMPb + 2*NLANX4], STATE0b, 0 ; F + pextrd [TMPb + 4*NLANX4], STATE1b, 0 ; H + + ; update input pointers + mov [MGR + _data_ptr + 0*8], DPTR + mov [MGR +
_data_ptr + 1*8], DPTRb + +backto_mgr: + ;;;;;;;;;;;;;;;; + ;; Postamble + mov rsp, RSPSAVE + + ret + +section .data align=16 +PSHUFFLE_SHANI_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b +TABLE: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha256_ni_x2 +no_sha256_ni_x2: +%endif +%endif ; HAVE_AS_KNOWS_SHANI diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm new file mode 100644 index 000000000..fc13ec279 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm @@ -0,0 +1,567 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Implement fast SHA-256 with SSSE3 instructions. (x86_64) +; +; Copyright (C) 2013 Intel Corporation.
+; +; Authors: +; James Guilford <james.guilford@intel.com> +; Kirk Yap <kirk.s.yap@intel.com> +; Tim Chen <tim.c.chen@linux.intel.com> +; Transcoded by: +; Xiaodong Liu <xiaodong.liu@intel.com> +; +; This software is available to you under the OpenIB.org BSD license +; below: +; +; Redistribution and use in source and binary forms, with or +; without modification, are permitted provided that the following +; conditions are met: +; +; - Redistributions of source code must retain the above +; copyright notice, this list of conditions and the following +; disclaimer. +; +; - Redistributions in binary form must reproduce the above +; copyright notice, this list of conditions and the following +; disclaimer in the documentation and/or other materials +; provided with the distribution. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +; BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +; ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +; CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +; SOFTWARE. +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; This code is described in an Intel White-Paper: +; "Fast SHA-256 Implementations on Intel Architecture Processors" +; +; To find it, surf to http://www.intel.com/p/en_US/embedded +; and search for that title. 
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "sha256_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

[bits 64]
default rel
section .text

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Integer argument registers differ between the two supported ABIs.
%ifidn __OUTPUT_FORMAT__, elf64
 ; Linux
 %define arg0  rdi
 %define arg1  rsi
%else
 ; Windows
 %define arg0  rcx
 %define arg1  rdx
%endif

; Message schedule: X0..X3 hold the 16 most recent schedule dwords.
; The names are rotated by rotate_Xs after every four rounds.
%xdefine X0 xmm4
%xdefine X1 xmm5
%xdefine X2 xmm6
%xdefine X3 xmm7

%xdefine XTMP0 xmm0
%xdefine XTMP1 xmm1
%xdefine XTMP2 xmm2
%xdefine XTMP3 xmm3
%xdefine XTMP4 xmm8
%xdefine XFER  xmm9

%define SHUF_00BA       xmm10   ; shuffle xBxA -> 00BA
%define SHUF_DC00       xmm11   ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK  xmm12

; Argument indices here start from 0, while mgr_flush/submit count from 1.
%define MGR     arg0    ; rdi or rcx
%define NBLK    arg1    ; rsi or rdx
%define IDX     r8      ; local variable -- consistent with caller
%define NLANX4  r10     ; consistent with caller, should be r10

; r9 is time-shared between four roles; whichever value is not currently
; live is kept in its stack slot.
%define TMGR r9         ; manager pointer, stack copy is _TMGR
%define INP  r9         ; data pointer, stack copy is _INP
%define SRND r9         ; round-group counter, clobbers INP
%define TMP  r9         ; scratch used to address the digest words

; Working variables a..h. The names are rotated by ROTATE_ARGS after
; every round.  Note that c/d/e/f live in ecx/esi/edx/edi, i.e. they overwrite
; the argument registers, which is why MGR and the data end pointer are kept
; on the stack.
%xdefine TBL rbp
%xdefine c ecx
%xdefine d esi
%xdefine e edx
%xdefine a eax
%xdefine b ebx

%xdefine f edi
%xdefine g r12d
%xdefine h r11d

%xdefine y0 r13d
%xdefine y1 r14d
%xdefine y2 r15d


;; FRAMESZ plus pushes must be an odd multiple of 8
;; (with the sizes below STACK_SIZE is 120, so _XFER ends up 16-byte aligned
;; for the movdqa accesses, given the usual rsp = 8 mod 16 at function entry)
%define _STACK_ALIGN_SIZE 8     ; 0 or 8 depends on pushes
%define _INP_END_SIZE     8
%define _INP_SIZE         8
%define _TMGR_SIZE        8
%define _XFER_SIZE        16
%define _XMM_SAVE_SIZE    0
%define _GPR_SAVE_SIZE    8*9   ; rbx, rdx, rbp, (rdi, rsi), r12~r15

%define _STACK_ALIGN 0
%define _INP_END     (_STACK_ALIGN + _STACK_ALIGN_SIZE)
%define _INP         (_INP_END + _INP_END_SIZE)
%define _TMGR        (_INP + _INP_SIZE)
%define _XFER        (_TMGR + _TMGR_SIZE)
%define _XMM_SAVE    (_XFER + _XFER_SIZE)
%define _GPR_SAVE    (_XMM_SAVE + _XMM_SAVE_SIZE)
%define STACK_SIZE   (_GPR_SAVE + _GPR_SAVE_SIZE)

;; assume buffers not aligned
%define MOVDQ movdqu

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm reg, [mem]
; Add reg into mem, then reload reg from mem, so that both hold the sum.
%macro addm 2
        add     %2, %1
        mov     %1, %2
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem (unaligned load) and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
        MOVDQ   %1, %2
        pshufb  %1, %3
%endmacro

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endmacro

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endmacro

; FOUR_ROUNDS_AND_SCHED
; Performs four rounds using the four (k + w) dwords stored at [rsp + _XFER],
; interleaved with the computation of the next four schedule dwords, which
; replace X0 before the X names are rotated.
; In the scalar comments, ">>" stands for the rotate performed by "ror".
%macro FOUR_ROUNDS_AND_SCHED 0
        ;; compute s0 four at a time and s1 two at a time
        ;; compute W[-16] + W[-7] 4 at a time
        movdqa  XTMP0, X3
        mov     y0, e                   ; y0 = e
        ror     y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        palignr XTMP0, X2, 4            ; XTMP0 = W[-7]
        ror     y1, (22-13)             ; y1 = a >> (22-13)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        movdqa  XTMP1, X1
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        xor     y2, g                   ; y2 = f^g
        paddd   XTMP0, X0               ; XTMP0 = W[-7] + W[-16]
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        ;; compute s0
        palignr XTMP1, X0, 4            ; XTMP1 = W[-15]
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        movdqa  XTMP2, XTMP1            ; XTMP2 = W[-15]
        ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER]       ; y2 = k + w + S1 + CH
        movdqa  XTMP3, XTMP1            ; XTMP3 = W[-15]
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        pslld   XTMP1, (32-7)           ;
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        psrld   XTMP2, 7                ;
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        por     XTMP1, XTMP2            ; XTMP1 = W[-15] ror 7
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
        movdqa  XTMP2, XTMP3            ; XTMP2 = W[-15]
        mov     y0, e                   ; y0 = e
        mov     y1, a                   ; y1 = a
        movdqa  XTMP4, XTMP3            ; XTMP4 = W[-15]
        ror     y0, (25-11)             ; y0 = e >> (25-11)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        ror     y1, (22-13)             ; y1 = a >> (22-13)
        pslld   XTMP3, (32-18)          ;
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g                   ; y2 = f^g
        psrld   XTMP2, 18               ;
        ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        pxor    XTMP1, XTMP3
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        psrld   XTMP4, 3                ; XTMP4 = W[-15] >> 3
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + (1*4 + _XFER)] ; y2 = k + w + S1 + CH
        ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        pxor    XTMP1, XTMP2            ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        pxor    XTMP1, XTMP4            ; XTMP1 = s0
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        ;; compute low s1
        pshufd  XTMP2, X3, 11111010B    ; XTMP2 = W[-2] {BBAA}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        paddd   XTMP0, XTMP1            ; XTMP0 = W[-16] + W[-7] + s0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
        movdqa  XTMP3, XTMP2            ; XTMP3 = W[-2] {BBAA}
        mov     y0, e                   ; y0 = e
        mov     y1, a                   ; y1 = a
        ror     y0, (25-11)             ; y0 = e >> (25-11)
        movdqa  XTMP4, XTMP2            ; XTMP4 = W[-2] {BBAA}
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        ror     y1, (22-13)             ; y1 = a >> (22-13)
        mov     y2, f                   ; y2 = f
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        psrlq   XTMP2, 17               ; XTMP2 = W[-2] ror 17 {xBxA}
        xor     y2, g                   ; y2 = f^g
        psrlq   XTMP3, 19               ; XTMP3 = W[-2] ror 19 {xBxA}
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        psrld   XTMP4, 10               ; XTMP4 = W[-2] >> 10 {BBAA}
        ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        pxor    XTMP2, XTMP3
        add     y2, y0                  ; y2 = S1 + CH
        ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, [rsp + (2*4 + _XFER)] ; y2 = k + w + S1 + CH
        pxor    XTMP4, XTMP2            ; XTMP4 = s1 {xBxA}
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        pshufb  XTMP4, SHUF_00BA        ; XTMP4 = s1 {00BA}
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        paddd   XTMP0, XTMP4            ; XTMP0 = {..., ..., W[1], W[0]}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        ;; compute high s1
        pshufd  XTMP2, XTMP0, 01010000B ; XTMP2 = W[-2] {DDCC}
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
        movdqa  XTMP3, XTMP2            ; XTMP3 = W[-2] {DDCC}
        mov     y0, e                   ; y0 = e
        ror     y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        movdqa  X0, XTMP2               ; X0 = W[-2] {DDCC}
        ror     y1, (22-13)             ; y1 = a >> (22-13)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        psrlq   XTMP2, 17               ; XTMP2 = W[-2] ror 17 {xDxC}
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        xor     y2, g                   ; y2 = f^g
        psrlq   XTMP3, 19               ; XTMP3 = W[-2] ror 19 {xDxC}
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        psrld   X0, 10                  ; X0 = W[-2] >> 10 {DDCC}
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        pxor    XTMP2, XTMP3            ;
        ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + (3*4 + _XFER)] ; y2 = k + w + S1 + CH
        pxor    X0, XTMP2               ; X0 = s1 {xDxC}
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        pshufb  X0, SHUF_DC00           ; X0 = s1 {DC00}
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        paddd   X0, XTMP0               ; X0 = {W[3], W[2], W[1], W[0]}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
rotate_Xs
%endmacro

;; DO_ROUND n
;; One round without message scheduling.
;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
        mov     y0, e                   ; y0 = e
        ror     y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        ror     y1, (22-13)             ; y1 = a >> (22-13)
        mov     y2, f                   ; y2 = f
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g                   ; y2 = f^g
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        and     y2, e                   ; y2 = (f^g)&e
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        add     y2, y0                  ; y2 = S1 + CH
        ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        %xdefine offset (%1 * 4 + _XFER)
        add     y2, [rsp + offset]      ; y2 = k + w + S1 + CH
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sha256_opt_x1(args, size_in_blocks)   -- single-lane SHA-256 block function
; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
; invisible arg 2 : IDX : which lane to hash
; invisible arg 3 : NLANX4 : max lanes*4 for this arch (the digest words of
;               one lane are NLANX4 bytes apart)
;               (lane count for sse/avx is 4, avx2 is 8, avx512 is 16)
;
; Both invisible args arrive packed in r10: the low byte is IDX and the
; remaining bits, shifted right by 8, are NLANX4.
;
; Clobbers registers: all general regs, xmm0-xmm12
;       {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack}
; MGR is reloaded from its stack slot before returning.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
section .text
mk_global sha256_opt_x1, function, internal
sha256_opt_x1:
        endbranch
        sub     rsp, STACK_SIZE
        mov     [rsp + _GPR_SAVE + 8*0], rbx
        mov     [rsp + _GPR_SAVE + 8*1], rbp
%ifidn __OUTPUT_FORMAT__, win64
        mov     [rsp + _GPR_SAVE + 8*2], rdi
        mov     [rsp + _GPR_SAVE + 8*3], rsi
        ; caller has already stored XMM6~10
        ; NOTE(review): this routine also writes xmm11 and xmm12 (SHUF_DC00,
        ; BYTE_FLIP_MASK) -- confirm the caller preserves those as well.
%endif
        mov     [rsp + _GPR_SAVE + 8*4], r12
        mov     [rsp + _GPR_SAVE + 8*5], r13
        mov     [rsp + _GPR_SAVE + 8*6], r14
        mov     [rsp + _GPR_SAVE + 8*7], r15
        mov     [rsp + _GPR_SAVE + 8*8], rdx

        ; Save MGR before the early exit below: done_hash reloads MGR from
        ; this slot, so it must be valid on the NBLK == 0 path too.
        mov     [rsp + _TMGR], MGR

        shl     NBLK, 6                 ; convert to bytes
        jz      done_hash

        ; detach idx from nlanx4
        mov     IDX, NLANX4
        shr     NLANX4, 8
        and     IDX, 0xff

        ;; Load input pointers
        mov     INP, [MGR + _data_ptr + IDX*8]
        mov     [rsp + _INP], INP
        ;; nblk is used to indicate data end
        add     NBLK, INP
        mov     [rsp + _INP_END], NBLK  ; pointer to end of data


        mov     TMGR, [rsp + _TMGR]
        ;; load initial digest
        ;; word k of lane IDX is read from TMGR + 4*IDX + k*NLANX4; TMP is
        ;; advanced twice because the only available index scales are 1/2/4/8
        lea     TMP, [TMGR + 4*IDX]
        mov     a, [TMP + 0*NLANX4]
        mov     b, [TMP + 1*NLANX4]
        mov     c, [TMP + 2*NLANX4]
        lea     TMP, [TMP + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
        mov     d, [TMP + 1*NLANX4]
        mov     e, [TMP + 2*NLANX4]
        mov     g, [TMP + 4*NLANX4]
        lea     TMP, [TMP + 1*NLANX4]   ; MGR + 4*IDX + 3*NLANX4
        mov     f, [TMP + 2*NLANX4]
        mov     h, [TMP + 4*NLANX4]

        movdqa  BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
        movdqa  SHUF_00BA, [_SHUF_00BA]
        movdqa  SHUF_DC00, [_SHUF_DC00]

        mov     INP, [rsp + _INP]
loop0:                                  ; one iteration per 64-byte block
        lea     TBL, [K256]

        ;; byte swap first 16 dwords
        COPY_XMM_AND_BSWAP      X0, [INP + 0*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X1, [INP + 1*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X2, [INP + 2*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X3, [INP + 3*16], BYTE_FLIP_MASK

        mov     [rsp + _INP], INP       ; r9 is about to be reused as SRND

        ;; schedule 48 input dwords, by doing 3 rounds of 16 each
        mov     SRND, 3

loop1:
        movdqa  XFER, [TBL]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        movdqa  XFER, [TBL + 1*16]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        movdqa  XFER, [TBL + 2*16]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        movdqa  XFER, [TBL + 3*16]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        add     TBL, 4*16
        FOUR_ROUNDS_AND_SCHED

        sub     SRND, 1
        jne     loop1

        ;; last 16 rounds: no further scheduling, 8 rounds per iteration
        mov     SRND, 2
loop2:
        paddd   X0, [TBL]
        movdqa  [rsp + _XFER], X0
        DO_ROUND        0
        DO_ROUND        1
        DO_ROUND        2
        DO_ROUND        3
        paddd   X1, [TBL + 1*16]
        movdqa  [rsp + _XFER], X1
        add     TBL, 2*16
        DO_ROUND        0
        DO_ROUND        1
        DO_ROUND        2
        DO_ROUND        3

        movdqa  X0, X2
        movdqa  X1, X3

        sub     SRND, 1
        jne     loop2

        ; write out digests (same addressing as the initial load)
        mov     TMGR, [rsp + _TMGR]
        lea     TMP, [TMGR + 4*IDX]
        addm    a, [TMP + 0*NLANX4]
        addm    b, [TMP + 1*NLANX4]
        addm    c, [TMP + 2*NLANX4]
        lea     TMP, [TMP + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
        addm    d, [TMP + 1*NLANX4]
        addm    e, [TMP + 2*NLANX4]
        addm    g, [TMP + 4*NLANX4]
        lea     TMP, [TMP + 1*NLANX4]   ; MGR + 4*IDX + 3*NLANX4
        addm    f, [TMP + 2*NLANX4]
        addm    h, [TMP + 4*NLANX4]

        mov     INP, [rsp + _INP]
        add     INP, 64
        cmp     INP, [rsp + _INP_END]
        jne     loop0

done_hash:
        mov     MGR, [rsp + _TMGR]

        mov     rdx, [rsp + _GPR_SAVE + 8*8]
        mov     r15, [rsp + _GPR_SAVE + 8*7]
        mov     r14, [rsp + _GPR_SAVE + 8*6]
        mov     r13, [rsp + _GPR_SAVE + 8*5]
        mov     r12, [rsp + _GPR_SAVE + 8*4]
%ifidn __OUTPUT_FORMAT__, win64
        mov     rsi, [rsp + _GPR_SAVE + 8*3]
        mov     rdi, [rsp + _GPR_SAVE + 8*2]
%endif
        mov     rbp, [rsp + _GPR_SAVE + 8*1]
        mov     rbx, [rsp + _GPR_SAVE + 8*0]
        add     rsp, STACK_SIZE

        ret

section .data
align 64
; SHA-256 round constants, four dwords per row (read 16 bytes at a time
; through TBL with aligned accesses).
K256:
        DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + DQ 0x0405060700010203, 0x0c0d0e0f08090a0b + +; shuffle xBxA -> 00BA +_SHUF_00BA: + DQ 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF + +; shuffle xDxC -> DC00 +_SHUF_DC00: + DQ 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c new file mode 100644 index 000000000..c3515dc52 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c @@ -0,0 +1,204 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <string.h> +#include "sha256_mb.h" +#include "endian_helper.h" + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +// Reference SHA256 Functions +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +#if (__GNUC__ >= 11) +# define OPT_FIX __attribute__ ((noipa)) +#else +# define OPT_FIX +#endif + +#define H0 0x6a09e667 +#define H1 0xbb67ae85 +#define H2 0x3c6ef372 +#define H3 0xa54ff53a +#define H4 0x510e527f +#define H5 0x9b05688c +#define H6 0x1f83d9ab +#define H7 0x5be0cd19 + +#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r)))) + +#define W(x) w[(x) & 15] + +#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3)) +#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10)) + +#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22)) +#define s1(e) 
(ror32(e,6) ^ ror32(e,11) ^ ror32(e,25)) +#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c)) +#define ch(e,f,g) ((e & f) ^ (g & ~e)) + +#define step(i,a,b,c,d,e,f,g,h,k) \ + if (i<16) W(i) = to_be32(ww[i]); \ + else \ + W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \ + t2 = s0(a) + maj(a,b,c); \ + t1 = h + s1(e) + ch(e,f,g) + k + W(i); \ + d += t1; \ + h = t1 + t2; + +static void OPT_FIX sha256_single(const uint8_t * data, uint32_t digest[]); + +void sha256_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len) +{ + uint32_t i, j; + uint8_t buf[2 * SHA256_BLOCK_SIZE]; + + digest[0] = H0; + digest[1] = H1; + digest[2] = H2; + digest[3] = H3; + digest[4] = H4; + digest[5] = H5; + digest[6] = H6; + digest[7] = H7; + + i = len; + while (i >= SHA256_BLOCK_SIZE) { + sha256_single(input_data, digest); + input_data += SHA256_BLOCK_SIZE; + i -= SHA256_BLOCK_SIZE; + } + + memcpy(buf, input_data, i); + buf[i++] = 0x80; + for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - SHA256_PADLENGTHFIELD_SIZE); j++) + buf[j] = 0; + + if (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE) + i = 2 * SHA256_BLOCK_SIZE; + else + i = SHA256_BLOCK_SIZE; + + *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8); + + sha256_single(buf, digest); + if (i == 2 * SHA256_BLOCK_SIZE) + sha256_single(buf + SHA256_BLOCK_SIZE, digest); +} + +void sha256_single(const uint8_t * data, uint32_t digest[]) +{ + uint32_t a, b, c, d, e, f, g, h, t1, t2; + uint32_t w[16]; + uint32_t *ww = (uint32_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + f = digest[5]; + g = digest[6]; + h = digest[7]; + + step(0, a, b, c, d, e, f, g, h, 0x428a2f98); + step(1, h, a, b, c, d, e, f, g, 0x71374491); + step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf); + step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5); + step(4, e, f, g, h, a, b, c, d, 0x3956c25b); + step(5, d, e, f, g, h, a, b, c, 0x59f111f1); + step(6, c, d, e, f, g, h, a, b, 0x923f82a4); + step(7, b, c, d, e, f, g, 
h, a, 0xab1c5ed5); + step(8, a, b, c, d, e, f, g, h, 0xd807aa98); + step(9, h, a, b, c, d, e, f, g, 0x12835b01); + step(10, g, h, a, b, c, d, e, f, 0x243185be); + step(11, f, g, h, a, b, c, d, e, 0x550c7dc3); + step(12, e, f, g, h, a, b, c, d, 0x72be5d74); + step(13, d, e, f, g, h, a, b, c, 0x80deb1fe); + step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7); + step(15, b, c, d, e, f, g, h, a, 0xc19bf174); + step(16, a, b, c, d, e, f, g, h, 0xe49b69c1); + step(17, h, a, b, c, d, e, f, g, 0xefbe4786); + step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6); + step(19, f, g, h, a, b, c, d, e, 0x240ca1cc); + step(20, e, f, g, h, a, b, c, d, 0x2de92c6f); + step(21, d, e, f, g, h, a, b, c, 0x4a7484aa); + step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc); + step(23, b, c, d, e, f, g, h, a, 0x76f988da); + step(24, a, b, c, d, e, f, g, h, 0x983e5152); + step(25, h, a, b, c, d, e, f, g, 0xa831c66d); + step(26, g, h, a, b, c, d, e, f, 0xb00327c8); + step(27, f, g, h, a, b, c, d, e, 0xbf597fc7); + step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3); + step(29, d, e, f, g, h, a, b, c, 0xd5a79147); + step(30, c, d, e, f, g, h, a, b, 0x06ca6351); + step(31, b, c, d, e, f, g, h, a, 0x14292967); + step(32, a, b, c, d, e, f, g, h, 0x27b70a85); + step(33, h, a, b, c, d, e, f, g, 0x2e1b2138); + step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc); + step(35, f, g, h, a, b, c, d, e, 0x53380d13); + step(36, e, f, g, h, a, b, c, d, 0x650a7354); + step(37, d, e, f, g, h, a, b, c, 0x766a0abb); + step(38, c, d, e, f, g, h, a, b, 0x81c2c92e); + step(39, b, c, d, e, f, g, h, a, 0x92722c85); + step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1); + step(41, h, a, b, c, d, e, f, g, 0xa81a664b); + step(42, g, h, a, b, c, d, e, f, 0xc24b8b70); + step(43, f, g, h, a, b, c, d, e, 0xc76c51a3); + step(44, e, f, g, h, a, b, c, d, 0xd192e819); + step(45, d, e, f, g, h, a, b, c, 0xd6990624); + step(46, c, d, e, f, g, h, a, b, 0xf40e3585); + step(47, b, c, d, e, f, g, h, a, 0x106aa070); + step(48, a, b, c, d, e, f, g, h, 0x19a4c116); + step(49, h, 
a, b, c, d, e, f, g, 0x1e376c08); + step(50, g, h, a, b, c, d, e, f, 0x2748774c); + step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5); + step(52, e, f, g, h, a, b, c, d, 0x391c0cb3); + step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a); + step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f); + step(55, b, c, d, e, f, g, h, a, 0x682e6ff3); + step(56, a, b, c, d, e, f, g, h, 0x748f82ee); + step(57, h, a, b, c, d, e, f, g, 0x78a5636f); + step(58, g, h, a, b, c, d, e, f, 0x84c87814); + step(59, f, g, h, a, b, c, d, e, 0x8cc70208); + step(60, e, f, g, h, a, b, c, d, 0x90befffa); + step(61, d, e, f, g, h, a, b, c, 0xa4506ceb); + step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7); + step(63, b, c, d, e, f, g, h, a, 0xc67178f2); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + digest[7] += h; +} |