From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:45:59 +0200 Subject: Adding upstream version 16.2.11+ds. Signed-off-by: Daniel Baumann --- .../isa-l/isa-l_crypto/sha512_mb/Makefile.am | 91 +++ .../isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c | 254 ++++++++ .../isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c | 254 ++++++++ .../isa-l_crypto/sha512_mb/sha512_ctx_avx512.c | 259 +++++++++ .../isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c | 254 ++++++++ .../isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c | 254 ++++++++ .../isa-l/isa-l_crypto/sha512_mb/sha512_job.asm | 54 ++ .../sha512_mb/sha512_mb_mgr_datastruct.asm | 72 +++ .../sha512_mb/sha512_mb_mgr_flush_avx.asm | 218 +++++++ .../sha512_mb/sha512_mb_mgr_flush_avx2.asm | 239 ++++++++ .../sha512_mb/sha512_mb_mgr_flush_avx512.asm | 266 +++++++++ .../sha512_mb/sha512_mb_mgr_flush_sse.asm | 221 +++++++ .../sha512_mb/sha512_mb_mgr_init_avx2.c | 44 ++ .../sha512_mb/sha512_mb_mgr_init_avx512.c | 42 ++ .../sha512_mb/sha512_mb_mgr_init_sse.c | 42 ++ .../sha512_mb/sha512_mb_mgr_submit_avx.asm | 258 +++++++++ .../sha512_mb/sha512_mb_mgr_submit_avx2.asm | 266 +++++++++ .../sha512_mb/sha512_mb_mgr_submit_avx512.asm | 279 +++++++++ .../sha512_mb/sha512_mb_mgr_submit_sse.asm | 256 +++++++++ .../sha512_mb/sha512_mb_rand_ssl_test.c | 171 ++++++ .../isa-l_crypto/sha512_mb/sha512_mb_rand_test.c | 197 +++++++ .../sha512_mb/sha512_mb_rand_update_test.c | 294 ++++++++++ .../isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c | 264 +++++++++ .../sha512_mb/sha512_mb_vs_ossl_perf.c | 143 +++++ .../isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm | 438 ++++++++++++++ .../isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm | 420 ++++++++++++++ .../isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm | 483 ++++++++++++++++ .../isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm | 639 +++++++++++++++++++++ .../isa-l_crypto/sha512_mb/sha512_multibinary.asm | 254 ++++++++ .../isa-l/isa-l_crypto/sha512_mb/sha512_ref.c | 256 +++++++++ .../sha512_mb/sha512_sb_mgr_flush_sse4.c | 46 ++ .../sha512_mb/sha512_sb_mgr_init_sse4.c | 38 ++ .../sha512_mb/sha512_sb_mgr_submit_sse4.c | 65 +++ .../isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm | 394 +++++++++++++ 34 files changed, 7725 insertions(+) create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c create mode 100644 src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm (limited to 'src/crypto/isa-l/isa-l_crypto/sha512_mb') diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am new file mode 100644 index 000000000..6fc22d132 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am @@ -0,0 +1,91 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +lsrc += sha512_mb/sha512_ctx_sse.c \ + sha512_mb/sha512_ctx_avx.c \ + sha512_mb/sha512_ctx_avx2.c \ + sha512_mb/sha512_ctx_sb_sse4.c + +lsrc += sha512_mb/sha512_mb_mgr_init_sse.c \ + sha512_mb/sha512_mb_mgr_init_avx2.c \ + sha512_mb/sha512_sb_mgr_init_sse4.c + + +lsrc += sha512_mb/sha512_mb_mgr_submit_sse.asm \ + sha512_mb/sha512_mb_mgr_submit_avx.asm \ + sha512_mb/sha512_mb_mgr_submit_avx2.asm \ + sha512_mb/sha512_mb_mgr_flush_sse.asm \ + sha512_mb/sha512_mb_mgr_flush_avx.asm \ + sha512_mb/sha512_mb_mgr_flush_avx2.asm \ + sha512_mb/sha512_mb_x2_sse.asm \ + sha512_mb/sha512_mb_x2_avx.asm \ + sha512_mb/sha512_mb_x4_avx2.asm \ + sha512_mb/sha512_multibinary.asm \ + sha512_mb/sha512_sb_mgr_submit_sse4.c \ + sha512_mb/sha512_sb_mgr_flush_sse4.c \ + sha512_mb/sha512_sse4.asm + +lsrc += sha512_mb/sha512_ctx_avx512.c \ + sha512_mb/sha512_mb_mgr_init_avx512.c \ + sha512_mb/sha512_mb_mgr_submit_avx512.asm \ + sha512_mb/sha512_mb_mgr_flush_avx512.asm \ + sha512_mb/sha512_mb_x8_avx512.asm + +extern_hdrs += include/sha512_mb.h \ + include/multi_buffer.h + +other_src += include/datastruct.asm \ + sha512_mb/sha512_job.asm \ + sha512_mb/sha512_mb_mgr_datastruct.asm \ + include/reg_sizes.asm \ + sha512_mb/sha512_ref.c \ + include/memcpy_inline.h \ + include/memcpy.asm \ + include/intrinreg.h + +check_tests += sha512_mb/sha512_mb_test \ + sha512_mb/sha512_mb_rand_test \ + sha512_mb/sha512_mb_rand_update_test + +unit_tests += sha512_mb/sha512_mb_rand_ssl_test + +perf_tests += sha512_mb/sha512_mb_vs_ossl_perf + +sha512_mb_rand_test: sha512_ref.o +sha512_mb_sha512_mb_rand_test_LDADD = sha512_mb/sha512_ref.lo libisal_crypto.la + +sha512_mb_rand_update_test: sha512_ref.o +sha512_mb_sha512_mb_rand_update_test_LDADD = sha512_mb/sha512_ref.lo libisal_crypto.la + +sha512_mb_rand_ssl_test: LDLIBS += -lcrypto +sha512_mb_sha512_mb_rand_ssl_test_LDFLAGS = -lcrypto + +sha512_mb_vs_ossl_perf: LDLIBS += -lcrypto +sha512_mb_sha512_mb_vs_ossl_perf_LDFLAGS = -lcrypto + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c new file mode 100644 index 000000000..4e5173155 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c @@ -0,0 +1,254 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" +#include "memcpy_inline.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA512_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len); +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx); + +void sha512_ctx_mgr_init_avx(SHA512_HASH_CTX_MGR * mgr) +{ + sha512_mb_mgr_init_avx(&mgr->mgr); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx(SHA512_HASH_CTX_MGR * mgr) +{ + SHA512_HASH_CTX *ctx; + + while (1) { + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA512_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA512_WORD_T * digest) +{ + static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] = + { SHA512_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len) +{ + uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1); + + memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3); + + return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha512_ctx_mgr_init_avx_slver_02020166; +struct slver sha512_ctx_mgr_init_avx_slver = { 0x0166, 0x02, 0x02 }; + +struct slver sha512_ctx_mgr_submit_avx_slver_02020167; +struct slver sha512_ctx_mgr_submit_avx_slver = { 0x0167, 0x02, 0x02 }; + +struct slver sha512_ctx_mgr_flush_avx_slver_02020168; +struct slver sha512_ctx_mgr_flush_avx_slver = { 0x0168, 0x02, 0x02 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c new file mode 100644 index 000000000..d1b7d7270 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c @@ -0,0 +1,254 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" +#include "memcpy_inline.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA512_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len); +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx); + +void sha512_ctx_mgr_init_avx2(SHA512_HASH_CTX_MGR * mgr) +{ + sha512_mb_mgr_init_avx2(&mgr->mgr); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx2(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx2(SHA512_HASH_CTX_MGR * mgr) +{ + SHA512_HASH_CTX *ctx; + + while (1) { + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx2(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA512_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA512_WORD_T * digest) +{ + static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] = + { SHA512_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len) +{ + uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1); + + memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3); + + return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha512_ctx_mgr_init_avx2_slver_04020169; +struct slver sha512_ctx_mgr_init_avx2_slver = { 0x0169, 0x02, 0x04 }; + +struct slver sha512_ctx_mgr_submit_avx2_slver_04020170; +struct slver sha512_ctx_mgr_submit_avx2_slver = { 0x0170, 0x02, 0x04 }; + +struct slver sha512_ctx_mgr_flush_avx2_slver_04020171; +struct slver sha512_ctx_mgr_flush_avx2_slver = { 0x0171, 0x02, 0x04 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c new file mode 100644 index 000000000..f99116eb1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c @@ -0,0 +1,259 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" +#include "memcpy_inline.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +#ifdef HAVE_AS_KNOWS_AVX512 + +static inline void hash_init_digest(SHA512_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len); +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx); + +void sha512_ctx_mgr_init_avx512(SHA512_HASH_CTX_MGR * mgr) +{ + sha512_mb_mgr_init_avx512(&mgr->mgr); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx512(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx512(SHA512_HASH_CTX_MGR * mgr) +{ + SHA512_HASH_CTX *ctx; + + while (1) { + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx512(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA512_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = + (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA512_WORD_T * digest) +{ + static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] = + { SHA512_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len) +{ + uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1); + + memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3); + + return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha512_ctx_mgr_init_avx512_slver_0600016a; +struct slver sha512_ctx_mgr_init_avx512_slver = { 0x016a, 0x00, 0x06 }; + +struct slver sha512_ctx_mgr_submit_avx512_slver_0600016b; +struct slver sha512_ctx_mgr_submit_avx512_slver = { 0x016b, 0x00, 0x06 }; + +struct slver sha512_ctx_mgr_flush_avx512_slver_0600016c; +struct slver sha512_ctx_mgr_flush_avx512_slver = { 0x016c, 0x00, 0x06 }; + +#endif // HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c new file mode 100644 index 000000000..6b44f075c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c @@ -0,0 +1,254 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" +#include "memcpy_inline.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA512_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len); +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx); + +void sha512_ctx_mgr_init_sb_sse4(SHA512_HASH_CTX_MGR * mgr) +{ + sha512_sb_mgr_init_sse4(&mgr->mgr); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit_sb_sse4(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx, const void *buffer, + uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr, + &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush_sb_sse4(SHA512_HASH_CTX_MGR * mgr) +{ + SHA512_HASH_CTX *ctx; + + while (1) { + ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_flush_sse4(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA512_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA512_WORD_T * digest) +{ + static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] = + { SHA512_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len) +{ + uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1); + + memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3); + + return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha512_ctx_mgr_init_sb_sse4_slver_05020172; +struct slver sha512_ctx_mgr_init_sb_sse4_slver = { 0x0172, 0x02, 0x05 }; + +struct slver sha512_ctx_mgr_submit_sb_sse4_slver_05020173; +struct slver sha512_ctx_mgr_submit_sb_sse4_slver = { 0x0173, 0x02, 0x05 }; + +struct slver sha512_ctx_mgr_flush_sb_sse4_slver_05020174; +struct slver sha512_ctx_mgr_flush_sb_sse4_slver = { 0x0174, 0x02, 0x05 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c new file mode 100644 index 000000000..b4dfe5332 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c @@ -0,0 +1,254 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" +#include "memcpy_inline.h" + +#ifdef _MSC_VER +# include +# define inline __inline +#endif + +static inline void hash_init_digest(SHA512_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len); +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx); + +void sha512_ctx_mgr_init_sse(SHA512_HASH_CTX_MGR * mgr) +{ + sha512_mb_mgr_init_sse(&mgr->mgr); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_submit_sse(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx, + const void *buffer, uint32_t len, + HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + + ctx = + (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr, &ctx->job); + } + } + + return sha512_ctx_mgr_resubmit(mgr, ctx); +} + +SHA512_HASH_CTX *sha512_ctx_mgr_flush_sse(SHA512_HASH_CTX_MGR * mgr) +{ + SHA512_HASH_CTX *ctx; + + while (1) { + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_sse(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = sha512_ctx_mgr_resubmit(mgr, ctx); + + // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop. + } +} + +static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr, + SHA512_HASH_CTX * ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % SHA512_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= SHA512_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. + if (ctx->status & HASH_CTX_STS_LAST) { + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr, + &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(SHA512_WORD_T * digest) +{ + static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] = + { SHA512_INITIAL_DIGEST }; + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len) +{ + uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1); + + memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE); + padblock[i] = 0x80; + + // Move i to the end of either 1st or 2nd extra block depending on length + i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA512_PADLENGTHFIELD_SIZE; + +#if SHA512_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) & padblock[i - 16]) = 0; +#endif + + *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3); + + return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver sha512_ctx_mgr_init_sse_slver_00020163; +struct slver sha512_ctx_mgr_init_sse_slver = { 0x0163, 0x02, 0x00 }; + +struct slver sha512_ctx_mgr_submit_sse_slver_00020164; +struct slver sha512_ctx_mgr_submit_sse_slver = { 0x0164, 0x02, 0x00 }; + +struct slver sha512_ctx_mgr_flush_sse_slver_00020165; +struct slver sha512_ctx_mgr_flush_sse_slver = { 0x0165, 0x02, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm new file mode 100644 index 000000000..7f2bdae48 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm @@ -0,0 +1,54 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define constants +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define STS_UNKNOWN 0 +%define STS_BEING_PROCESSED 1 +%define STS_COMPLETED 2 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define SHA512_JOB structure +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; JOB_SHA512 + +;;; name size align +FIELD _buffer, 8, 8 ; pointer to buffer +FIELD _len, 8, 8 ; length in bytes +FIELD _result_digest, 8*8, 64 ; Digest (output) +FIELD _status, 4, 4 +FIELD _user_data, 8, 8 + +%assign _SHA512_JOB_size _FIELD_OFFSET +%assign _SHA512_JOB_align _STRUCT_ALIGN diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm new file mode 100644 index 000000000..d1578109e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm @@ -0,0 +1,72 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "datastruct.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Define SHA512 Out Of Order Data Structures +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; LANE_DATA +;;; name size align +FIELD _job_in_lane, 8, 8 ; pointer to job object +END_FIELDS + +%assign _LANE_DATA_size _FIELD_OFFSET +%assign _LANE_DATA_align _STRUCT_ALIGN + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; SHA512_ARGS_X8 +;;; name size align +FIELD _digest, 8*8*8, 4 ; transposed digest +FIELD _data_ptr, 8*8, 8 ; array of pointers to data +END_FIELDS + +%assign _SHA512_ARGS_X4_size _FIELD_OFFSET +%assign _SHA512_ARGS_X4_align _STRUCT_ALIGN +%assign _SHA512_ARGS_X8_size _FIELD_OFFSET +%assign _SHA512_ARGS_X8_align _STRUCT_ALIGN + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +START_FIELDS ; MB_MGR +;;; name size align +FIELD _args, _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align +FIELD _lens, 8*8, 8 +FIELD _unused_lanes, 8, 8 +FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align +FIELD _num_lanes_inuse, 4, 4 +END_FIELDS + +%assign _MB_MGR_size _FIELD_OFFSET +%assign _MB_MGR_align _STRUCT_ALIGN + +_args_digest equ _args + _digest +_args_data_ptr equ _args + _data_ptr diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm new file mode 100644 index 000000000..33c62773a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm @@ -0,0 +1,218 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha512_mb_x2_avx +default rel + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*3 +_ALIGN_SIZE equ 0 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA512_JOB* sha512_mb_mgr_flush_avx(SHA512_MB_JOB_MGR *state) +; arg 1 : rcx : state +global sha512_mb_mgr_flush_avx:function +sha512_mb_mgr_flush_avx: + + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*1], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*2], rsi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 16+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 2 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + mov lens0, [state + _lens + 0*8] + mov idx, lens0 + mov lens1, [state + _lens + 1*8] + cmp lens1, idx + cmovb idx, lens1 + + mov len2, idx + and idx, 0xF + and len2, ~0xFF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + shr len2, 32 + mov [state + _lens + 0*8], lens0 + mov [state + _lens + 1*8], lens1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x2_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + vmovq xmm0, [state + _args_digest + 8*idx + 0*32] + vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1 + vmovq xmm1, [state + _args_digest + 8*idx + 2*32] + vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1 + vmovq xmm2, [state + _args_digest + 8*idx + 4*32] + vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1 + vmovq xmm3, [state + _args_digest + 8*idx + 6*32] + vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + vmovdqa [job_rax + _result_digest + 2*16], xmm2 + vmovdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov r12, [rsp + _GPR_SAVE + 8*1] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +one: dq 1 +two: dq 2 +three: dq 3 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm new file mode 100644 index 000000000..61c25aaef --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm @@ -0,0 +1,239 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha512_mb_x4_avx2 +default rel + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + +struc stack_frame + .xmm: resb 16*10 + .gpr: resb 8*5 + .rsp: resb 8 +endstruc + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE stack_frame.xmm +%define _GPR_SAVE stack_frame.gpr +%define STACK_SPACE stack_frame_size + +%define APPEND(a,b) a %+ b + +; SHA512_JOB* sha512_mb_mgr_flush_avx2(SHA512_MB_JOB_MGR *state) +; arg 1 : rcx : state +global sha512_mb_mgr_flush_avx2:function +sha512_mb_mgr_flush_avx2: + + mov rax, rsp + + sub rsp, STACK_SPACE + and rsp, ~31 + + mov [rsp + stack_frame.rsp], rax + + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*1], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*2], rsi + vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 32+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [two] + cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [three] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + mov lens0, [state + _lens + 0*8] + mov idx, lens0 + mov lens1, [state + _lens + 1*8] + cmp lens1, idx + cmovb idx, lens1 + mov lens2, [state + _lens + 2*8] + cmp lens2, idx + cmovb idx, lens2 + mov lens3, [state + _lens + 3*8] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xFF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 32 + mov [state + _lens + 0*8], lens0 + mov [state + _lens + 1*8], lens1 + mov [state + _lens + 2*8], lens2 + mov [state + _lens + 3*8], lens3 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x4_avx2 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + vmovq xmm0, [state + _args_digest + 8*idx + 0*32] + vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1 + vmovq xmm1, [state + _args_digest + 8*idx + 2*32] + vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1 + vmovq xmm2, [state + _args_digest + 8*idx + 4*32] + vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1 + vmovq xmm3, [state + _args_digest + 8*idx + 6*32] + vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + vmovdqa [job_rax + _result_digest + 2*16], xmm2 + vmovdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov r12, [rsp + _GPR_SAVE + 8*1] + mov rsp, [rsp + stack_frame.rsp] + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +one: dq 1 +two: dq 2 +three: dq 3 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm new file mode 100644 index 000000000..c16517821 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm @@ -0,0 +1,266 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 +extern sha512_mb_x8_avx512 +default rel + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define num_lanes_inuse r9 +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define num_lanes_inuse r9 +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + +struc stack_frame + .xmm: resb 16*10 + .gpr: resb 8*8 + .rsp: resb 8 +endstruc + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE stack_frame.xmm +%define _GPR_SAVE stack_frame.gpr +%define STACK_SPACE stack_frame_size + +%define APPEND(a,b) a %+ b + +; SHA512_JOB* sha512_mb_mgr_flush_avx512(SHA512_MB_JOB_MGR *state) +; arg 1 : rcx : state +global sha512_mb_mgr_flush_avx512:function +sha512_mb_mgr_flush_avx512: + + mov rax, rsp + + sub rsp, STACK_SPACE + + mov [rsp + stack_frame.rsp], rax + + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*3], rbp + mov [rsp + _GPR_SAVE + 8*4], r12 + mov [rsp + _GPR_SAVE + 8*5], r13 + mov [rsp + _GPR_SAVE + 8*6], r14 + mov [rsp + _GPR_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*1], rsi + mov [rsp + _GPR_SAVE + 8*2], rdi + vmovdqu [rsp + _XMM_SAVE + 16*0], xmm6 + vmovdqu [rsp + _XMM_SAVE + 16*1], xmm7 + vmovdqu [rsp + _XMM_SAVE + 16*2], xmm8 + vmovdqu [rsp + _XMM_SAVE + 16*3], xmm9 + vmovdqu [rsp + _XMM_SAVE + 16*4], xmm10 + vmovdqu [rsp + _XMM_SAVE + 16*5], xmm11 + vmovdqu [rsp + _XMM_SAVE + 16*6], xmm12 + vmovdqu [rsp + _XMM_SAVE + 16*7], xmm13 + vmovdqu [rsp + _XMM_SAVE + 16*8], xmm14 + vmovdqu [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + cmp num_lanes_inuse, 0 + jz return_null + + ; find a lane with a non-null job + xor idx, idx +%assign I 1 +%rep 7 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [APPEND(lane_,I)] +%assign I (I+1) +%endrep + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 8 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length, len in sha512_mgr is 64bit, high 32bit is block num, low 8bit is idx + vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a} + vmovdqu ymm1, [state + _lens + 1*32] + + vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i} + vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i} + vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword + + vmovq idx, xmm2 + mov len2, idx + and idx, 0xF + shr len2, 32 ; SHA512 blocksize is 1024bit + jz len_is_0 + + vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i} + vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0} + vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0} + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x8_avx512 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + vmovq xmm0, [state + _args_digest + 8*idx + 0*64] + vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1 + vmovq xmm1, [state + _args_digest + 8*idx + 2*64] + vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1 + vmovq xmm2, [state + _args_digest + 8*idx + 4*64] + vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1 + vmovq xmm3, [state + _args_digest + 8*idx + 6*64] + vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1 + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + vmovdqa [job_rax + _result_digest + 2*16], xmm2 + vmovdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + _XMM_SAVE + 16*0] + vmovdqu xmm7, [rsp + _XMM_SAVE + 16*1] + vmovdqu xmm8, [rsp + _XMM_SAVE + 16*2] + vmovdqu xmm9, [rsp + _XMM_SAVE + 16*3] + vmovdqu xmm10, [rsp + _XMM_SAVE + 16*4] + vmovdqu xmm11, [rsp + _XMM_SAVE + 16*5] + vmovdqu xmm12, [rsp + _XMM_SAVE + 16*6] + vmovdqu xmm13, [rsp + _XMM_SAVE + 16*7] + vmovdqu xmm14, [rsp + _XMM_SAVE + 16*8] + vmovdqu xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*1] + mov rdi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov rbp, [rsp + _GPR_SAVE + 8*3] + mov r12, [rsp + _GPR_SAVE + 8*4] + mov r13, [rsp + _GPR_SAVE + 8*5] + mov r14, [rsp + _GPR_SAVE + 8*6] + mov r15, [rsp + _GPR_SAVE + 8*7] + + mov rsp, [rsp + stack_frame.rsp] + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=32 + +align 32 +clear_low_nibble: ; mgr len element 0xnnnnnnnn 0000000m, nnnnnnnn is blocknum, m is index + dq 0xFFFFFFFF00000000, 0x0000000000000000 + dq 0xFFFFFFFF00000000, 0x0000000000000000 +lane_1: dq 1 +lane_2: dq 2 +lane_3: dq 3 +lane_4: dq 4 +lane_5: dq 5 +lane_6: dq 6 +lane_7: dq 7 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha512_mb_mgr_flush_avx512 +no_sha512_mb_mgr_flush_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm new file mode 100644 index 000000000..602d95330 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm @@ -0,0 +1,221 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha512_mb_x2_sse +default rel + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rsi +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define lens0 r8 + +%define lens1 r9 +%define lens2 r10 +%define lens3 r11 + +; STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE equ 10*16 +_GPR_SAVE_SIZE equ 8*3 +_ALIGN_SIZE equ 0 + +_XMM_SAVE equ 0 +_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +%define APPEND(a,b) a %+ b + +; SHA512_JOB* sha512_mb_mgr_flush_sse(SHA512_MB_JOB_MGR *state) +; arg 1 : rcx : state +global sha512_mb_mgr_flush_sse:function +sha512_mb_mgr_flush_sse: + + sub rsp, STACK_SPACE + mov [rsp + _GPR_SAVE + 8*0], rbx + mov [rsp + _GPR_SAVE + 8*1], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _GPR_SAVE + 8*2], rsi + movdqa [rsp + _XMM_SAVE + 16*0], xmm6 + movdqa [rsp + _XMM_SAVE + 16*1], xmm7 + movdqa [rsp + _XMM_SAVE + 16*2], xmm8 + movdqa [rsp + _XMM_SAVE + 16*3], xmm9 + movdqa [rsp + _XMM_SAVE + 16*4], xmm10 + movdqa [rsp + _XMM_SAVE + 16*5], xmm11 + movdqa [rsp + _XMM_SAVE + 16*6], xmm12 + movdqa [rsp + _XMM_SAVE + 16*7], xmm13 + movdqa [rsp + _XMM_SAVE + 16*8], xmm14 + movdqa [rsp + _XMM_SAVE + 16*9], xmm15 +%endif + + + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 16+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [one] + + ; copy idx to empty lanes +copy_lane_data: + mov tmp, [state + _args + _data_ptr + 8*idx] + +%assign I 0 +%rep 2 + cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args + _data_ptr + 8*I], tmp + mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + mov lens0, [state + _lens + 0*8] + mov idx, lens0 + mov lens1, [state + _lens + 1*8] + cmp lens1, idx + cmovb idx, lens1 + + mov len2, idx + and idx, 0xF + and len2, ~0xFF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + shr len2, 32 + mov [state + _lens + 0*8], lens0 + mov [state + _lens + 1*8], lens1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x2_sse + ; state and idx are intact + + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + movq xmm0, [state + _args_digest + 8*idx + 0*32] + pinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1 + movq xmm1, [state + _args_digest + 8*idx + 2*32] + pinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1 + movq xmm2, [state + _args_digest + 8*idx + 4*32] + pinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1 + movq xmm3, [state + _args_digest + 8*idx + 6*32] + pinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1 + + + movdqa [job_rax + _result_digest + 0*16], xmm0 + movdqa [job_rax + _result_digest + 1*16], xmm1 + movdqa [job_rax + _result_digest + 2*16], xmm2 + movdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + _XMM_SAVE + 16*0] + movdqa xmm7, [rsp + _XMM_SAVE + 16*1] + movdqa xmm8, [rsp + _XMM_SAVE + 16*2] + movdqa xmm9, [rsp + _XMM_SAVE + 16*3] + movdqa xmm10, [rsp + _XMM_SAVE + 16*4] + movdqa xmm11, [rsp + _XMM_SAVE + 16*5] + movdqa xmm12, [rsp + _XMM_SAVE + 16*6] + movdqa xmm13, [rsp + _XMM_SAVE + 16*7] + movdqa xmm14, [rsp + _XMM_SAVE + 16*8] + movdqa xmm15, [rsp + _XMM_SAVE + 16*9] + mov rsi, [rsp + _GPR_SAVE + 8*2] +%endif + mov rbx, [rsp + _GPR_SAVE + 8*0] + mov r12, [rsp + _GPR_SAVE + 8*1] + add rsp, STACK_SPACE + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +one: dq 1 +two: dq 2 +three: dq 3 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c new file mode 100644 index 000000000..da57e05d5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c @@ -0,0 +1,44 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" + +void sha512_mb_mgr_init_avx2(SHA512_MB_JOB_MGR * state) +{ + unsigned int j; + + state->lens[0] = 0; + state->lens[1] = 1; + state->lens[2] = 2; + state->lens[3] = 3; + state->unused_lanes = 0xFF03020100; + for (j = 0; j < SHA512_X4_LANES; j++) { + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c new file mode 100644 index 000000000..2ce996cf1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c @@ -0,0 +1,42 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" + +void sha512_mb_mgr_init_avx512(SHA512_MB_JOB_MGR * state) +{ + unsigned int j; + + state->unused_lanes = 0x0706050403020100; + state->num_lanes_inuse = 0; + for (j = 0; j < SHA512_MAX_LANES; j++) { + state->lens[j] = j; // sha512_mb uses low 32bit of lens to hold idx exclusively + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c new file mode 100644 index 000000000..d646d88fd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c @@ -0,0 +1,42 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" + +void sha512_mb_mgr_init_sse(SHA512_MB_JOB_MGR * state) +{ + unsigned int j; + + state->lens[0] = 0; + state->lens[1] = 1; + state->unused_lanes = 0xFF0100; + for (j = 0; j < SHA512_MIN_LANES; j++) { + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm new file mode 100644 index 000000000..d9ef88474 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm @@ -0,0 +1,258 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha512_mb_x2_avx + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + +struc stack_frame + .xmm: resb 16*10 + .gpr: resb 8*5 + .rsp: resb 8 +endstruc + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE stack_frame.gpr +%define _GPR_SAVE stack_frame.rsp +%define STACK_SPACE stack_frame_size + +; SHA512_JOB* sha512_mb_mgr_submit_avx(SHA512_MB_JOB_MGR *state, SHA512_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +global sha512_mb_mgr_submit_avx:function +sha512_mb_mgr_submit_avx: + + mov rax, rsp + + sub rsp, STACK_SPACE + and rsp, ~31 + + mov [rsp + stack_frame.rsp], rax + + mov [rsp + _XMM_SAVE + 8*0], rbx + mov [rsp + _XMM_SAVE + 8*1], rbp + mov [rsp + _XMM_SAVE + 8*2], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _XMM_SAVE + 8*3], rsi + mov [rsp + _XMM_SAVE + 8*4], rdi + vmovdqa [rsp + 16*0], xmm6 + vmovdqa [rsp + 16*1], xmm7 + vmovdqa [rsp + 16*2], xmm8 + vmovdqa [rsp + 16*3], xmm9 + vmovdqa [rsp + 16*4], xmm10 + vmovdqa [rsp + 16*5], xmm11 + vmovdqa [rsp + 16*6], xmm12 + vmovdqa [rsp + 16*7], xmm13 + vmovdqa [rsp + 16*8], xmm14 + vmovdqa [rsp + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4 + 8*lane], DWORD(len) + + + ; Load digest words from result_digest + vmovdqa xmm0, [job + _result_digest + 0*16] + vmovdqa xmm1, [job + _result_digest + 1*16] + vmovdqa xmm2, [job + _result_digest + 2*16] + vmovdqa xmm3, [job + _result_digest + 3*16] + vmovq [state + _args_digest + 8*lane + 0*32], xmm0 + vpextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1 + vmovq [state + _args_digest + 8*lane + 2*32], xmm1 + vpextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1 + vmovq [state + _args_digest + 8*lane + 4*32], xmm2 + vpextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1 + vmovq [state + _args_digest + 8*lane + 6*32], xmm3 + vpextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + cmp unused_lanes, 0xff + jne return_null + +start_loop: + + ; Find min length + mov lens0, [state + _lens + 0*8] + mov idx, lens0 + mov lens1, [state + _lens + 1*8] + cmp lens1, idx + cmovb idx, lens1 + + mov len2, idx + and idx, 0xF + and len2, ~0xFF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + shr len2, 32 + mov [state + _lens + 0*8], lens0 + mov [state + _lens + 1*8], lens1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x2_avx + ; state and idx are intact + +len_is_0: + + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + vmovq xmm0, [state + _args_digest + 8*idx + 0*32] + vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1 + vmovq xmm1, [state + _args_digest + 8*idx + 2*32] + vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1 + vmovq xmm2, [state + _args_digest + 8*idx + 4*32] + vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1 + vmovq xmm3, [state + _args_digest + 8*idx + 6*32] + vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1 + + + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + vmovdqa [job_rax + _result_digest + 2*16], xmm2 + vmovdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 16*0] + vmovdqa xmm7, [rsp + 16*1] + vmovdqa xmm8, [rsp + 16*2] + vmovdqa xmm9, [rsp + 16*3] + vmovdqa xmm10, [rsp + 16*4] + vmovdqa xmm11, [rsp + 16*5] + vmovdqa xmm12, [rsp + 16*6] + vmovdqa xmm13, [rsp + 16*7] + vmovdqa xmm14, [rsp + 16*8] + vmovdqa xmm15, [rsp + 16*9] + mov rsi, [rsp + _XMM_SAVE + 8*3] + mov rdi, [rsp + _XMM_SAVE + 8*4] +%endif + mov rbx, [rsp + _XMM_SAVE + 8*0] + mov rbp, [rsp + _XMM_SAVE + 8*1] + mov r12, [rsp + _XMM_SAVE + 8*2] + mov rsp, [rsp + stack_frame.rsp] + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +H0: dd 0x6a09e667 +H1: dd 0xbb67ae85 +H2: dd 0x3c6ef372 +H3: dd 0xa54ff53a +H4: dd 0x510e527f +H5: dd 0x9b05688c +H6: dd 0x1f83d9ab +H7: dd 0x5be0cd19 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm new file mode 100644 index 000000000..e39b8df4d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm @@ -0,0 +1,266 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha512_mb_x4_avx2 + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + +struc stack_frame + .xmm: resb 16*10 + .gpr: resb 8*5 + .rsp: resb 8 +endstruc + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE stack_frame.gpr +%define _GPR_SAVE stack_frame.rsp +%define STACK_SPACE stack_frame_size + +; SHA512_JOB* sha512_mb_mgr_submit_avx2(SHA512_MB_JOB_MGR *state, SHA512_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +global sha512_mb_mgr_submit_avx2:function +sha512_mb_mgr_submit_avx2: + + mov rax, rsp + + sub rsp, STACK_SPACE + and rsp, ~31 + + mov [rsp + stack_frame.rsp], rax + + mov [rsp + _XMM_SAVE + 8*0], rbx + mov [rsp + _XMM_SAVE + 8*1], rbp + mov [rsp + _XMM_SAVE + 8*2], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _XMM_SAVE + 8*3], rsi + mov [rsp + _XMM_SAVE + 8*4], rdi + vmovdqa [rsp + 16*0], xmm6 + vmovdqa [rsp + 16*1], xmm7 + vmovdqa [rsp + 16*2], xmm8 + vmovdqa [rsp + 16*3], xmm9 + vmovdqa [rsp + 16*4], xmm10 + vmovdqa [rsp + 16*5], xmm11 + vmovdqa [rsp + 16*6], xmm12 + vmovdqa [rsp + 16*7], xmm13 + vmovdqa [rsp + 16*8], xmm14 + vmovdqa [rsp + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4 + 8*lane], DWORD(len) + + + ; Load digest words from result_digest + vmovdqa xmm0, [job + _result_digest + 0*16] + vmovdqa xmm1, [job + _result_digest + 1*16] + vmovdqa xmm2, [job + _result_digest + 2*16] + vmovdqa xmm3, [job + _result_digest + 3*16] + vmovq [state + _args_digest + 8*lane + 0*32], xmm0 + vpextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1 + vmovq [state + _args_digest + 8*lane + 2*32], xmm1 + vpextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1 + vmovq [state + _args_digest + 8*lane + 4*32], xmm2 + vpextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1 + vmovq [state + _args_digest + 8*lane + 6*32], xmm3 + vpextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + cmp unused_lanes, 0xff + jne return_null + +start_loop: + + ; Find min length + mov lens0, [state + _lens + 0*8] + mov idx, lens0 + mov lens1, [state + _lens + 1*8] + cmp lens1, idx + cmovb idx, lens1 + mov lens2, [state + _lens + 2*8] + cmp lens2, idx + cmovb idx, lens2 + mov lens3, [state + _lens + 3*8] + cmp lens3, idx + cmovb idx, lens3 + mov len2, idx + and idx, 0xF + and len2, ~0xFF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + sub lens2, len2 + sub lens3, len2 + shr len2, 32 + mov [state + _lens + 0*8], lens0 + mov [state + _lens + 1*8], lens1 + mov [state + _lens + 2*8], lens2 + mov [state + _lens + 3*8], lens3 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x4_avx2 + ; state and idx are intact + +len_is_0: + + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + + + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + vmovq xmm0, [state + _args_digest + 8*idx + 0*32] + vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1 + vmovq xmm1, [state + _args_digest + 8*idx + 2*32] + vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1 + vmovq xmm2, [state + _args_digest + 8*idx + 4*32] + vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1 + vmovq xmm3, [state + _args_digest + 8*idx + 6*32] + vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1 + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + vmovdqa [job_rax + _result_digest + 2*16], xmm2 + vmovdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqa xmm6, [rsp + 16*0] + vmovdqa xmm7, [rsp + 16*1] + vmovdqa xmm8, [rsp + 16*2] + vmovdqa xmm9, [rsp + 16*3] + vmovdqa xmm10, [rsp + 16*4] + vmovdqa xmm11, [rsp + 16*5] + vmovdqa xmm12, [rsp + 16*6] + vmovdqa xmm13, [rsp + 16*7] + vmovdqa xmm14, [rsp + 16*8] + vmovdqa xmm15, [rsp + 16*9] + mov rsi, [rsp + _XMM_SAVE + 8*3] + mov rdi, [rsp + _XMM_SAVE + 8*4] +%endif + mov rbx, [rsp + _XMM_SAVE + 8*0] + mov rbp, [rsp + _XMM_SAVE + 8*1] + mov r12, [rsp + _XMM_SAVE + 8*2] + mov rsp, [rsp + stack_frame.rsp] + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +H0: dd 0x6a09e667 +H1: dd 0xbb67ae85 +H2: dd 0x3c6ef372 +H3: dd 0xa54ff53a +H4: dd 0x510e527f +H5: dd 0x9b05688c +H6: dd 0x1f83d9ab +H7: dd 0x5be0cd19 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm new file mode 100644 index 000000000..59f359f1f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm @@ -0,0 +1,279 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 +extern sha512_mb_x8_avx512 + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define num_lanes_inuse r9 +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + +struc stack_frame + .xmm: resb 16*10 + .gpr: resb 8*8 + .rsp: resb 8 +endstruc + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE stack_frame.gpr +%define _GPR_SAVE stack_frame.rsp +%define STACK_SPACE stack_frame_size + +; SHA512_JOB* sha512_mb_mgr_submit_avx512(SHA512_MB_JOB_MGR *state, SHA512_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +global sha512_mb_mgr_submit_avx512:function +sha512_mb_mgr_submit_avx512: + + mov rax, rsp + + sub rsp, STACK_SPACE + + mov [rsp + stack_frame.rsp], rax + + mov [rsp + _XMM_SAVE + 8*0], rbx + mov [rsp + _XMM_SAVE + 8*1], rbp + mov [rsp + _XMM_SAVE + 8*2], r12 + mov [rsp + _XMM_SAVE + 8*5], r13 + mov [rsp + _XMM_SAVE + 8*6], r14 + mov [rsp + _XMM_SAVE + 8*7], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _XMM_SAVE + 8*3], rsi + mov [rsp + _XMM_SAVE + 8*4], rdi + vmovdqu [rsp + 16*0], xmm6 + vmovdqu [rsp + 16*1], xmm7 + vmovdqu [rsp + 16*2], xmm8 + vmovdqu [rsp + 16*3], xmm9 + vmovdqu [rsp + 16*4], xmm10 + vmovdqu [rsp + 16*5], xmm11 + vmovdqu [rsp + 16*6], xmm12 + vmovdqu [rsp + 16*7], xmm13 + vmovdqu [rsp + 16*8], xmm14 + vmovdqu [rsp + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4 + 8*lane], DWORD(len) + + + ; Load digest words from result_digest + vmovdqa xmm0, [job + _result_digest + 0*16] + vmovdqa xmm1, [job + _result_digest + 1*16] + vmovdqa xmm2, [job + _result_digest + 2*16] + vmovdqa xmm3, [job + _result_digest + 3*16] + vmovq [state + _args_digest + 8*lane + 0*64], xmm0 + vpextrq [state + _args_digest + 8*lane + 1*64], xmm0, 1 + vmovq [state + _args_digest + 8*lane + 2*64], xmm1 + vpextrq [state + _args_digest + 8*lane + 3*64], xmm1, 1 + vmovq [state + _args_digest + 8*lane + 4*64], xmm2 + vpextrq [state + _args_digest + 8*lane + 5*64], xmm2, 1 + vmovq [state + _args_digest + 8*lane + 6*64], xmm3 + vpextrq [state + _args_digest + 8*lane + 7*64], xmm3, 1 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + add num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + cmp num_lanes_inuse, 8 + jne return_null + +start_loop: + ; Find min length, len in sha512_mgr is 64bit, high 32bit is block num, low 8bit is idx + vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a} + vmovdqu ymm1, [state + _lens + 1*32] + + vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i} + vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i} + vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i} + vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i} + vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword + + vmovq idx, xmm2 + mov len2, idx + and idx, 0xF + shr len2, 32 + jz len_is_0 + + + vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i} + vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0} + vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0} + + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm2 + + vmovdqu [state + _lens + 0*32], ymm0 + vmovdqu [state + _lens + 1*32], ymm1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x8_avx512 + ; state and idx are intact + +len_is_0: + + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + + + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse] + sub num_lanes_inuse, 1 + mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse) + vmovq xmm0, [state + _args_digest + 8*idx + 0*64] + vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1 + vmovq xmm1, [state + _args_digest + 8*idx + 2*64] + vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1 + vmovq xmm2, [state + _args_digest + 8*idx + 4*64] + vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1 + vmovq xmm3, [state + _args_digest + 8*idx + 6*64] + vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1 + vmovdqa [job_rax + _result_digest + 0*16], xmm0 + vmovdqa [job_rax + _result_digest + 1*16], xmm1 + vmovdqa [job_rax + _result_digest + 2*16], xmm2 + vmovdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + 16*0] + vmovdqu xmm7, [rsp + 16*1] + vmovdqu xmm8, [rsp + 16*2] + vmovdqu xmm9, [rsp + 16*3] + vmovdqu xmm10, [rsp + 16*4] + vmovdqu xmm11, [rsp + 16*5] + vmovdqu xmm12, [rsp + 16*6] + vmovdqu xmm13, [rsp + 16*7] + vmovdqu xmm14, [rsp + 16*8] + vmovdqu xmm15, [rsp + 16*9] + mov rsi, [rsp + _XMM_SAVE + 8*3] + mov rdi, [rsp + _XMM_SAVE + 8*4] +%endif + mov rbx, [rsp + _XMM_SAVE + 8*0] + mov rbp, [rsp + _XMM_SAVE + 8*1] + mov r12, [rsp + _XMM_SAVE + 8*2] + mov r13, [rsp + _XMM_SAVE + 8*5] + mov r14, [rsp + _XMM_SAVE + 8*6] + mov r15, [rsp + _XMM_SAVE + 8*7] + + mov rsp, [rsp + stack_frame.rsp] + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=32 + +align 32 +clear_low_nibble: ; mgr len element 0xnnnnnnnn 0000000m, nnnnnnnn is blocknum, m is index + dq 0xFFFFFFFF00000000, 0x0000000000000000 + dq 0xFFFFFFFF00000000, 0x0000000000000000 + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha512_mb_mgr_submit_avx512 +no_sha512_mb_mgr_submit_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm new file mode 100644 index 000000000..8b630a4da --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm @@ -0,0 +1,256 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_job.asm" +%include "sha512_mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +extern sha512_mb_x2_sse + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux register definitions +%define arg1 rdi ; rcx +%define arg2 rsi ; rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define idx rdx ; rsi +%define last_len rdx ; rsi + +%define size_offset rcx ; rdi +%define tmp2 rcx ; rdi + +%else +; WINDOWS register definitions +%define arg1 rcx +%define arg2 rdx + +; idx needs to be other than arg1, arg2, rbx, r12 +%define last_len rsi +%define idx rsi + +%define size_offset rdi +%define tmp2 rdi + +%endif + +; Common definitions +%define state arg1 +%define job arg2 +%define len2 arg2 +%define p2 arg2 + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx + +%define job_rax rax +%define len rax + +%define lane rbp +%define tmp3 rbp +%define lens3 rbp + +%define extra_blocks r8 +%define lens0 r8 + +%define tmp r9 +%define lens1 r9 + +%define lane_data r10 +%define lens2 r10 + +struc stack_frame + .xmm: resb 16*10 + .gpr: resb 8*5 + .rsp: resb 8 +endstruc + +; STACK_SPACE needs to be an odd multiple of 8 +%define _XMM_SAVE stack_frame.gpr +%define _GPR_SAVE stack_frame.rsp +%define STACK_SPACE stack_frame_size + +; SHA512_JOB* sha512_mb_mgr_submit_sse(SHA512_MB_JOB_MGR *state, SHA256_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +global sha512_mb_mgr_submit_sse:function +sha512_mb_mgr_submit_sse: + + mov rax, rsp + + sub rsp, STACK_SPACE + and rsp, ~31 + + mov [rsp + stack_frame.rsp], rax + + mov [rsp + _XMM_SAVE + 8*0], rbx + mov [rsp + _XMM_SAVE + 8*1], rbp + mov [rsp + _XMM_SAVE + 8*2], r12 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _XMM_SAVE + 8*3], rsi + mov [rsp + _XMM_SAVE + 8*4], rdi + movdqa [rsp + 16*0], xmm6 + movdqa [rsp + 16*1], xmm7 + movdqa [rsp + 16*2], xmm8 + movdqa [rsp + 16*3], xmm9 + movdqa [rsp + 16*4], xmm10 + movdqa [rsp + 16*5], xmm11 + movdqa [rsp + 16*6], xmm12 + movdqa [rsp + 16*7], xmm13 + movdqa [rsp + 16*8], xmm14 + movdqa [rsp + 16*9], xmm15 +%endif + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _LANE_DATA_size + mov dword [job + _status], STS_BEING_PROCESSED + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov DWORD(len), [job + _len] + + mov [lane_data + _job_in_lane], job + mov [state + _lens + 4 + 8*lane], DWORD(len) + + ; Load digest words from result_digest + movdqa xmm0, [job + _result_digest + 0*16] + movdqa xmm1, [job + _result_digest + 1*16] + movdqa xmm2, [job + _result_digest + 2*16] + movdqa xmm3, [job + _result_digest + 3*16] + movq [state + _args_digest + 8*lane + 0*32], xmm0 + pextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1 + movq [state + _args_digest + 8*lane + 2*32], xmm1 + pextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1 + movq [state + _args_digest + 8*lane + 4*32], xmm2 + pextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1 + movq [state + _args_digest + 8*lane + 6*32], xmm3 + pextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1 + + mov p, [job + _buffer] + mov [state + _args_data_ptr + 8*lane], p + + cmp unused_lanes, 0xff + jne return_null + +start_loop: + + ; Find min length + mov lens0, [state + _lens + 0*8] + mov idx, lens0 + mov lens1, [state + _lens + 1*8] + cmp lens1, idx + cmovb idx, lens1 + + mov len2, idx + and idx, 0xF + and len2, ~0xFF + jz len_is_0 + + sub lens0, len2 + sub lens1, len2 + shr len2, 32 + mov [state + _lens + 0*8], lens0 + mov [state + _lens + 1*8], lens1 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_mb_x2_sse + ; state and idx are intact + +len_is_0: + + ; process completed job "idx" + imul lane_data, idx, _LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + + mov job_rax, [lane_data + _job_in_lane] + + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + mov dword [job_rax + _status], STS_COMPLETED + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + movq xmm0, [state + _args_digest + 8*idx + 0*32] + pinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1 + movq xmm1, [state + _args_digest + 8*idx + 2*32] + pinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1 + movq xmm2, [state + _args_digest + 8*idx + 4*32] + pinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1 + movq xmm3, [state + _args_digest + 8*idx + 6*32] + pinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1 + + movdqa [job_rax + _result_digest + 0*16], xmm0 + movdqa [job_rax + _result_digest + 1*16], xmm1 + movdqa [job_rax + _result_digest + 2*16], xmm2 + movdqa [job_rax + _result_digest + 3*16], xmm3 + +return: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + 16*0] + movdqa xmm7, [rsp + 16*1] + movdqa xmm8, [rsp + 16*2] + movdqa xmm9, [rsp + 16*3] + movdqa xmm10, [rsp + 16*4] + movdqa xmm11, [rsp + 16*5] + movdqa xmm12, [rsp + 16*6] + movdqa xmm13, [rsp + 16*7] + movdqa xmm14, [rsp + 16*8] + movdqa xmm15, [rsp + 16*9] + mov rsi, [rsp + _XMM_SAVE + 8*3] + mov rdi, [rsp + _XMM_SAVE + 8*4] +%endif + mov rbx, [rsp + _XMM_SAVE + 8*0] + mov rbp, [rsp + _XMM_SAVE + 8*1] + mov r12, [rsp + _XMM_SAVE + 8*2] + mov rsp, [rsp + stack_frame.rsp] + + ret + +return_null: + xor job_rax, job_rax + jmp return + +section .data align=16 + +align 16 +H0: dd 0x6a09e667 +H1: dd 0xbb67ae85 +H2: dd 0x3c6ef372 +H3: dd 0xa54ff53a +H4: dd 0x510e527f +H5: dd 0x9b05688c +H6: dd 0x1f83d9ab +H7: dd 0x5be0cd19 + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c new file mode 100644 index 000000000..edb57bc33 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c @@ -0,0 +1,171 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "sha512_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 200 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][8 * SHA512_DIGEST_NWORDS]; + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +uint64_t byteswap64(uint64_t x) +{ +#if defined (__ICC) + return _bswap64(x); +#elif defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + return __builtin_bswap64(x); +#else + return (((x & (0xffull << 0)) << 56) + | ((x & (0xffull << 8)) << 40) + | ((x & (0xffull << 16)) << 24) + | ((x & (0xffull << 24)) << 8) + | ((x & (0xffull << 32)) >> 8) + | ((x & (0xffull << 40)) >> 24) + | ((x & (0xffull << 48)) >> 40) + | ((x & (0xffull << 56)) >> 56)); +#endif +} + +int main(void) +{ + SHA512_HASH_CTX_MGR *mgr = NULL; + SHA512_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, fail = 0; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + + printf("multibinary_sha512 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + srand(TEST_SEED); + + posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)); + sha512_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // SSL test + SHA512(bufs[i], TEST_LEN, digest_ssl[i]); + + // sb_sha512 test + sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sha512_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + byteswap64(((uint64_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %016lX <=> %016lX\n", + i, j, ctxpool[i].job.result_digest[j], + byteswap64(((uint64_t *) digest_ssl[i])[j])); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sha512_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Ramdom buffer with ramdom len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run SSL test + SHA512(bufs[i], lens[i], digest_ssl[i]); + + // Run sb_sha512 test + sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha512_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + byteswap64(((uint64_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %016lX <=> %016lX\n", + i, j, ctxpool[i].job.result_digest[j], + byteswap64(((uint64_t *) digest_ssl[i])[j])); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha512_ssl rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c new file mode 100644 index 000000000..a1b805737 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c @@ -0,0 +1,197 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sha512_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static uint64_t digest_ref[TEST_BUFS][SHA512_DIGEST_NWORDS]; + +// Compare against reference function +extern void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len); + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA512_HASH_CTX_MGR *mgr = NULL; + SHA512_HASH_CTX ctxpool[TEST_BUFS]; + uint32_t i, j, fail = 0; + unsigned char *bufs[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + uint8_t *tmp_buf; + + printf("multibinary_sha512 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)); + sha512_ctx_mgr_init(mgr); + + srand(TEST_SEED); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contexts + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sha512_ref(bufs[i], digest_ref[i], TEST_LEN); + + // Run sb_sha512 test + sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (sha512_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d " + "fail 0x%016lX <=> 0x%016lX \n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + sha512_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Use buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run reference test + sha512_ref(bufs[i], digest_ref[i], lens[i]); + + // Run sha512_mb test + sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha512_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail " + "0x%016lX <=> 0x%016lX\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + // Test at the end of buffer + jobs = rand() % TEST_BUFS; + tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs); + if (!tmp_buf) { + printf("malloc failed, end test aborted.\n"); + return 1; + } + + rand_buffer(tmp_buf, jobs); + + sha512_ctx_mgr_init(mgr); + + // Extend to the end of allocated buffer to construct jobs + for (i = 0; i < jobs; i++) { + bufs[i] = (uint8_t *) & tmp_buf[i]; + lens[i] = jobs - i; + + // Reference test + sha512_ref(bufs[i], digest_ref[i], lens[i]); + + // sb_sha512 test + sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (sha512_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("End test failed at offset %d - result: 0x%016lX" + ", ref: 0x%016lX\n", i, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + putchar('.'); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha512 rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c new file mode 100644 index 000000000..a05168b70 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c @@ -0,0 +1,294 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include "sha512_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define UPDATE_SIZE 13*SHA512_BLOCK_SIZE +#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA512_BLOCK_SIZE)) + +#ifdef DEBUG +# define debug_char(x) putchar(x) +#else +# define debug_char(x) do {} while (0) +#endif + +/* Reference digest global to reduce stack usage */ +static uint64_t digest_ref[TEST_BUFS][SHA512_DIGEST_NWORDS]; + +extern void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len); + +// Generates pseudo-random data + +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + SHA512_HASH_CTX_MGR *mgr = NULL; + SHA512_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, fail = 0; + int len_done, len_rem, len_rand; + unsigned char *bufs[TEST_BUFS]; + unsigned char *buf_ptr[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int joblen, jobs, t; + + printf("multibinary_sha512_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + srand(TEST_SEED); + + posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)); + sha512_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocte and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + buf_ptr[i] = bufs[i]; + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + sha512_ref(bufs[i], digest_ref[i], TEST_LEN); + } + + // Run sb_sha512 tests + for (i = 0; i < TEST_BUFS;) { + len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_done == 0) + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_FIRST); + else if (len_rem <= UPDATE_SIZE) + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + // Add jobs while available or finished + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + } + + // Start flushing finished jobs, end on last flushed + ctx = sha512_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sha512_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + + len_done = (int)((unsigned long)buf_ptr[i] + - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_rem <= UPDATE_SIZE) + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + if (ctx == NULL) + ctx = sha512_ctx_mgr_flush(mgr); + } + + // Check digests + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d fail %8lX <=> %8lX", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + for (i = 0; i < jobs; i++) { + joblen = rand() % (TEST_LEN); + rand_buffer(bufs[i], joblen); + lens[i] = joblen; + buf_ptr[i] = bufs[i]; + sha512_ref(bufs[i], digest_ref[i], lens[i]); + } + + sha512_ctx_mgr_init(mgr); + + // Run sha512_sb jobs + i = 0; + while (i < jobs) { + // Submit a new job + len_rand = SHA512_BLOCK_SIZE + + SHA512_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS); + + if (lens[i] > len_rand) + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_FIRST); + else + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], lens[i], HASH_ENTIRE); + + // Returned ctx could be: + // - null context (we are just getting started and lanes aren't full yet), or + // - finished already (an ENTIRE we submitted or a previous LAST is returned), or + // - an unfinished ctx, we will resubmit + + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } else { + // unfinished ctx returned, choose another random update length and submit either + // UPDATE or LAST depending on the amount of buffer remaining + while ((ctx != NULL) && !(hash_ctx_complete(ctx))) { + j = (unsigned long)(ctx->user_data); // Get index of the returned ctx + buf_ptr[j] = bufs[j] + ctx->total_length; + len_rand = (rand() % SHA512_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + len_rem = lens[j] - ctx->total_length; + + if (len_rem <= len_rand) // submit the rest of the job as LAST + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rem, + HASH_LAST); + else // submit the random update length as UPDATE + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rand, + HASH_UPDATE); + } // Either continue submitting any contexts returned here as UPDATE/LAST, or + // go back to submitting new jobs using the index i. + + i++; + } + } + + // Start flushing finished jobs, end on last flushed + ctx = sha512_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = sha512_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer + len_rem = lens[i] - ctx->total_length; + len_rand = (rand() % SHA512_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + debug_char('+'); + if (len_rem <= len_rand) + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_UPDATE); + + if (ctx == NULL) + ctx = sha512_ctx_mgr_flush(mgr); + } + + // Check result digest + for (i = 0; i < jobs; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail %8lX <=> %8lX\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_sha512_update rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c new file mode 100644 index 000000000..747de43bb --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c @@ -0,0 +1,264 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "sha512_mb.h" + +typedef uint64_t DigestSHA512[SHA512_DIGEST_NWORDS]; + +#define MSGS 8 +#define NUM_JOBS 1000 + +#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS + +static uint8_t msg1[] = "The quick brown fox jumps over the lazy dog"; +static uint8_t msg2[] = "The quick brown fox jumps over the lazy dog."; +static uint8_t msg3[] = { 0x0a, 0x55, 0xdb, 0 }; +static uint8_t msg4[] = { 0xba, 0xd7, 0xc6, 0x18, 0xf4, 0x5b, 0xe2, 0x07, 0x97, 0x5e, 0 }; + +static uint8_t msg5[] = { + 0xb1, 0x71, 0x5f, 0x78, 0x2f, 0xf0, 0x2c, 0x6b, 0x88, 0x93, + 0x7f, 0x05, 0x41, 0x16, 0 +}; + +static uint8_t msg6[] = { + 0xc6, 0xa1, 0x70, 0x93, 0x65, 0x68, 0x65, 0x10, 0x20, 0xed, + 0xfe, 0x15, 0xdf, 0x80, 0x12, 0xac, 0xda, 0x8d, 0 +}; + +static uint8_t msg7[] = { + 0xa8, 0xa3, 0x7d, 0xfc, 0x08, 0x3a, 0xd2, 0xf4, 0x7f, 0xff, + 0x46, 0x87, 0x38, 0xbf, 0x8b, 0x72, 0x8e, 0xb7, 0xf1, 0x90, + 0x7e, 0x42, 0x7f, 0xa1, 0x5c, 0xb4, 0x42, 0x4b, 0xc6, 0x85, + 0xe5, 0x5e, 0xd7, 0xb2, 0x82, 0x5c, 0x9c, 0x60, 0xb8, 0x39, + 0xcc, 0xc2, 0xfe, 0x5f, 0xb3, 0x3e, 0x36, 0xf5, 0x70, 0xcb, + 0x86, 0x61, 0x60, 0x9e, 0x63, 0x0b, 0xda, 0x05, 0xee, 0x64, + 0x1d, 0x93, 0x84, 0x28, 0x86, 0x7d, 0x90, 0xe0, 0x07, 0x44, + 0xa4, 0xaa, 0xd4, 0x94, 0xc9, 0x3c, 0x5f, 0x6d, 0x13, 0x27, + 0x87, 0x80, 0x78, 0x59, 0x0c, 0xdc, 0xe1, 0xe6, 0x47, 0xc9, + 0x82, 0x08, 0x18, 0xf4, 0x67, 0x64, 0x1f, 0xcd, 0x50, 0x8e, + 0x2f, 0x2e, 0xbf, 0xd0, 0xff, 0x3d, 0x4f, 0x27, 0x23, 0x93, + 0x47, 0x8f, 0x3b, 0x9e, 0x6f, 0x80, 0x6b, 0x43, 0 +}; + +static uint8_t msg8[] = ""; + +static DigestSHA512 expResultDigest1 = { + 0x07e547d9586f6a73, 0xf73fbac0435ed769, 0x51218fb7d0c8d788, 0xa309d785436bbb64, + 0x2e93a252a954f239, 0x12547d1e8a3b5ed6, 0xe1bfd7097821233f, 0xa0538f3db854fee6 +}; + +static DigestSHA512 expResultDigest2 = { + 0x91ea1245f20d46ae, 0x9a037a989f54f1f7, 0x90f0a47607eeb8a1, 0x4d12890cea77a1bb, + 0xc6c7ed9cf205e67b, 0x7f2b8fd4c7dfd3a7, 0xa8617e45f3c463d4, 0x81c7e586c39ac1ed +}; + +static DigestSHA512 expResultDigest3 = { + 0x7952585e5330cb24, 0x7d72bae696fc8a6b, 0x0f7d0804577e347d, 0x99bc1b11e52f3849, + 0x85a428449382306a, 0x89261ae143c2f3fb, 0x613804ab20b42dc0, 0x97e5bf4a96ef919b +}; + +static DigestSHA512 expResultDigest4 = { + 0x5886828959d1f822, 0x54068be0bd14b6a8, 0x8f59f534061fb203, 0x76a0541052dd3635, + 0xedf3c6f0ca3d0877, 0x5e13525df9333a21, 0x13c0b2af76515887, 0x529910b6c793c8a5 +}; + +static DigestSHA512 expResultDigest5 = { + 0xee1a56ee78182ec4, 0x1d2c3ab33d4c4187, 0x1d437c5c1ca060ee, 0x9e219cb83689b4e5, + 0xa4174dfdab5d1d10, 0x96a31a7c8d3abda7, 0x5c1b5e6da97e1814, 0x901c505b0bc07f25 +}; + +static DigestSHA512 expResultDigest6 = { + 0xc36c100cdb6c8c45, 0xb072f18256d63a66, 0xc9843acb4d07de62, 0xe0600711d4fbe64c, + 0x8cf314ec3457c903, 0x08147cb7ac7e4d07, 0x3ba10f0ced78ea72, 0x4a474b32dae71231 +}; + +static DigestSHA512 expResultDigest7 = { + 0x8e1c91729be8eb40, 0x226f6c58a029380e, 0xf7edb9dc166a5c3c, 0xdbcefe90bd30d85c, + 0xb7c4b248e66abf0a, 0x3a4c842281299bef, 0x6db88858d9e5ab52, 0x44f70b7969e1c072 +}; + +static DigestSHA512 expResultDigest8 = { + 0Xcf83e1357eefb8bd, 0Xf1542850d66d8007, 0Xd620e4050b5715dc, 0X83f4a921d36ce9ce, + 0X47d0d13c5d85f2b0, 0Xff8318d2877eec2f, 0X63b931bd47417a81, 0Xa538327af927da3e +}; + +static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8 }; + +static uint64_t *expResultDigest[MSGS] = { expResultDigest1, expResultDigest2, + expResultDigest3, expResultDigest4, expResultDigest5, expResultDigest6, + expResultDigest7, expResultDigest8 +}; + +int main(void) +{ + SHA512_HASH_CTX_MGR *mgr = NULL; + SHA512_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL; + uint32_t i, j, k, t, checked = 0; + uint64_t *good; + + posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)); + sha512_ctx_mgr_init(mgr); + + // Init contexts before first use + for (i = 0; i < MSGS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + for (i = 0; i < MSGS; i++) { + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %016lX, " + "should be %016lX\n", t, j, + ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. Error code: %d", ctx->error); + return -1; + } + } + } + + while (1) { + ctx = sha512_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %016lX, " + "should be %016lX\n", t, j, + ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the " + "submit. Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + // do larger test in pseudo-random order + + // Init contexts before first use + for (i = 0; i < NUM_JOBS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + checked = 0; + for (i = 0; i < NUM_JOBS; i++) { + j = PSEUDO_RANDOM_NUM(i); + + ctx = sha512_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %016lX, " + "should be %016lX\n", t, j, + ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. Error code: %d", ctx->error); + return -1; + } + + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + } + } + while (1) { + ctx = sha512_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %016lX, " + "should be %016lX\n", t, j, + ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + if (checked != NUM_JOBS) { + printf("only tested %d rather than %d\n", checked, NUM_JOBS); + return -1; + } + + printf(" multibinary_sha512 test: Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c new file mode 100644 index 000000000..8af563068 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c @@ -0,0 +1,143 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "sha512_mb.h" +#include "test.h" + +// Set number of outstanding jobs +#define TEST_BUFS 32 + +#ifdef CACHED_TEST +// Loop many times over same data +# define TEST_LEN 4*1024 +# define TEST_LOOPS 1000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 10 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][8 * SHA512_DIGEST_NWORDS]; + +inline uint64_t byteswap64(uint64_t x) +{ +#if defined (__ICC) + return _bswap64(x); +#elif defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + return __builtin_bswap64(x); +#else + return (((x & (0xffull << 0)) << 56) + | ((x & (0xffull << 8)) << 40) + | ((x & (0xffull << 16)) << 24) + | ((x & (0xffull << 24)) << 8) + | ((x & (0xffull << 32)) >> 8) + | ((x & (0xffull << 40)) >> 24) + | ((x & (0xffull << 48)) >> 40) + | ((x & (0xffull << 56)) >> 56)); +#endif +} + +int main(void) +{ + SHA512_HASH_CTX_MGR *mgr = NULL; + SHA512_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t) TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)); + sha512_ctx_mgr_init(mgr); + + // Start OpenSSL tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + SHA512(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("sha512_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + sha512_ctx_mgr_submit(mgr, + &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + + while (sha512_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_sha512" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < SHA512_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + byteswap64(((uint64_t *) digest_ssl[i])[j])) { + fail++; + printf("Test%d, digest%d fail %016lX <=> %016lX\n", + i, j, ctxpool[i].job.result_digest[j], + byteswap64(((uint64_t *) digest_ssl[i])[j])); + } + } + } + + printf("Multi-buffer sha512 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf("multibinary_sha512_ossl_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm new file mode 100644 index 000000000..d1167dd49 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm @@ -0,0 +1,438 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" +default rel + +;; code to compute SHA512 by-2 using AVX +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 +;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 +;; +;; Linux clobbers: rax rsi r8 r9 r10 r11 +;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 +;; +;; clobbers xmm0-15 + +%define SHA512_DIGEST_WORD_SIZE 8 +%define NUM_SHA512_DIGEST_WORDS 8 +%define SHA512_DIGEST_ROW_SIZE 8*4 +%define PTR_SZ 8 +%define _data_ptr_sha512 _data_ptr + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux definitions +%define arg1 rdi +%define arg2 rsi +%else +; Windows definitions +%define arg1 rcx +%define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND r8 +%define TBL r11 + +%define inp0 r9 +%define inp1 r10 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + +%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 80*SZ2 + +; Define stack usage + +struc STACK +_DATA: resb SZ2 * 16 +_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS + resb 8 ; for alignment, must be odd multiple of 8 +endstruc + +%define VMOVPD vmovupd + +; transpose r0, r1, t0 +; Input looks like {r0 r1} +; r0 = {a1 a0} +; r1 = {b1 b0} +; +; output looks like +; r0 = {b0, a0} +; t0 = {b1, a1} + +%macro TRANSPOSE 3 +%define %%r0 %1 +%define %%r1 %2 +%define %%t0 %3 + vshufpd %%t0, %%r0, %%r1, 11b ; t0 = b1 a1 + vshufpd %%r0, %%r0, %%r1, 00b ; r0 = b0 a0 +%endm + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORQ reg, imm, tmp +; packed-rotate-right-double +; does a rotate by doing two shifts and an or +%macro PRORQ 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsllq %%tmp, %%reg, (64-(%%imm)) + vpsrlq %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORQ_nd reg, imm, tmp, src +%macro PRORQ_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsllq %%tmp, %%src, (64-(%%imm)) + vpsrlq %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORQ dst/src, amt +%macro PRORQ 2 + PRORQ %1, %2, TMP +%endmacro + +; PRORQ_nd dst, src, amt +%macro PRORQ_nd 3 + PRORQ_nd %1, %3, TMP, %2 +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41) + vmovdqa [SZ2*(%%i&0xf) + rsp + _DATA],%%T1 + vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) + vpaddq h, h, a2 ; h = h + ch + PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6) + vpaddq h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + vmovdqa %%T1, a ; maj: T1 = a + PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39) + vpxor %%T1, %%T1, c ; maj: T1 = a^c + add ROUND, SZ2 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddq h, h, a0 + + vpaddq d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddq h, h, a1 ; h = h + ch + W + K + maj + vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0 + ROTATE_ARGS +%endm + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + vmovdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp + _DATA] + vmovdqa a1, [SZ2*((%%i-2)&0xf) + rsp + _DATA] + vmovdqa a0, %%T1 + PRORQ %%T1, 8-1 + vmovdqa a2, a1 + PRORQ a1, 61-19 + vpxor %%T1, %%T1, a0 + PRORQ %%T1, 1 + vpxor a1, a1, a2 + PRORQ a1, 19 + vpsrlq a0, a0, 7 + vpxor %%T1, %%T1, a0 + vpsrlq a2, a2, 6 + vpxor a1, a1, a2 + vpaddq %%T1, %%T1, [SZ2*((%%i-16)&0xf) + rsp + _DATA] + vpaddq a1, a1, [SZ2*((%%i-7)&0xf) + rsp + _DATA] + vpaddq %%T1, %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + +;; void sha512_mb_x2_avx(SHA512_MB_ARGS_X4 *args, uint64_t msg_size_in_blocks) +;; arg 1 : STATE : pointer args (only 2 of the 4 lanes used) +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +;; +global sha512_mb_x2_avx:function internal +align 32 +sha512_mb_x2_avx: + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + + sub rsp, STACK_size + + ;; Load the pre-transposed incoming digest. + vmovdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE] + vmovdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE] + vmovdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE] + vmovdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE] + vmovdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE] + vmovdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE] + vmovdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE] + vmovdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE] + + lea TBL,[K512_2_MB] + + ;; load the address of each of the 2 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + + ;; save old digest + vmovdqa [rsp + _DIGEST + 0*SZ2], a + vmovdqa [rsp + _DIGEST + 1*SZ2], b + vmovdqa [rsp + _DIGEST + 2*SZ2], c + vmovdqa [rsp + _DIGEST + 3*SZ2], d + vmovdqa [rsp + _DIGEST + 4*SZ2], e + vmovdqa [rsp + _DIGEST + 5*SZ2], f + vmovdqa [rsp + _DIGEST + 6*SZ2], g + vmovdqa [rsp + _DIGEST + 7*SZ2], h + +%assign i 0 +%rep 8 + ;; load up the shuffler for little-endian to big-endian format + vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + VMOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits + VMOVPD TT2,[inp1+IDX+i*16] + + TRANSPOSE TT0, TT2, TT1 + vpshufb TT0, TT0, TMP + vpshufb TT1, TT1, TMP + + ROUND_00_15 TT0,(i*2+0) + ROUND_00_15 TT1,(i*2+1) +%assign i (i+1) +%endrep + +;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes) + add IDX, 8 * 16 + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + vpaddq a, a, [rsp + _DIGEST + 0*SZ2] + vpaddq b, b, [rsp + _DIGEST + 1*SZ2] + vpaddq c, c, [rsp + _DIGEST + 2*SZ2] + vpaddq d, d, [rsp + _DIGEST + 3*SZ2] + vpaddq e, e, [rsp + _DIGEST + 4*SZ2] + vpaddq f, f, [rsp + _DIGEST + 5*SZ2] + vpaddq g, g, [rsp + _DIGEST + 6*SZ2] + vpaddq h, h, [rsp + _DIGEST + 7*SZ2] + + sub INP_SIZE, 1 ;; consumed one message block + jne lloop + + ; write back to memory (state object) the transposed digest + vmovdqa [STATE+0*SHA512_DIGEST_ROW_SIZE],a + vmovdqa [STATE+1*SHA512_DIGEST_ROW_SIZE],b + vmovdqa [STATE+2*SHA512_DIGEST_ROW_SIZE],c + vmovdqa [STATE+3*SHA512_DIGEST_ROW_SIZE],d + vmovdqa [STATE+4*SHA512_DIGEST_ROW_SIZE],e + vmovdqa [STATE+5*SHA512_DIGEST_ROW_SIZE],f + vmovdqa [STATE+6*SHA512_DIGEST_ROW_SIZE],g + vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h + + ; update input pointers + add inp0, IDX + mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, STACK_size + + ; outer calling routine restores XMM and other GP registers + ret + +section .data +K512_2_MB: + dq 0x428a2f98d728ae22, 0x428a2f98d728ae22 + dq 0x7137449123ef65cd, 0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f + dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc + dq 0x3956c25bf348b538, 0x3956c25bf348b538 + dq 0x59f111f1b605d019, 0x59f111f1b605d019 + dq 0x923f82a4af194f9b, 0x923f82a4af194f9b + dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242, 0xd807aa98a3030242 + dq 0x12835b0145706fbe, 0x12835b0145706fbe + dq 0x243185be4ee4b28c, 0x243185be4ee4b28c + dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f, 0x72be5d74f27b896f + dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235, 0x9bdc06a725c71235 + dq 0xc19bf174cf692694, 0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 + dq 0xefbe4786384f25e3, 0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 + dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275, 0x2de92c6f592b0275 + dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 + dq 0x76f988da831153b5, 0x76f988da831153b5 + dq 0x983e5152ee66dfab, 0x983e5152ee66dfab + dq 0xa831c66d2db43210, 0xa831c66d2db43210 + dq 0xb00327c898fb213f, 0xb00327c898fb213f + dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 + dq 0xd5a79147930aa725, 0xd5a79147930aa725 + dq 0x06ca6351e003826f, 0x06ca6351e003826f + dq 0x142929670a0e6e70, 0x142929670a0e6e70 + dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc + dq 0x2e1b21385c26c926, 0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed + dq 0x53380d139d95b3df, 0x53380d139d95b3df + dq 0x650a73548baf63de, 0x650a73548baf63de + dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 + dq 0x92722c851482353b, 0x92722c851482353b + dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 + dq 0xa81a664bbc423001, 0xa81a664bbc423001 + dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 + dq 0xc76c51a30654be30, 0xc76c51a30654be30 + dq 0xd192e819d6ef5218, 0xd192e819d6ef5218 + dq 0xd69906245565a910, 0xd69906245565a910 + dq 0xf40e35855771202a, 0xf40e35855771202a + dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 + dq 0x1e376c085141ab53, 0x1e376c085141ab53 + dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 + dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 + dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 + dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc + dq 0x78a5636f43172f60, 0x78a5636f43172f60 + dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 + dq 0x8cc702081a6439ec, 0x8cc702081a6439ec + dq 0x90befffa23631e28, 0x90befffa23631e28 + dq 0xa4506cebde82bde9, 0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 + dq 0xc67178f2e372532b, 0xc67178f2e372532b + dq 0xca273eceea26619c, 0xca273eceea26619c + dq 0xd186b8c721c0c207, 0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e + dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba, 0x06f067aa72176fba + dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae, 0x113f9804bef90dae + dq 0x1b710b35131c471b, 0x1b710b35131c471b + dq 0x28db77f523047d84, 0x28db77f523047d84 + dq 0x32caab7b40c72493, 0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc + dq 0x431d67c49c100d4c, 0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 + dq 0x597f299cfc657e2a, 0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec + dq 0x6c44198c4a475817, 0x6c44198c4a475817 + + +align 32 +; one from sha512_rorx +; this does the big endian to little endian conversion +; over a quad word +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f + dq 0x1011121314151617, 0x18191a1b1c1d1e1f diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm new file mode 100644 index 000000000..f492021ae --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm @@ -0,0 +1,420 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" +default rel + +;; code to compute SHA512 by-2 using SSE +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 +;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 +;; +;; Linux clobbers: rax rsi r8 r9 r10 r11 +;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 +;; +;; clobbers xmm0-15 + +%define SHA512_DIGEST_WORD_SIZE 8 +%define NUM_SHA512_DIGEST_WORDS 8 +%define SHA512_DIGEST_ROW_SIZE 8*4 +%define PTR_SZ 8 +%define _data_ptr_sha512 _data_ptr + + +%ifidn __OUTPUT_FORMAT__, elf64 +; Linux definitions + %define arg1 rdi + %define arg2 rsi +%else +; Windows definitions + %define arg1 rcx + %define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND r8 +%define TBL r11 + +%define inp0 r9 +%define inp1 r10 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + +%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 80*SZ2 + +; Define stack usage + +struc STACK +_DATA: resb SZ2 * 16 +_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS + resb 8 ; for alignment, must be odd multiple of 8 +endstruc + +%define MOVPD movupd + +; transpose r0, r1, t0 +; Input looks like {r0 r1} +; r0 = {a1 a0} +; r1 = {b1 b0} +; +; output looks like +; r0 = {b0, a0} +; t0 = {b1, a1} + +%macro TRANSPOSE 3 +%define %%r0 %1 +%define %%r1 %2 +%define %%t0 %3 + movapd %%t0, %%r0 ; t0 = a1 a0 + shufpd %%r0, %%r1, 00b ; r0 = b0 a0 + shufpd %%t0, %%r1, 11b ; t0 = b1 a1 +%endm + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORQ reg, imm, tmp +; packed-rotate-right-double +; does a rotate by doing two shifts and an or +%macro PRORQ 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psllq %%tmp, (64-(%%imm)) + psrlq %%reg, %%imm + por %%reg, %%tmp +%endmacro + +; PRORQ dst/src, amt +%macro PRORQ 2 + PRORQ %1, %2, TMP +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + movdqa a0, e ; sig1: a0 = e + movdqa a1, e ; sig1: s1 = e + PRORQ a0, (18-14) ; sig1: a0 = (e >> 4) + + movdqa a2, f ; ch: a2 = f + pxor a2, g ; ch: a2 = f^g + pand a2, e ; ch: a2 = (f^g)&e + pxor a2, g ; a2 = ch + + PRORQ a1, 41 ; sig1: a1 = (e >> 41) + movdqa [SZ2*(%%i&0xf) + rsp],%%T1 + paddq %%T1,[TBL + ROUND] ; T1 = W + K + pxor a0, e ; sig1: a0 = e ^ (e >> 5) + PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) + paddq h, a2 ; h = h + ch + movdqa a2, a ; sig0: a2 = a + PRORQ a2, (34-28) ; sig0: a2 = (a >> 6) + paddq h, %%T1 ; h = h + ch + W + K + pxor a0, a1 ; a0 = sigma1 + movdqa a1, a ; sig0: a1 = a + movdqa %%T1, a ; maj: T1 = a + PRORQ a1, 39 ; sig0: a1 = (a >> 39) + pxor %%T1, c ; maj: T1 = a^c + add ROUND, SZ2 ; ROUND++ + pand %%T1, b ; maj: T1 = (a^c)&b + paddq h, a0 + + paddq d, h + + pxor a2, a ; sig0: a2 = a ^ (a >> 11) + PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) + pxor a2, a1 ; a2 = sig0 + movdqa a1, a ; maj: a1 = a + pand a1, c ; maj: a1 = a&c + por a1, %%T1 ; a1 = maj + paddq h, a1 ; h = h + ch + W + K + maj + paddq h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + movdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp] + movdqa a1, [SZ2*((%%i-2)&0xf) + rsp] + movdqa a0, %%T1 + PRORQ %%T1, 8-1 + movdqa a2, a1 + PRORQ a1, 61-19 + pxor %%T1, a0 + PRORQ %%T1, 1 + pxor a1, a2 + PRORQ a1, 19 + psrlq a0, 7 + pxor %%T1, a0 + psrlq a2, 6 + pxor a1, a2 + paddq %%T1, [SZ2*((%%i-16)&0xf) + rsp] + paddq a1, [SZ2*((%%i-7)&0xf) + rsp] + paddq %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + +;; void sha512_x2_sse(SHA512_MB_ARGS_X4 *args, uint64_t num_blocks); +;; arg 1 : STATE : pointer args (only 2 of the 4 lanes used) +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +;; +global sha512_mb_x2_sse:function internal +align 32 +sha512_mb_x2_sse: + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + sub rsp, STACK_size + + ;; Load the pre-transposed incoming digest. + movdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE] + movdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE] + movdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE] + movdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE] + movdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE] + movdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE] + movdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE] + movdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE] + + lea TBL,[K512_2_MB] + + ;; load the address of each of the 2 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + ;; save old digest + movdqa [rsp + _DIGEST + 0*SZ2], a + movdqa [rsp + _DIGEST + 1*SZ2], b + movdqa [rsp + _DIGEST + 2*SZ2], c + movdqa [rsp + _DIGEST + 3*SZ2], d + movdqa [rsp + _DIGEST + 4*SZ2], e + movdqa [rsp + _DIGEST + 5*SZ2], f + movdqa [rsp + _DIGEST + 6*SZ2], g + movdqa [rsp + _DIGEST + 7*SZ2], h + +%assign i 0 +%rep 8 + ;; load up the shuffler for little-endian to big-endian format + movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + MOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits + MOVPD TT2,[inp1+IDX+i*16] + TRANSPOSE TT0, TT2, TT1 + pshufb TT0, TMP + pshufb TT1, TMP + ROUND_00_15 TT0,(i*2+0) + ROUND_00_15 TT1,(i*2+1) +%assign i (i+1) +%endrep + add IDX, 8 * 16 ;; increment by a message block + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + paddq a, [rsp + _DIGEST + 0*SZ2] + paddq b, [rsp + _DIGEST + 1*SZ2] + paddq c, [rsp + _DIGEST + 2*SZ2] + paddq d, [rsp + _DIGEST + 3*SZ2] + paddq e, [rsp + _DIGEST + 4*SZ2] + paddq f, [rsp + _DIGEST + 5*SZ2] + paddq g, [rsp + _DIGEST + 6*SZ2] + paddq h, [rsp + _DIGEST + 7*SZ2] + + sub INP_SIZE, 1 ;; unit is blocks + jne lloop + + ; write back to memory (state object) the transposed digest + movdqa [STATE + 0*SHA512_DIGEST_ROW_SIZE],a + movdqa [STATE + 1*SHA512_DIGEST_ROW_SIZE],b + movdqa [STATE + 2*SHA512_DIGEST_ROW_SIZE],c + movdqa [STATE + 3*SHA512_DIGEST_ROW_SIZE],d + movdqa [STATE + 4*SHA512_DIGEST_ROW_SIZE],e + movdqa [STATE + 5*SHA512_DIGEST_ROW_SIZE],f + movdqa [STATE + 6*SHA512_DIGEST_ROW_SIZE],g + movdqa [STATE + 7*SHA512_DIGEST_ROW_SIZE],h + + ; update input pointers + add inp0, IDX + mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, STACK_size + ret + +section .data +align 64 +global K512_2_MB:data internal +K512_2_MB: + dq 0x428a2f98d728ae22, 0x428a2f98d728ae22 + dq 0x7137449123ef65cd, 0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f + dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc + dq 0x3956c25bf348b538, 0x3956c25bf348b538 + dq 0x59f111f1b605d019, 0x59f111f1b605d019 + dq 0x923f82a4af194f9b, 0x923f82a4af194f9b + dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242, 0xd807aa98a3030242 + dq 0x12835b0145706fbe, 0x12835b0145706fbe + dq 0x243185be4ee4b28c, 0x243185be4ee4b28c + dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f, 0x72be5d74f27b896f + dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235, 0x9bdc06a725c71235 + dq 0xc19bf174cf692694, 0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 + dq 0xefbe4786384f25e3, 0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 + dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275, 0x2de92c6f592b0275 + dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 + dq 0x76f988da831153b5, 0x76f988da831153b5 + dq 0x983e5152ee66dfab, 0x983e5152ee66dfab + dq 0xa831c66d2db43210, 0xa831c66d2db43210 + dq 0xb00327c898fb213f, 0xb00327c898fb213f + dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 + dq 0xd5a79147930aa725, 0xd5a79147930aa725 + dq 0x06ca6351e003826f, 0x06ca6351e003826f + dq 0x142929670a0e6e70, 0x142929670a0e6e70 + dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc + dq 0x2e1b21385c26c926, 0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed + dq 0x53380d139d95b3df, 0x53380d139d95b3df + dq 0x650a73548baf63de, 0x650a73548baf63de + dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 + dq 0x92722c851482353b, 0x92722c851482353b + dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 + dq 0xa81a664bbc423001, 0xa81a664bbc423001 + dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 + dq 0xc76c51a30654be30, 0xc76c51a30654be30 + dq 0xd192e819d6ef5218, 0xd192e819d6ef5218 + dq 0xd69906245565a910, 0xd69906245565a910 + dq 0xf40e35855771202a, 0xf40e35855771202a + dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 + dq 0x1e376c085141ab53, 0x1e376c085141ab53 + dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 + dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 + dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 + dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc + dq 0x78a5636f43172f60, 0x78a5636f43172f60 + dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 + dq 0x8cc702081a6439ec, 0x8cc702081a6439ec + dq 0x90befffa23631e28, 0x90befffa23631e28 + dq 0xa4506cebde82bde9, 0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 + dq 0xc67178f2e372532b, 0xc67178f2e372532b + dq 0xca273eceea26619c, 0xca273eceea26619c + dq 0xd186b8c721c0c207, 0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e + dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba, 0x06f067aa72176fba + dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae, 0x113f9804bef90dae + dq 0x1b710b35131c471b, 0x1b710b35131c471b + dq 0x28db77f523047d84, 0x28db77f523047d84 + dq 0x32caab7b40c72493, 0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc + dq 0x431d67c49c100d4c, 0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 + dq 0x597f299cfc657e2a, 0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec + dq 0x6c44198c4a475817, 0x6c44198c4a475817 + +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm new file mode 100644 index 000000000..6931bedc1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm @@ -0,0 +1,483 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" +default rel + +;; code to compute quad SHA512 using AVX2 +;; use YMMs to tackle the larger digest size +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +;; Stack must be aligned to 32 bytes before call +;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12 +;; Windows preserves: rcx rsi rdi rbp r13 r14 r15 +;; +;; Linux clobbers: rax rbx rcx rsi r8 r9 r10 r11 r12 +;; Linux preserves: rcx rdx rdi rbp r13 r14 r15 +;; +;; clobbers ymm0-15 + +%define SHA512_DIGEST_WORD_SIZE 8 +%define NUM_SHA512_DIGEST_WORDS 8 +%define SHA512_DIGEST_ROW_SIZE 8*4 +%define PTR_SZ 8 +%define _data_ptr_sha512 _data_ptr + +%ifidn __OUTPUT_FORMAT__, elf64 +; LINUX register definitions +%define arg1 rdi +%define arg2 rsi +%else +; Windows register definitions +%define arg1 rcx +%define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND rbx +%define TBL r8 + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 + +%define a ymm0 +%define b ymm1 +%define c ymm2 +%define d ymm3 +%define e ymm4 +%define f ymm5 +%define g ymm6 +%define h ymm7 + +%define a0 ymm8 +%define a1 ymm9 +%define a2 ymm10 + +%define TT0 ymm14 +%define TT1 ymm13 +%define TT2 ymm12 +%define TT3 ymm11 +%define TT4 ymm10 +%define TT5 ymm9 + +%define T1 ymm14 +%define TMP ymm15 + +%define SZ4 4*SHA512_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 80*SZ4 + +; Define stack usage + +;; Assume stack aligned to 32 bytes before call +;; Therefore FRAMESZ mod 32 must be 32-8 = 24 +struc stack_frame + .data resb 16*SZ4 + .digest resb NUM_SHA512_DIGEST_WORDS*SZ4 + .align resb 24 +endstruc + +%define _DIGEST stack_frame.digest + +%define VMOVPD vmovupd + +; operates on YMMs +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d1 d0 c1 c0 b1 b0 a1 a0} +; r1 = {d3 d2 c3 c2 b3 b2 a3 a2} +; r0 = {d5 d4 c5 c4 b5 b4 a5 a4} +; r3 = {d7 d6 c7 c6 b7 b6 a7 a6} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + ; vshufps does not cross the mid-way boundary and hence is cheaper + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + + vperm2f128 %%r1, %%r0, %%r2, 0x20; r1 = {d3 d2 c3 c2 b3 b2 a3 a2} + + vperm2f128 %%r3, %%r0, %%r2, 0x31; r3 = {d7 d6 c7 c6 b7 b6 a7 a6} + + vperm2f128 %%r0, %%t0, %%t1, 0x31; r0 = {d5 d4 c5 c4 b5 b4 a5 a4} + + ; now ok to clobber t0 + vperm2f128 %%t0, %%t0, %%t1, 0x20; t0 = {d1 d0 c1 c0 b1 b0 a1 a0} + +%endmacro + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORQ reg, imm, tmp +; packed-rotate-right-double +; does a rotate by doing two shifts and an or +%macro PRORQ 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsllq %%tmp, %%reg, (64-(%%imm)) + vpsrlq %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORQ_nd reg, imm, tmp, src +%macro PRORQ_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsllq %%tmp, %%src, (64-(%%imm)) + vpsrlq %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORQ dst/src, amt +%macro PRORQ 2 + PRORQ %1, %2, TMP +%endmacro + +; PRORQ_nd dst, src, amt +%macro PRORQ_nd 3 + PRORQ_nd %1, %3, TMP, %2 +%endmacro + + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41) + vmovdqa [SZ4*(%%i&0xf) + rsp],%%T1 + vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) + vpaddq h, h, a2 ; h = h + ch + PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6) + vpaddq h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + vmovdqa %%T1, a ; maj: T1 = a + PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39) + vpxor %%T1, %%T1, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddq h, h, a0 + + vpaddq d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddq h, h, a1 ; h = h + ch + W + K + maj + vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0 + ROTATE_ARGS + +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp] + vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp] + vmovdqa a0, %%T1 + PRORQ %%T1, 8-1 + vmovdqa a2, a1 + PRORQ a1, 61-19 + vpxor %%T1, %%T1, a0 + PRORQ %%T1, 1 + vpxor a1, a1, a2 + PRORQ a1, 19 + vpsrlq a0, a0, 7 + vpxor %%T1, %%T1, a0 + vpsrlq a2, a2, 6 + vpxor a1, a1, a2 + vpaddq %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp] + vpaddq a1, a1, [SZ4*((%%i-7)&0xf) + rsp] + vpaddq %%T1, %%T1, a1 + + ROUND_00_15 %%T1, %%i + +%endm + + +;; void sha512_mb_x4_avx2(SHA512_MB_ARGS_X4 *STATE, const int INP_SIZE) +;; arg 1 : STATE : pointer to input data +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +global sha512_mb_x4_avx2:function internal +align 32 +sha512_mb_x4_avx2: + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + + sub rsp, stack_frame_size + + ;; Load the pre-transposed incoming digest. + vmovdqu a, [STATE+ 0*SHA512_DIGEST_ROW_SIZE] + vmovdqu b, [STATE+ 1*SHA512_DIGEST_ROW_SIZE] + vmovdqu c, [STATE+ 2*SHA512_DIGEST_ROW_SIZE] + vmovdqu d, [STATE+ 3*SHA512_DIGEST_ROW_SIZE] + vmovdqu e, [STATE+ 4*SHA512_DIGEST_ROW_SIZE] + vmovdqu f, [STATE+ 5*SHA512_DIGEST_ROW_SIZE] + vmovdqu g, [STATE+ 6*SHA512_DIGEST_ROW_SIZE] + vmovdqu h, [STATE+ 7*SHA512_DIGEST_ROW_SIZE] + + + lea TBL,[K512_4_MB] + + ;; load the address of each of the MAX_LANES (4) message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha512 + 0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha512 + 1*PTR_SZ] + mov inp2,[STATE + _data_ptr_sha512 + 2*PTR_SZ] + mov inp3,[STATE + _data_ptr_sha512 + 3*PTR_SZ] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + + ;; save old digest + vmovdqa [rsp + _DIGEST + 0*SZ4], a + vmovdqa [rsp + _DIGEST + 1*SZ4], b + vmovdqa [rsp + _DIGEST + 2*SZ4], c + vmovdqa [rsp + _DIGEST + 3*SZ4], d + vmovdqa [rsp + _DIGEST + 4*SZ4], e + vmovdqa [rsp + _DIGEST + 5*SZ4], f + vmovdqa [rsp + _DIGEST + 6*SZ4], g + vmovdqa [rsp + _DIGEST + 7*SZ4], h + +%assign i 0 +%rep 4 + ;; load up the shuffler for little-endian to big-endian format + vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] + VMOVPD TT2,[inp0+IDX+i*32] + VMOVPD TT1,[inp1+IDX+i*32] + VMOVPD TT4,[inp2+IDX+i*32] + VMOVPD TT3,[inp3+IDX+i*32] + TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 + vpshufb TT0, TT0, TMP + vpshufb TT1, TT1, TMP + vpshufb TT2, TT2, TMP + vpshufb TT3, TT3, TMP + ROUND_00_15 TT0,(i*4+0) + ROUND_00_15 TT1,(i*4+1) + ROUND_00_15 TT2,(i*4+2) + ROUND_00_15 TT3,(i*4+3) +%assign i (i+1) +%endrep +;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes) + add IDX, 4 * 32 + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + vpaddq a, a, [rsp + _DIGEST + 0*SZ4] + vpaddq b, b, [rsp + _DIGEST + 1*SZ4] + vpaddq c, c, [rsp + _DIGEST + 2*SZ4] + vpaddq d, d, [rsp + _DIGEST + 3*SZ4] + vpaddq e, e, [rsp + _DIGEST + 4*SZ4] + vpaddq f, f, [rsp + _DIGEST + 5*SZ4] + vpaddq g, g, [rsp + _DIGEST + 6*SZ4] + vpaddq h, h, [rsp + _DIGEST + 7*SZ4] + + sub INP_SIZE, 1 ;; consumed one message block + jne lloop + + ; write back to memory (state object) the transposed digest + vmovdqu [STATE+ 0*SHA512_DIGEST_ROW_SIZE ],a + vmovdqu [STATE+ 1*SHA512_DIGEST_ROW_SIZE ],b + vmovdqu [STATE+ 2*SHA512_DIGEST_ROW_SIZE ],c + vmovdqu [STATE+ 3*SHA512_DIGEST_ROW_SIZE ],d + vmovdqu [STATE+ 4*SHA512_DIGEST_ROW_SIZE ],e + vmovdqu [STATE+ 5*SHA512_DIGEST_ROW_SIZE ],f + vmovdqu [STATE+ 6*SHA512_DIGEST_ROW_SIZE ],g + vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h + + ;; update input data pointers + add inp0, IDX + mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 + add inp2, IDX + mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2 + add inp3, IDX + mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, stack_frame_size + + ; outer calling routine restores XMM and other GP registers + ret + +section .data +align 64 +K512_4_MB: + dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22 + dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f + dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc + dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538 + dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019 + dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b + dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242 + dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe + dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c + dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f + dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235 + dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 + dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 + dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275 + dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 + dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5 + dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab + dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210 + dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f + dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 + dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725 + dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f + dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70 + dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc + dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed + dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df + dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de + dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 + dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b + dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 + dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001 + dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 + dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30 + dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218 + dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910 + dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a + dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 + dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53 + dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 + dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 + dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 + dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc + dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60 + dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 + dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec + dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28 + dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 + dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b + dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c + dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e + dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba + dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae + dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b + dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84 + dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc + dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 + dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec + dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817 + +align 32 +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f + dq 0x1011121314151617, 0x18191a1b1c1d1e1f + + diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm new file mode 100644 index 000000000..cc8d85122 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm @@ -0,0 +1,639 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "sha512_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 +default rel +;; code to compute quad SHA512 using AVX512 +;; use ZMMs to tackle the larger digest size +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31 +;; Stack must be aligned to 32 bytes before call +;; Windows clobbers: rax rbx rdx rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rsi +;; +;; Linux clobbers: rax rbx rcx rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdx rdi +;; +;; clobbers zmm0-31 + +%define APPEND(a,b) a %+ b + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg1 rcx ; arg0 preserved + %define arg2 rdx ; arg1 + %define reg3 r8 ; arg2 preserved + %define reg4 r9 ; arg3 + %define var1 rdi ; usable + %define var2 rsi + %define local_func_decl(func_name) global func_name + %else + %define arg1 rdi ; arg0 + %define arg2 rsi ; arg1 + %define var2 rdx ; arg2 + %define var1 rcx ; arg3 usable + %define local_func_decl(func_name) global func_name:function internal +%endif + +%define state arg1 +%define num_blks arg2 + +%define IN (state + _data_ptr) +%define DIGEST state +%define SIZE num_blks + +%define IDX var1 +%define TBL r8 + +%define VMOVDQ32 vmovdqu32 + +%define SHA512_DIGEST_WORD_SIZE 8 +%define NUM_SHA512_DIGEST_WORDS 8 +%define SHA512_DIGEST_ROW_SIZE 8*8 +%define PTR_SZ 8 +%define _data_ptr_sha512 _data_ptr + +%define NUM_LANES 8 +%define SZ 8 +%define SZ8 8 * SZ +%define DIGEST_SZ 8 * SZ8 +%define DIGEST_SAVE NUM_LANES * DIGEST_SZ +%define RSP_SAVE 1*8 + +; Define Stack Layout +START_FIELDS +;;; name size align +FIELD _DIGEST_SAVE, NUM_LANES*8*64, 64 +FIELD _RSP, 8, 8 +%assign STACK_SPACE _FIELD_OFFSET + + +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 +%define inp7 rax + +%define A zmm0 +%define B zmm1 +%define C zmm2 +%define D zmm3 +%define E zmm4 +%define F zmm5 +%define G zmm6 +%define H zmm7 +%define T1 zmm8 +%define TMP0 zmm9 +%define TMP1 zmm10 +%define TMP2 zmm11 +%define TMP3 zmm12 +%define TMP4 zmm13 +%define TMP5 zmm14 +%define TMP6 zmm15 + + +%define W0 zmm16 +%define W1 zmm17 +%define W2 zmm18 +%define W3 zmm19 +%define W4 zmm20 +%define W5 zmm21 +%define W6 zmm22 +%define W7 zmm23 +%define W8 zmm24 +%define W9 zmm25 +%define W10 zmm26 +%define W11 zmm27 +%define W12 zmm28 +%define W13 zmm29 +%define W14 zmm30 +%define W15 zmm31 + +; from sha256_fips180-2.pdf +; define rotates for Sigma function for main loop steps +%define BIG_SIGMA_0_0 28 ; Sigma0 +%define BIG_SIGMA_0_1 34 +%define BIG_SIGMA_0_2 39 +%define BIG_SIGMA_1_0 14 ; Sigma1 +%define BIG_SIGMA_1_1 18 +%define BIG_SIGMA_1_2 41 + +; define rotates for Sigma function for scheduling steps + +%define SMALL_SIGMA_0_0 1 ; sigma0 +%define SMALL_SIGMA_0_1 8 +%define SMALL_SIGMA_0_2 7 +%define SMALL_SIGMA_1_0 19 ; sigma1 +%define SMALL_SIGMA_1_1 61 +%define SMALL_SIGMA_1_2 6 + +%define SHA_MAX_ROUNDS 80 +%define SHA_ROUNDS_LESS_16 (SHA_MAX_ROUNDS - 16) + +%macro TRANSPOSE8 12 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%t0 %9 +%define %%t1 %10 +%define %%PERM_INDEX1 %11 +%define %%PERM_INDEX2 %12 + + +; each x(i) is 32 bits, 16 * 32 = 512 ==> a full digest length, 32 single precision quantities +; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {h7 h6 h5 h4 h3 h2 h1 h0} + + ;; ;;; will not get clobbered + vmovdqa32 %%PERM_INDEX1, [TRANSPOSE8_PERM_INDEX_1] ; temp + vmovdqa32 %%PERM_INDEX2, [TRANSPOSE8_PERM_INDEX_2] ; temp + + ; process top half (r0..r3) {a...d} + vshufpd %%t0, %%r0, %%r1, 0x00 ; t0 = {b6 a6 b4 a4 b2 a2 b0 a0} + vshufpd %%r0, %%r0, %%r1, 0xFF ; r0 = {b7 a7 b5 a5 b3 a3 b1 a1} + vshufpd %%t1, %%r2, %%r3, 0x00 ; t1 = {d6 c6 d4 c4 d2 c2 d0 c0} + vshufpd %%r2, %%r2, %%r3, 0xFF ; r2 = {d7 c7 d5 c5 d3 c3 d1 c1} + + vmovdqa32 %%r1, %%t0 ; r1 and r3 free + vpermt2q %%r1, %%PERM_INDEX1,%%t1 ; r1 = {d4 c4 b4 a4 d0 c0 b0 a0} + vpermt2q %%t0, %%PERM_INDEX2,%%t1 ; t0 = {d6 c6 b6 a6 d2 c2 b2 a2} + + vmovdqa32 %%t1, %%r0 ; t1 and r3 free + vpermt2q %%t1, %%PERM_INDEX1,%%r2 ; t1 = {d5 c5 b5 a5 d1 c1 b1 a1} + vpermt2q %%r0, %%PERM_INDEX2,%%r2 ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3} + + ;; Likewise for top half ; r2 and r3 free + vshufpd %%r2, %%r4, %%r5, 0x00 ; r2 = {f6 e6 f4 e4 f2 e2 f0 e0} + vshufpd %%r4, %%r4, %%r5, 0xFF ; r4 = {f7 e7 f5 e5 f3 e3 f1 e1} + vshufpd %%r3, %%r6, %%r7, 0x00 ; r3 = {h6 g6 h4 g4 h2 g2 h0 g0} + vshufpd %%r6, %%r6, %%r7, 0xFF ; r6 = {h7 g7 h5 g5 h3 g3 h1 g1} + + vmovdqa32 %%r5, %%r2 ; r5 and r7 free + vpermt2q %%r5, %%PERM_INDEX1,%%r3 ; r5 = {h4 g4 f4 e4 h0 g0 f0 e0} + vpermt2q %%r2, %%PERM_INDEX2,%%r3 ; r2 = {h6 g6 f6 e6 h2 g2 f2 e2} + + vmovdqa32 %%r7, %%r4 + vpermt2q %%r7, %%PERM_INDEX1,%%r6 ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1} + vpermt2q %%r4, %%PERM_INDEX2,%%r6 ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3} + +;;; free r3, r6 + vshuff64x2 %%r6, %%t0, %%r2, 0xEE ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} + vshuff64x2 %%r2, %%t0, %%r2, 0x44 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} + +;;; t0 and r3 free + vshuff64x2 %%r3, %%r0, %%r4, 0x44 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} + vshuff64x2 %%t0, %%r0, %%r4, 0xEE ; t0 = {h7 g7 f7 e7 d7 c7 b7 a7} + + vshuff64x2 %%r4, %%r1, %%r5, 0xEE ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} + vshuff64x2 %%r0, %%r1, %%r5, 0x44 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} + + + vshuff64x2 %%r5, %%t1, %%r7, 0xEE ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} + vshuff64x2 %%r1, %%t1, %%r7, 0x44 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} + + ;; will re-order input to avoid move + ;vmovdqa32 %%r7, %%t0 + + ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} + ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} + ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} + ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} + ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} + ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} + ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} + ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} + ; temp + ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} +%endmacro + +%macro ROTATE_ARGS 0 +%xdefine TMP_ H +%xdefine H G +%xdefine G F +%xdefine F E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + + + +;; CH(A, B, C) = (A&B) ^ (~A&C) +;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G) +;; SIGMA0 = ROR_28 ^ ROR_34 ^ ROR_39 +;; SIGMA1 = ROR_14 ^ ROR_18 ^ ROR_41 +;; sigma0 = ROR_1 ^ ROR_8 ^ SHR_7 +;; sigma1 = ROR_19 ^ ROR_61 ^ SHR_6 + +;; Main processing loop per round +;; equivalent to %macro ROUND_00_15 2 +%macro PROCESS_LOOP 2 +%define %%WT %1 +%define %%ROUND %2 + ;; T1 = H + BIG_SIGMA_1(E) + CH(E, F, G) + Kt + Wt + ;; T2 = BIG_SIGMA_0(A) + MAJ(A, B, C) + ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2 + + ;; H becomes T2, then add T1 for A + ;; D becomes D + T1 for E + + vpaddq T1, H, TMP3 ; T1 = H + Kt + vmovdqa32 TMP0, E + ;; compute BIG_SIGMA_1(E) + vprorq TMP1, E, BIG_SIGMA_1_0 ; ROR_14(E) + vprorq TMP2, E, BIG_SIGMA_1_1 ; ROR_18(E) + vprorq TMP3, E, BIG_SIGMA_1_2 ; ROR_41(E) + vpternlogq TMP1, TMP2, TMP3, 0x96 ; TMP1 = BIG_SIGMA_1(E) + vpternlogq TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G) + vpaddq T1, T1, %%WT ; T1 = T1 + Wt + vpaddq T1, T1, TMP0 ; T1 = T1 + CH(E,F,G) + vpaddq T1, T1, TMP1 ; T1 = T1 + BIG_SIGMA_1(E) + vpaddq D, D, T1 ; D = D + T1 + vprorq H, A, BIG_SIGMA_0_0 ;ROR_28(A) + vprorq TMP2, A, BIG_SIGMA_0_1 ;ROR_34(A) + vprorq TMP3, A, BIG_SIGMA_0_2 ;ROR_39(A) + vmovdqa32 TMP0, A + vpternlogq TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C) + vpternlogq H, TMP2, TMP3, 0x96 ; H(T2) = BIG_SIGMA_0(A) + vpaddq H, H, TMP0 ; H(T2) = BIG_SIGMA_0(A) + MAJ(A,B,C) + vpaddq H, H, T1 ; H(A) = H(T2) + T1 + vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt + + ;; Rotate the args A-H (rotation of names associated with regs) + ROTATE_ARGS +%endmacro + +%macro MSG_SCHED_ROUND_16_79 4 +%define %%WT %1 +%define %%WTp1 %2 +%define %%WTp9 %3 +%define %%WTp14 %4 + vprorq TMP4, %%WTp14, SMALL_SIGMA_1_0 ; ROR_19(Wt-2) + vprorq TMP5, %%WTp14, SMALL_SIGMA_1_1 ; ROR_61(Wt-2) + vpsrlq TMP6, %%WTp14, SMALL_SIGMA_1_2 ; SHR_6(Wt-2) + vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_1(Wt-2) + + vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2) + vpaddq %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma_1(Wt-2) + Wt-7 + + vprorq TMP4, %%WTp1, SMALL_SIGMA_0_0 ; ROR_1(Wt-15) + vprorq TMP5, %%WTp1, SMALL_SIGMA_0_1 ; ROR_8(Wt-15) + vpsrlq TMP6, %%WTp1, SMALL_SIGMA_0_2 ; SHR_7(Wt-15) + vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_0(Wt-15) + + vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2) + + ; Wt-7 + sigma_0(Wt-15) + + +%endmacro + +align 64 + +; void sha512_mb_x8_avx512(SHA512_MB_ARGS_X8, uint32_t size) +; arg 1 : pointer to input data +; arg 2 : size (in blocks) ;; assumed to be >= 1 +local_func_decl(sha512_mb_x8_avx512) +sha512_mb_x8_avx512: + mov rax, rsp + sub rsp, STACK_SPACE + and rsp, ~63 ; align stack to multiple of 64 + mov [rsp + _RSP], rax + lea TBL,[TABLE] + + ;; Initialize digests + vmovups A, [DIGEST + 0*8*8] + vmovups B, [DIGEST + 1*8*8] + vmovups C, [DIGEST + 2*8*8] + vmovups D, [DIGEST + 3*8*8] + vmovups E, [DIGEST + 4*8*8] + vmovups F, [DIGEST + 5*8*8] + vmovups G, [DIGEST + 6*8*8] + vmovups H, [DIGEST + 7*8*8] + + xor IDX, IDX + ;; Read in input data address, saving them in registers because + ;; they will serve as variables, which we shall keep incrementing + mov inp0, [IN + 0*8] + mov inp1, [IN + 1*8] + mov inp2, [IN + 2*8] + mov inp3, [IN + 3*8] + mov inp4, [IN + 4*8] + mov inp5, [IN + 5*8] + mov inp6, [IN + 6*8] + mov inp7, [IN + 7*8] + +lloop: + + ;; first half of 1024 (need to transpose before use) + vmovups W0,[inp0 + IDX ] + vmovups W1,[inp1 + IDX ] + vmovups W2,[inp2 + IDX ] + vmovups W3,[inp3 + IDX ] + vmovups W4,[inp4 + IDX ] + vmovups W5,[inp5 + IDX ] + vmovups W6,[inp6 + IDX ] + vmovups TMP0,[inp7 + IDX ] + TRANSPOSE8 W0, W1, W2, W3, W4, W5, W6, TMP0, W7, TMP1, TMP2, TMP3 + ;; second half of 1024 (need to transpose before use) + vmovups W8,[inp0 + SZ8 + IDX ] + vmovups W9,[inp1 + SZ8 + IDX ] + vmovups W10,[inp2 + SZ8 + IDX ] + vmovups W11,[inp3 + SZ8 + IDX ] + vmovups W12,[inp4 + SZ8 + IDX ] + vmovups W13,[inp5 + SZ8 + IDX ] + vmovups W14,[inp6 + SZ8 + IDX ] + vmovups TMP0,[inp7 + SZ8 + IDX ] + TRANSPOSE8 W8, W9, W10, W11, W12, W13, W14, TMP0, W15, TMP1, TMP2, TMP3 + + vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK] + + vmovdqa32 TMP3, [TBL] ; First K + + ; Save digests for later addition + vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A + vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B + vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C + vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D + vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E + vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F + vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G + vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H + + add IDX, 128 ; increment by message block length in bytes + + + + +%assign I 0 +%rep 16 +;;; little endian to big endian + vpshufb APPEND(W,I), APPEND(W,I), TMP2 +%assign I (I+1) +%endrep + ; Save digests for later addition + vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A + vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B + vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C + vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D + vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E + vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F + vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G + vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H + + ; MSG Schedule for W0-W15 is now complete in registers + ; Process first (max-rounds -16) + ; Calculate next Wt+16 after processing is complete and Wt is unneeded + + ; PROCESS_LOOP_00_79 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M) + +%assign I 0 +%assign J 0 +%assign K 1 +%assign L 9 +%assign M 14 +%rep SHA_ROUNDS_LESS_16 + PROCESS_LOOP APPEND(W,J), I + MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) +%assign I (I+1) +%assign J ((J+1)% 16) +%assign K ((K+1)% 16) +%assign L ((L+1)% 16) +%assign M ((M+1)% 16) +%endrep + ; Check is this is the last block + sub SIZE, 1 + je lastLoop + + ; Process last 16 rounds + ; Read in next block msg data for use in first 16 words of msg sched +%assign I SHA_ROUNDS_LESS_16 +%assign J 0 +%rep 16 + PROCESS_LOOP APPEND(W,J), I +%assign I (I+1) +%assign J (J+1) +%endrep + ; Add old digest + vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0] + vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1] + vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2] + vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3] + vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4] + vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5] + vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6] + vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7] + + jmp lloop + + +lastLoop: + ; Process last 16 rounds +%assign I SHA_ROUNDS_LESS_16 +%assign J 0 + +%rep 16 + PROCESS_LOOP APPEND(W,J), I +%assign I (I+1) +%assign J (J+1) +%endrep + + ; Add old digest + vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0] + vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1] + vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2] + vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3] + vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4] + vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5] + vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6] + vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7] + +;; update into data pointers +%assign I 0 +%rep 4 + mov inp0, [IN + (2*I)*8] + mov inp1, [IN + (2*I +1)*8] + add inp0, IDX + add inp1, IDX + mov [IN + (2*I)*8], inp0 + mov [IN + (2*I+1)*8], inp1 +%assign I (I+1) +%endrep + + VMOVDQ32 [DIGEST + 0*8*8], A + VMOVDQ32 [DIGEST + 1*8*8], B + VMOVDQ32 [DIGEST + 2*8*8], C + VMOVDQ32 [DIGEST + 3*8*8], D + VMOVDQ32 [DIGEST + 4*8*8], E + VMOVDQ32 [DIGEST + 5*8*8], F + VMOVDQ32 [DIGEST + 6*8*8], G + VMOVDQ32 [DIGEST + 7*8*8], H + + mov rsp, [rsp + _RSP] + ret + + section .data +align 64 +; 80 constants for SHA512 +; replicating for each lane, thus 8*80 +; to aid in SIMD .. space tradeoff for time! +; local to asm file, used nowhere else +TABLE: + dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22 + dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f + dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc + dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538 + dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019 + dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b + dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242 + dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe + dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c + dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f + dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235 + dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 + dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 + dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275 + dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 + dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5 + dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab + dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210 + dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f + dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 + dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725 + dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f + dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70 + dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc + dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed + dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df + dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de + dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 + dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b + dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 + dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001 + dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 + dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30 + dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218 + dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910 + dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a + dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 + dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53 + dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 + dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 + dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 + dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc + dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60 + dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 + dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec + dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28 + dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 + dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b + dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c + dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e + dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba + dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae + dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b + dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84 + dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc + dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 + dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec + dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817 + +align 64 +; this does the big endian to little endian conversion over a quad word .. ZMM +;; shuffle on ZMM is shuffle on 4 XMM size chunks, 128 bits +PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f + dq 0x1011121314151617, 0x18191a1b1c1d1e1f + dq 0x2021222324252627, 0x28292a2b2c2d2e2f + dq 0x3031323334353637, 0x38393a3b3c3d3e3f + +align 64 +TRANSPOSE8_PERM_INDEX_1: dq 0x0000000000000000 + dq 0x0000000000000001 + dq 0x0000000000000008 + dq 0x0000000000000009 + dq 0x0000000000000004 + dq 0x0000000000000005 + dq 0x000000000000000C + dq 0x000000000000000D + +TRANSPOSE8_PERM_INDEX_2: dq 0x0000000000000002 + dq 0x0000000000000003 + dq 0x000000000000000A + dq 0x000000000000000B + dq 0x0000000000000006 + dq 0x0000000000000007 + dq 0x000000000000000E + dq 0x000000000000000F + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_sha512_mb_x8_avx512 +no_sha512_mb_x8_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm new file mode 100644 index 000000000..e1186f8a0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm @@ -0,0 +1,254 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 +%define WRT_OPT wrt ..plt +%else +%define WRT_OPT +%endif + +%include "reg_sizes.asm" +%include "multibinary.asm" + +;;;;; +; mbin_dispatch_init_avoton parameters +; Use this function when SSE/00/01 is a minimum requirement +; if AVOTON is true, then use avoton_func instead of sse_func +; 1-> function name +; 2-> SSE/00/01 optimized function used as base +; 3-> AVX or AVX/02 opt func +; 4-> AVX2 or AVX/04 opt func +; 5-> AVOTON opt func +;;;;; +%macro mbin_dispatch_init_avoton 5 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + push mbin_rdi + lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01 + + mov eax, 1 + cpuid + lea mbin_rdi, [%5 WRT_OPT] + and eax, FLAG_CPUID1_EAX_STEP_MASK + cmp eax, FLAG_CPUID1_EAX_AVOTON + ; If Avoton, set Avoton symbol and exit + cmove mbin_rsi, mbin_rdi + je _%1_init_done + + and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func + jne _%1_init_done ; AVX is not available so end + mov mbin_rsi, mbin_rbx + + ;; Try for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func + cmovne mbin_rsi, mbin_rbx + + ;; Does it have xmm and ymm support + xor ecx, ecx + xgetbv + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + je _%1_init_done + lea mbin_rsi, [%2 WRT_OPT] + + _%1_init_done: + pop mbin_rdi + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +;;;;; +; mbin_dispatch_init6_avoton parameters +; if AVOTON is true, then use avoton_func instead of sse_func +; 1-> function name +; 2-> base function +; 3-> SSE4_1 or 00/01 optimized function +; 4-> AVX/02 opt func +; 5-> AVX2/04 opt func +; 6-> AVX512/06 opt func +; 7-> AVOTON opt func +;;;;; +%macro mbin_dispatch_init6_avoton 7 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + push mbin_rdi + lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function + + mov eax, 1 + cpuid + mov ebx, ecx ; save cpuid1.ecx + test ecx, FLAG_CPUID1_ECX_SSE4_1 + je _%1_init_done ; Use base function if no SSE4_1 + lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt + + lea mbin_rdi, [%7 WRT_OPT] + and eax, FLAG_CPUID1_EAX_STEP_MASK + cmp eax, FLAG_CPUID1_EAX_AVOTON + ; If Avoton, set Avoton symbol and exit + cmove mbin_rsi, mbin_rdi + je _%1_init_done + + + ;; Test for XMM_YMM support/AVX + test ecx, FLAG_CPUID1_ECX_OSXSAVE + je _%1_init_done + xor ecx, ecx + xgetbv ; xcr -> edx:eax + mov edi, eax ; save xgetvb.eax + + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + jne _%1_init_done + test ebx, FLAG_CPUID1_ECX_AVX + je _%1_init_done + lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt + + ;; Test for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + je _%1_init_done ; No AVX2 possible + lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func + + ;; Test for AVX512 + and edi, FLAG_XGETBV_EAX_ZMM_OPM + cmp edi, FLAG_XGETBV_EAX_ZMM_OPM + jne _%1_init_done ; No AVX512 possible + and ebx, FLAGS_CPUID7_ECX_AVX512_G1 + cmp ebx, FLAGS_CPUID7_ECX_AVX512_G1 + lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt + cmove mbin_rsi, mbin_rbx + + _%1_init_done: + pop mbin_rdi + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + +default rel +[bits 64] + +%define def_wrd dq +%define wrd_sz qword +%define arg1 rsi + +; declare the L3 ctx level symbols (these will then call the appropriate +; L2 symbols) +extern sha512_ctx_mgr_init_sse +extern sha512_ctx_mgr_submit_sse +extern sha512_ctx_mgr_flush_sse + +extern sha512_ctx_mgr_init_avx +extern sha512_ctx_mgr_submit_avx +extern sha512_ctx_mgr_flush_avx + +extern sha512_ctx_mgr_init_avx2 +extern sha512_ctx_mgr_submit_avx2 +extern sha512_ctx_mgr_flush_avx2 + +%ifdef HAVE_AS_KNOWS_AVX512 + extern sha512_ctx_mgr_init_avx512 + extern sha512_ctx_mgr_submit_avx512 + extern sha512_ctx_mgr_flush_avx512 +%endif + +extern sha512_ctx_mgr_init_sb_sse4 +extern sha512_ctx_mgr_submit_sb_sse4 +extern sha512_ctx_mgr_flush_sb_sse4 + +;;; *_mbinit are initial values for *_dispatched; is updated on first call. +;;; Therefore, *_dispatch_init is only executed on first call. + +; Initialise symbols +mbin_interface sha512_ctx_mgr_init +mbin_interface sha512_ctx_mgr_submit +mbin_interface sha512_ctx_mgr_flush + +%ifdef HAVE_AS_KNOWS_AVX512 + ; Reuse mbin_dispatch_init6 through replacing base by sse version + mbin_dispatch_init6_avoton sha512_ctx_mgr_init, sha512_ctx_mgr_init_sse, \ + sha512_ctx_mgr_init_sse, sha512_ctx_mgr_init_avx, \ + sha512_ctx_mgr_init_avx2, sha512_ctx_mgr_init_avx512, \ + sha512_ctx_mgr_init_sb_sse4 + + mbin_dispatch_init6_avoton sha512_ctx_mgr_submit, sha512_ctx_mgr_submit_sse, \ + sha512_ctx_mgr_submit_sse, sha512_ctx_mgr_submit_avx, \ + sha512_ctx_mgr_submit_avx2, sha512_ctx_mgr_submit_avx512, \ + sha512_ctx_mgr_submit_sb_sse4 + + mbin_dispatch_init6_avoton sha512_ctx_mgr_flush, sha512_ctx_mgr_flush_sse, \ + sha512_ctx_mgr_flush_sse, sha512_ctx_mgr_flush_avx, \ + sha512_ctx_mgr_flush_avx2, sha512_ctx_mgr_flush_avx512, \ + sha512_ctx_mgr_flush_sb_sse4 +%else + mbin_dispatch_init_avoton sha512_ctx_mgr_init, sha512_ctx_mgr_init_sse, \ + sha512_ctx_mgr_init_avx, sha512_ctx_mgr_init_avx2, \ + sha512_ctx_mgr_init_sb_sse4 + + mbin_dispatch_init_avoton sha512_ctx_mgr_submit, sha512_ctx_mgr_submit_sse, \ + sha512_ctx_mgr_submit_avx, sha512_ctx_mgr_submit_avx2, \ + sha512_ctx_mgr_submit_sb_sse4 + + mbin_dispatch_init_avoton sha512_ctx_mgr_flush, sha512_ctx_mgr_flush_sse, \ + sha512_ctx_mgr_flush_avx, sha512_ctx_mgr_flush_avx2, \ + sha512_ctx_mgr_flush_sb_sse4 +%endif + + +;;; func core, ver, snum +slversion sha512_ctx_mgr_init, 00, 03, 0175 +slversion sha512_ctx_mgr_submit, 00, 03, 0176 +slversion sha512_ctx_mgr_flush, 00, 03, 0177 diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c new file mode 100644 index 000000000..bb9a8f5e8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c @@ -0,0 +1,256 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include "sha512_mb.h" + +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// +// Reference SHA512 Functions +//////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + +#define H0 0x6a09e667f3bcc908 +#define H1 0xbb67ae8584caa73b +#define H2 0x3c6ef372fe94f82b +#define H3 0xa54ff53a5f1d36f1 +#define H4 0x510e527fade682d1 +#define H5 0x9b05688c2b3e6c1f +#define H6 0x1f83d9abfb41bd6b +#define H7 0x5be0cd19137e2179 + +void sha512_single(const uint8_t * data, uint64_t digest[5]); + +void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len) +{ + uint32_t i, j; + uint8_t buf[2 * SHA512_BLOCK_SIZE]; + + /* 128 bit lengths not needed as len is uint32_t, so use 64 bit length + * and pad the first 64 bits with zeros. */ + union { + uint64_t uint; + uint8_t uchar[8]; + } convert; + uint8_t *p; + + digest[0] = H0; + digest[1] = H1; + digest[2] = H2; + digest[3] = H3; + digest[4] = H4; + digest[5] = H5; + digest[6] = H6; + digest[7] = H7; + + i = len; + /* Hash the complete blocks */ + while (i >= SHA512_BLOCK_SIZE) { + sha512_single(input_data, digest); + input_data += SHA512_BLOCK_SIZE; + i -= SHA512_BLOCK_SIZE; + } + + /* Copy remainder to a buffer to be padded */ + memcpy(buf, input_data, i); + buf[i++] = 0x80; + + // Pad more than required here and overwrite with length + for (j = i; j < (2 * SHA512_BLOCK_SIZE); j++) + buf[j] = 0; + + if (i > SHA512_BLOCK_SIZE - SHA512_PADLENGTHFIELD_SIZE) + i = 2 * SHA512_BLOCK_SIZE; + else + i = SHA512_BLOCK_SIZE; + + convert.uint = 8 * len; + p = buf + i - 8; + p[0] = convert.uchar[7]; + p[1] = convert.uchar[6]; + p[2] = convert.uchar[5]; + p[3] = convert.uchar[4]; + p[4] = convert.uchar[3]; + p[5] = convert.uchar[2]; + p[6] = convert.uchar[1]; + p[7] = convert.uchar[0]; + + /* Hash the padded last block */ + sha512_single(buf, digest); + if (i == 256) + sha512_single(buf + 128, digest); +} + +/* From the FIPS, these are the same as for SHA256, but operating on 64 bit words + * instead of 32 bit. + */ +#define ch(e,f,g) ((e & f) ^ (g & ~e)) +#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c)) + +/* Sigma functions have same form as SHA256 but + * - change the word size to 64bit + * - change the amount to rotate + */ +#define ror64(x, r) (((x)>>(r)) ^ ((x)<<(64-(r)))) + +/* Technically, s0 should be S0 as these are "capital sigma" functions, and likewise the case + * of the S0 should be s0, but keep as-is to avoid confusion with the other reference functions. + */ +#define s0(a) (ror64(a,28) ^ ror64(a,34) ^ ror64(a,39)) +#define s1(e) (ror64(e,14) ^ ror64(e,18) ^ ror64(e,41)) + +#define S0(w) (ror64(w,1) ^ ror64(w,8) ^ (w >> 7)) +#define S1(w) (ror64(w,19) ^ ror64(w,61) ^ (w >> 6)) + +#define bswap(x) (((x) & (0xffull << 0)) << 56) \ + | (((x) & (0xffull << 8)) << 40) \ + | (((x) & (0xffull <<16)) << 24) \ + | (((x) & (0xffull <<24)) << 8) \ + | (((x) & (0xffull <<32)) >> 8) \ + | (((x) & (0xffull <<40)) >> 24) \ + | (((x) & (0xffull <<48)) >> 40) \ + | (((x) & (0xffull <<56)) >> 56) + +#define W(x) w[(x) & 15] + +#define step(i,a,b,c,d,e,f,g,h,k) \ + if (i<16) W(i) = bswap(ww[i]); \ + else \ + W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \ + t2 = s0(a) + maj(a,b,c); \ + t1 = h + s1(e) + ch(e,f,g) + k + W(i); \ + d += t1; \ + h = t1 + t2; + +void sha512_single(const uint8_t * data, uint64_t digest[5]) +{ + /* Check these are all uint64_t */ + uint64_t a, b, c, d, e, f, g, h, t1, t2; + uint64_t w[16]; + uint64_t *ww = (uint64_t *) data; + + a = digest[0]; + b = digest[1]; + c = digest[2]; + d = digest[3]; + e = digest[4]; + f = digest[5]; + g = digest[6]; + h = digest[7]; + + step(0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22); + step(1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd); + step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f); + step(3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc); + step(4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538); + step(5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019); + step(6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b); + step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118); + step(8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242); + step(9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe); + step(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c); + step(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2); + step(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f); + step(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1); + step(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235); + step(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694); + step(16, a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2); + step(17, h, a, b, c, d, e, f, g, 0xefbe4786384f25e3); + step(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5); + step(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65); + step(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275); + step(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483); + step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4); + step(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5); + step(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab); + step(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210); + step(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f); + step(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4); + step(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2); + step(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725); + step(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f); + step(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70); + step(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc); + step(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926); + step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed); + step(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df); + step(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de); + step(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8); + step(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6); + step(39, b, c, d, e, f, g, h, a, 0x92722c851482353b); + step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364); + step(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001); + step(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791); + step(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30); + step(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218); + step(45, d, e, f, g, h, a, b, c, 0xd69906245565a910); + step(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a); + step(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8); + step(48, a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8); + step(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53); + step(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99); + step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8); + step(52, e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63); + step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb); + step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373); + step(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3); + step(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc); + step(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60); + step(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72); + step(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec); + step(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28); + step(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9); + step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915); + step(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b); // step 63 + step(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c); + step(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207); + step(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e); + step(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178); + step(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba); + step(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6); + step(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae); + step(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b); + step(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84); + step(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493); + step(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc); + step(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c); + step(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6); + step(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a); + step(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec); + step(79, b, c, d, e, f, g, h, a, 0x6c44198c4a475817); // step 79 + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + digest[7] += h; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c new file mode 100644 index 000000000..6eeed19fd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c @@ -0,0 +1,46 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "sha512_mb.h" + +/* + * Function: sha512_sb_mgr_flush_sse4. + * + * Description: This is a dummy API. Nothing done here. + * + * Return: always NULL. + * + * */ +SHA512_JOB *sha512_sb_mgr_flush_sse4(SHA512_MB_JOB_MGR * state) +{ + return NULL; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c new file mode 100644 index 000000000..93ce88dfd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c @@ -0,0 +1,38 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "sha512_mb.h" + +// For single buffer APIs, nothing to be done here. +// This function is required, to comply with the usage of +// multi-buffer APIs. +void sha512_sb_mgr_init_sse4(SHA512_MB_JOB_MGR * state) +{ + return; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c new file mode 100644 index 000000000..659d14339 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c @@ -0,0 +1,65 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include "sha512_mb.h" + +/* + * Function: sha512_sb_mgr_submit_sse4 + * + * Description: Wrapper API for update routine of single buffer sha512, + * to comply with multi-buffer API. + * + * This function will pick up message/digest and length + * information from the argument "job", then call into + * sha512_sse4(). Argument "state" is passed in, but not + * really used here. + * + * Note: message init and padding is done outside. This function + * expects a packed buffer. + * + * Argument: state - not really used. + * job - contained message, digest, message length information, etc. + * + * Return: SHA512_JOB pointer. + * + **/ +SHA512_JOB *sha512_sb_mgr_submit_sse4(SHA512_MB_JOB_MGR * state, SHA512_JOB * job) +{ + assert(job != NULL); + + uint8_t *buff = job->buffer; + uint64_t *digest = job->result_digest, len = job->len; + + sha512_sse4((const void *)buff, (void *)digest, len); + + return job; +} diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm new file mode 100644 index 000000000..57598a0e2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm @@ -0,0 +1,394 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" +default rel +BITS 64 +section .text + +; Virtual Registers +%ifidn __OUTPUT_FORMAT__, win64 + %define msg rcx ; ARG1 + %define digest rdx ; ARG2 + %define msglen r8 ; ARG3 + %define T1 rsi + %define T2 rdi +%else + %define msg rdi ; ARG1 + %define digest rsi ; ARG2 + %define msglen rdx ; ARG3 + %define T1 rcx + %define T2 r8 +%endif +%define a_64 r9 +%define b_64 r10 +%define c_64 r11 +%define d_64 r12 +%define e_64 r13 +%define f_64 r14 +%define g_64 r15 +%define h_64 rbx +%define tmp0 rax + +; Local variables (stack frame) +; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP +struc frame + .W: resq 80 ; Message Schedule + .WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1] + +%ifidn __OUTPUT_FORMAT__, win64 + .GPRSAVE: resq 7 +%else + .GPRSAVE: resq 5 +%endif +endstruc + +; Useful QWORD "arrays" for simpler memory references +%define MSG(i) msg + 8*(i) ; Input message (arg1) +%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2) +%define K_t(i) K512 + 8*(i) ; SHA Constants (static mem) +%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame) +%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame) +; MSG, DIGEST, K_t, W_t are arrays +; WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even + +%macro RotateState 0 + ; Rotate symbles a..h right + %xdefine %%TMP h_64 + %xdefine h_64 g_64 + %xdefine g_64 f_64 + %xdefine f_64 e_64 + %xdefine e_64 d_64 + %xdefine d_64 c_64 + %xdefine c_64 b_64 + %xdefine b_64 a_64 + %xdefine a_64 %%TMP +%endmacro + +%macro SHA512_Round 1 +%assign %%t (%1) + + ; Compute Round %%t + mov T1, f_64 ; T1 = f + mov tmp0, e_64 ; tmp = e + xor T1, g_64 ; T1 = f ^ g + ror tmp0, 23 ; 41 ; tmp = e ror 23 + and T1, e_64 ; T1 = (f ^ g) & e + xor tmp0, e_64 ; tmp = (e ror 23) ^ e + xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler + ror tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4 + xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e + mov T2, a_64 ; T2 = a + add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h + ror tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov tmp0, a_64 ; tmp = a + xor T2, c_64 ; T2 = a ^ c + and tmp0, c_64 ; tmp = a & c + and T2, b_64 ; T2 = (a ^ c) & b + xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov tmp0, a_64 ; tmp = a + ror tmp0, 5 ; 39 ; tmp = a ror 5 + xor tmp0, a_64 ; tmp = (a ror 5) ^ a + add d_64, T1 ; e(next_state) = d + T1 + ror tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6 + xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a + lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c) + ror tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +%endmacro + +%macro SHA512_2Sched_2Round_sse 1 +%assign %%t (%1) + + ; Compute rounds %%t-2 and %%t-1 + ; Compute message schedule QWORDS %%t and %%t+1 + + ; Two rounds are computed based on the values for K[t-2]+W[t-2] and + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message + ; scheduler. + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + ; They are then added to their respective SHA512 constants at + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + ; For brievity, the comments following vectored instructions only refer to + ; the first of a pair of QWORDS. + ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} + ; The computation of the message schedule and the rounds are tightly + ; stitched to take advantage of instruction-level parallelism. + ; For clarity, integer instructions (for the rounds calculation) are indented + ; by one tab. Vectored instructions (for the message scheduler) are indented + ; by two tabs. + + mov T1, f_64 + movdqa xmm2, [W_t(%%t-2)] ; XMM2 = W[t-2] + xor T1, g_64 + and T1, e_64 + movdqa xmm0, xmm2 ; XMM0 = W[t-2] + xor T1, g_64 + add T1, [WK_2(%%t)] + movdqu xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15] + mov tmp0, e_64 + ror tmp0, 23 ; 41 + movdqa xmm3, xmm5 ; XMM3 = W[t-15] + xor tmp0, e_64 + ror tmp0, 4 ; 18 + psrlq xmm0, 61 - 19 ; XMM0 = W[t-2] >> 42 + xor tmp0, e_64 + ror tmp0, 14 ; 14 + psrlq xmm3, (8 - 7) ; XMM3 = W[t-15] >> 1 + add T1, tmp0 + add T1, h_64 + pxor xmm0, xmm2 ; XMM0 = (W[t-2] >> 42) ^ W[t-2] + mov T2, a_64 + xor T2, c_64 + pxor xmm3, xmm5 ; XMM3 = (W[t-15] >> 1) ^ W[t-15] + and T2, b_64 + mov tmp0, a_64 + psrlq xmm0, 19 - 6 ; XMM0 = ((W[t-2]>>42)^W[t-2])>>13 + and tmp0, c_64 + xor T2, tmp0 + psrlq xmm3, (7 - 1) ; XMM3 = ((W[t-15]>>1)^W[t-15])>>6 + mov tmp0, a_64 + ror tmp0, 5 ; 39 + pxor xmm0, xmm2 ; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] + xor tmp0, a_64 + ror tmp0, 6 ; 34 + pxor xmm3, xmm5 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] + xor tmp0, a_64 + ror tmp0, 28 ; 28 + psrlq xmm0, 6 ; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 + add T2, tmp0 + add d_64, T1 + psrlq xmm3, 1 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 + lea h_64, [T1 + T2] + RotateState + movdqa xmm1, xmm2 ; XMM1 = W[t-2] + mov T1, f_64 + xor T1, g_64 + movdqa xmm4, xmm5 ; XMM4 = W[t-15] + and T1, e_64 + xor T1, g_64 + psllq xmm1, (64 - 19) - (64 - 61) ; XMM1 = W[t-2] << 42 + add T1, [WK_2(%%t+1)] + mov tmp0, e_64 + psllq xmm4, (64 - 1) - (64 - 8) ; XMM4 = W[t-15] << 7 + ror tmp0, 23 ; 41 + xor tmp0, e_64 + pxor xmm1, xmm2 ; XMM1 = (W[t-2] << 42)^W[t-2] + ror tmp0, 4 ; 18 + xor tmp0, e_64 + pxor xmm4, xmm5 ; XMM4 = (W[t-15]<<7)^W[t-15] + ror tmp0, 14 ; 14 + add T1, tmp0 + psllq xmm1, (64 - 61) ; XMM1 = ((W[t-2] << 42)^W[t-2])<<3 + add T1, h_64 + mov T2, a_64 + psllq xmm4, (64 - 8) ; XMM4 = ((W[t-15]<<7)^W[t-15])<<56 + xor T2, c_64 + and T2, b_64 + pxor xmm0, xmm1 ; XMM0 = s1(W[t-2]) + mov tmp0, a_64 + and tmp0, c_64 + movdqu xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7] + xor T2, tmp0 + pxor xmm3, xmm4 ; XMM3 = s0(W[t-15]) + mov tmp0, a_64 + paddq xmm0, xmm3 ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + ror tmp0, 5 ; 39 + paddq xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] + xor tmp0, a_64 + paddq xmm0, xmm1 ; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] + ror tmp0, 6 ; 34 + movdqa [W_t(%%t)], xmm0 ; Store scheduled qwords + xor tmp0, a_64 + paddq xmm0, [K_t(t)] ; Compute W[t]+K[t] + ror tmp0, 28 ; 28 + movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] for next rounds + add T2, tmp0 + add d_64, T1 + lea h_64, [T1 + T2] + RotateState +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha512_sse4(const void* M, void* D, uint64_t L); +; Purpose: Updates the SHA512 digest stored at D with the message stored in M. +; The size of the message pointed to by M must be an integer multiple of SHA512 +; message blocks. +; L is the message length in SHA512 blocks. +global sha512_sse4:function +sha512_sse4: + cmp msglen, 0 + je .nowork + + ; Allocate Stack Space + sub rsp, frame_size + + ; Save GPRs + mov [rsp + frame.GPRSAVE + 8 * 0], rbx + mov [rsp + frame.GPRSAVE + 8 * 1], r12 + mov [rsp + frame.GPRSAVE + 8 * 2], r13 + mov [rsp + frame.GPRSAVE + 8 * 3], r14 + mov [rsp + frame.GPRSAVE + 8 * 4], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + frame.GPRSAVE + 8 * 5], rsi + mov [rsp + frame.GPRSAVE + 8 * 6], rdi +%endif + +.updateblock: + + ; Load state variables + mov a_64, [DIGEST(0)] + mov b_64, [DIGEST(1)] + mov c_64, [DIGEST(2)] + mov d_64, [DIGEST(3)] + mov e_64, [DIGEST(4)] + mov f_64, [DIGEST(5)] + mov g_64, [DIGEST(6)] + mov h_64, [DIGEST(7)] + + %assign t 0 + %rep 80/2 + 1 + ; (80 rounds) / (2 rounds/iteration) + (1 iteration) + ; +1 iteration because the scheduler leads hashing by 1 iteration + %if t < 2 + ; BSWAP 2 QWORDS + movdqa xmm1, [XMM_QWORD_BSWAP] + movdqu xmm0, [MSG(t)] + pshufb xmm0, xmm1 ; BSWAP + movdqa [W_t(t)], xmm0 ; Store Scheduled Pair + paddq xmm0, [K_t(t)] ; Compute W[t]+K[t] + movdqa [WK_2(t)], xmm0 ; Store into WK for rounds + %elif t < 16 + ; BSWAP 2 QWORDS; Compute 2 Rounds + movdqu xmm0, [MSG(t)] + pshufb xmm0, xmm1 ; BSWAP + SHA512_Round t - 2 ; Round t-2 + movdqa [W_t(t)], xmm0 ; Store Scheduled Pair + paddq xmm0, [K_t(t)] ; Compute W[t]+K[t] + SHA512_Round t - 1 ; Round t-1 + movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] into WK + %elif t < 79 + ; Schedule 2 QWORDS; Compute 2 Rounds + SHA512_2Sched_2Round_sse t + %else + ; Compute 2 Rounds + SHA512_Round t - 2 + SHA512_Round t - 1 + %endif + %assign t t+2 + %endrep + + ; Update digest + add [DIGEST(0)], a_64 + add [DIGEST(1)], b_64 + add [DIGEST(2)], c_64 + add [DIGEST(3)], d_64 + add [DIGEST(4)], e_64 + add [DIGEST(5)], f_64 + add [DIGEST(6)], g_64 + add [DIGEST(7)], h_64 + + ; Advance to next message block + add msg, 16*8 + dec msglen + jnz .updateblock + + ; Restore GPRs + mov rbx, [rsp + frame.GPRSAVE + 8 * 0] + mov r12, [rsp + frame.GPRSAVE + 8 * 1] + mov r13, [rsp + frame.GPRSAVE + 8 * 2] + mov r14, [rsp + frame.GPRSAVE + 8 * 3] + mov r15, [rsp + frame.GPRSAVE + 8 * 4] +%ifidn __OUTPUT_FORMAT__, win64 + mov rsi, [rsp + frame.GPRSAVE + 8 * 5] + mov rdi, [rsp + frame.GPRSAVE + 8 * 6] +%endif + ; Restore Stack Pointer + add rsp, frame_size + +.nowork: + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Binary Data + +section .data + +ALIGN 16 + +; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +XMM_QWORD_BSWAP: + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + +; K[t] used in SHA512 hashing +K512: + dq 0x428a2f98d728ae22,0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + dq 0x3956c25bf348b538,0x59f111f1b605d019 + dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242,0x12835b0145706fbe + dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235,0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + dq 0x983e5152ee66dfab,0xa831c66d2db43210 + dq 0xb00327c898fb213f,0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 + dq 0x06ca6351e003826f,0x142929670a0e6e70 + dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + dq 0x650a73548baf63de,0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6,0x92722c851482353b + dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 + dq 0xc24b8b70d0f89791,0xc76c51a30654be30 + dq 0xd192e819d6ef5218,0xd69906245565a910 + dq 0xf40e35855771202a,0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc,0x78a5636f43172f60 + dq 0x84c87814a1f0ab72,0x8cc702081a6439ec + dq 0x90befffa23631e28,0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915,0xc67178f2e372532b + dq 0xca273eceea26619c,0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae,0x1b710b35131c471b + dq 0x28db77f523047d84,0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + -- cgit v1.2.3