diff options
Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/md5_mb')
29 files changed, 7934 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am new file mode 100644 index 00000000..8001e431 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am @@ -0,0 +1,83 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+########################################################################

# Automake fragment for the md5_mb directory. It only appends to variables
# (lsrc, extern_hdrs, other_src, check_tests, unit_tests, perf_tests); those
# variables are defined outside this fragment.

# C context-manager wrappers, one per instruction set.
lsrc += md5_mb/md5_ctx_sse.c \
	md5_mb/md5_ctx_avx.c \
	md5_mb/md5_ctx_avx2.c

# Multi-buffer manager initialisation routines.
lsrc += md5_mb/md5_mb_mgr_init_sse.c \
	md5_mb/md5_mb_mgr_init_avx2.c \
	md5_mb/md5_mb_mgr_init_avx512.c

# Assembly submit/flush managers, hashing kernels and the multibinary dispatcher.
lsrc += md5_mb/md5_mb_mgr_submit_sse.asm \
	md5_mb/md5_mb_mgr_submit_avx.asm \
	md5_mb/md5_mb_mgr_submit_avx2.asm \
	md5_mb/md5_mb_mgr_flush_sse.asm \
	md5_mb/md5_mb_mgr_flush_avx.asm \
	md5_mb/md5_mb_mgr_flush_avx2.asm \
	md5_mb/md5_mb_x4x2_sse.asm \
	md5_mb/md5_mb_x4x2_avx.asm \
	md5_mb/md5_mb_x8x2_avx2.asm \
	md5_mb/md5_multibinary.asm

# AVX-512 sources.
lsrc += md5_mb/md5_mb_mgr_submit_avx512.asm \
	md5_mb/md5_mb_mgr_flush_avx512.asm \
	md5_mb/md5_mb_x16x2_avx512.asm \
	md5_mb/md5_ctx_avx512.c

# Headers appended to extern_hdrs.
extern_hdrs += include/md5_mb.h \
	include/multi_buffer.h

# Additional files appended to other_src (assembly includes, the reference
# implementation and internal headers).
other_src += include/datastruct.asm \
	md5_mb/md5_job.asm \
	md5_mb/md5_mb_mgr_datastruct.asm \
	md5_mb/md5_ref.c \
	include/reg_sizes.asm \
	include/multibinary.asm \
	include/memcpy_inline.h \
	include/intrinreg.h

check_tests += md5_mb/md5_mb_test \
	md5_mb/md5_mb_rand_test \
	md5_mb/md5_mb_rand_update_test

unit_tests += md5_mb/md5_mb_rand_ssl_test

perf_tests += md5_mb/md5_mb_vs_ossl_perf


# The two random tests additionally link against the reference implementation.
# Each is described twice: as a plain make dependency and as a per-target
# automake LDADD override.
md5_mb_rand_test: md5_ref.o
md5_mb_md5_mb_rand_test_LDADD = md5_mb/md5_ref.lo libisal_crypto.la
md5_mb_rand_update_test: md5_ref.o
md5_mb_md5_mb_rand_update_test_LDADD = md5_mb/md5_ref.lo libisal_crypto.la

# The OpenSSL comparison programs need libcrypto.
# NOTE(review): -lcrypto is passed through *_LDFLAGS; the Automake manual
# recommends *_LDADD for -l options -- confirm link order with --as-needed.
md5_mb_rand_ssl_test: LDLIBS += -lcrypto
md5_mb_md5_mb_rand_ssl_test_LDFLAGS = -lcrypto
md5_mb_vs_ossl_perf: LDLIBS += -lcrypto
md5_mb_md5_mb_vs_ossl_perf_LDFLAGS = -lcrypto

diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c
new file mode 100644
index 00000000..2125be63
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel
Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "md5_mb.h" +#include "memcpy_inline.h" + +#ifdef _MSC_VER +#include <intrin.h> +#define inline __inline +#endif + +static inline void hash_init_digest(MD5_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len); +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx); + +void md5_ctx_mgr_init_avx(MD5_HASH_CTX_MGR * mgr) +{ + md5_mb_mgr_init_avx(&mgr->mgr); +} + +MD5_HASH_CTX *md5_ctx_mgr_submit_avx(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. 
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, &ctx->job); + } + } + + return md5_ctx_mgr_resubmit(mgr, ctx); +} + +MD5_HASH_CTX *md5_ctx_mgr_flush_avx(MD5_HASH_CTX_MGR * mgr) +{ + MD5_HASH_CTX *ctx; + + while (1) { + ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = md5_ctx_mgr_resubmit(mgr, ctx); + + // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned. + if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop. 
+ } +} + +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len); + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % MD5_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= MD5_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(MD5_WORD_T * digest) +{ + static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] = + { MD5_INITIAL_DIGEST }; + //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest)); + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len) +{ + uint32_t i = total_len & (MD5_BLOCK_SIZE - 1); + + // memset(&padblock[i], 0, MD5_BLOCK_SIZE); + memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 + + MD5_PADLENGTHFIELD_SIZE; + + *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3); + + return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver md5_ctx_mgr_init_avx_slver_02020183; +struct slver md5_ctx_mgr_init_avx_slver = { 0x0183, 0x02, 0x02 }; + +struct slver md5_ctx_mgr_submit_avx_slver_02020184; +struct slver md5_ctx_mgr_submit_avx_slver = { 0x0184, 0x02, 0x02 }; + +struct slver md5_ctx_mgr_flush_avx_slver_02020185; +struct slver md5_ctx_mgr_flush_avx_slver = { 0x0185, 0x02, 0x02 }; diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c new file mode 100644 index 00000000..71618a3c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c @@ -0,0 +1,249 @@ 
+/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "md5_mb.h" +#include "memcpy_inline.h" + +#ifdef _MSC_VER +#include <intrin.h> +#define inline __inline +#endif + +static inline void hash_init_digest(MD5_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len); +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx); + +void md5_ctx_mgr_init_avx2(MD5_HASH_CTX_MGR * mgr) +{ + md5_mb_mgr_init_avx2(&mgr->mgr); +} + +MD5_HASH_CTX *md5_ctx_mgr_submit_avx2(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? 
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job); + } + } + + return md5_ctx_mgr_resubmit(mgr, ctx); +} + +MD5_HASH_CTX *md5_ctx_mgr_flush_avx2(MD5_HASH_CTX_MGR * mgr) +{ + MD5_HASH_CTX *ctx; + + while (1) { + ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx2(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = md5_ctx_mgr_resubmit(mgr, ctx); + + // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned. 
+ if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop. + } +} + +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len); + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % MD5_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= MD5_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(MD5_WORD_T * digest) +{ + static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] = + { MD5_INITIAL_DIGEST }; + //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest)); + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len) +{ + uint32_t i = total_len & (MD5_BLOCK_SIZE - 1); + + // memset(&padblock[i], 0, MD5_BLOCK_SIZE); + memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 + + MD5_PADLENGTHFIELD_SIZE; + + *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3); + + return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver md5_ctx_mgr_init_avx2_slver_04020186; +struct slver md5_ctx_mgr_init_avx2_slver = { 0x0186, 0x02, 0x04 }; + +struct slver md5_ctx_mgr_submit_avx2_slver_04020187; +struct slver md5_ctx_mgr_submit_avx2_slver = { 0x0187, 0x02, 0x04 }; + +struct slver md5_ctx_mgr_flush_avx2_slver_04020188; +struct slver md5_ctx_mgr_flush_avx2_slver = { 0x0188, 0x02, 0x04 }; diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c new file mode 100644 index 00000000..a7f54c2b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c @@ -0,0 
+1,253 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "md5_mb.h" +#include "memcpy_inline.h" + +#ifdef _MSC_VER +#include <intrin.h> +#define inline __inline +#endif + +#ifdef HAVE_AS_KNOWS_AVX512 + +static inline void hash_init_digest(MD5_WORD_T * digest); +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len); +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx); + +void md5_ctx_mgr_init_avx512(MD5_HASH_CTX_MGR * mgr) +{ + md5_mb_mgr_init_avx512(&mgr->mgr); +} + +MD5_HASH_CTX *md5_ctx_mgr_submit_avx512(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx, + const void *buffer, uint32_t len, HASH_CTX_FLAG flags) +{ + if (flags & (~HASH_ENTIRE)) { + // User should not pass anything other than FIRST, UPDATE, or LAST + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + // Cannot submit to a currently processing job. + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + // Cannot update a finished job. + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + if (flags & HASH_FIRST) { + // Init digest + hash_init_digest(ctx->job.result_digest); + + // Reset byte counter + ctx->total_length = 0; + + // Clear extra blocks + ctx->partial_block_buffer_length = 0; + } + // If we made it here, there were no errors during this call to submit + ctx->error = HASH_CTX_ERROR_NONE; + + // Store buffer ptr info from user + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + // Store the user's request flags and mark this ctx as currently being processed. + ctx->status = (flags & HASH_LAST) ? 
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + // Advance byte counter + ctx->total_length += len; + + // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block. + // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block. + if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) { + // Compute how many bytes to copy from user buffer into extra block + uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + // Copy and update relevant pointers and counters + memcpy_varlen(&ctx->partial_block_buffer + [ctx->partial_block_buffer_length], buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + // The extra block should never contain more than 1 block here + assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE); + + // If the extra block buffer contains exactly 1 block, it can be hashed. + if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job); + } + } + + return md5_ctx_mgr_resubmit(mgr, ctx); +} + +MD5_HASH_CTX *md5_ctx_mgr_flush_avx512(MD5_HASH_CTX_MGR * mgr) +{ + MD5_HASH_CTX *ctx; + + while (1) { + ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx512(&mgr->mgr); + + // If flush returned 0, there are no more jobs in flight. + if (!ctx) + return NULL; + + // If flush returned a job, verify that it is safe to return to the user. + // If it is not ready, resubmit the job to finish processing. + ctx = md5_ctx_mgr_resubmit(mgr, ctx); + + // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned. 
+ if (ctx) + return ctx; + + // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop. + } +} + +static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx) +{ + while (ctx) { + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit + return ctx; + } + // If the extra blocks are empty, begin hashing what remains in the user's buffer. + if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) { + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + + // Only entire blocks can be hashed. Copy remainder to extra blocks buffer. + uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1); + + if (copy_len) { + len -= copy_len; + //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len); + memcpy_varlen(ctx->partial_block_buffer, + ((const char *)buffer + len), copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + // len should be a multiple of the block size now + assert((len % MD5_BLOCK_SIZE) == 0); + + // Set len to the number of blocks to be hashed in the user's buffer + len >>= MD5_LOG2_BLOCK_SIZE; + + if (len) { + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, + &ctx->job); + continue; + } + } + // If the extra blocks are not empty, then we are either on the last block(s) + // or we need more user input before continuing. 
+ if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length); + + ctx->status = + (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE); + + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static inline void hash_init_digest(MD5_WORD_T * digest) +{ + static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] = + { MD5_INITIAL_DIGEST }; + //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest)); + memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest)); +} + +static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len) +{ + uint32_t i = total_len & (MD5_BLOCK_SIZE - 1); + + // memset(&padblock[i], 0, MD5_BLOCK_SIZE); + memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 + + MD5_PADLENGTHFIELD_SIZE; + + *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3); + + return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; +struct slver md5_ctx_mgr_init_avx512_slver_0600018c; +struct slver md5_ctx_mgr_init_avx512_slver = { 0x018c, 0x00, 0x06 }; + +struct slver md5_ctx_mgr_submit_avx512_slver_0600018d; +struct slver md5_ctx_mgr_submit_avx512_slver = { 0x018d, 0x00, 0x06 }; + +struct slver md5_ctx_mgr_flush_avx512_slver_0600018e; +struct slver md5_ctx_mgr_flush_avx512_slver = { 0x018e, 0x00, 0x06 }; + +#endif // HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c new file mode 100644 index 00000000..8688dfc3 --- /dev/null +++ 
b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c @@ -0,0 +1,249 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#include <string.h>		// memcpy() for the length store in hash_pad()
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+/*
+ * Initialise a context manager for the SSE code path by initialising the
+ * job manager embedded in it.
+ */
+void md5_ctx_mgr_init_sse(MD5_HASH_CTX_MGR * mgr)
+{
+	md5_mb_mgr_init_sse(&mgr->mgr);
+}
+
+/*
+ * Submit `len` bytes at `buffer` to the hash context `ctx`.
+ *
+ * `flags` may only contain bits of HASH_ENTIRE (FIRST, UPDATE, LAST).
+ *
+ * Returns:
+ *  - `ctx` itself, with ctx->error set, when the flags are invalid, the
+ *    context is still being processed, or it is already complete and
+ *    HASH_FIRST was not given;
+ *  - otherwise whatever md5_ctx_mgr_resubmit() yields: a context that is
+ *    complete or idle (not necessarily `ctx`), or NULL when every managed
+ *    context still needs processing.
+ *
+ * The user buffer is referenced, not copied (except for partial blocks), so
+ * it has to stay valid until the context is handed back.
+ */
+MD5_HASH_CTX *md5_ctx_mgr_submit_sse(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+				     const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		// User should not pass anything other than FIRST, UPDATE, or LAST
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		// Cannot submit to a currently processing job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		// Cannot update a finished job.
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		// Init digest
+		hash_init_digest(ctx->job.result_digest);
+
+		// Reset byte counter
+		ctx->total_length = 0;
+
+		// Clear extra blocks
+		ctx->partial_block_buffer_length = 0;
+	}
+	// If we made it here, there were no errors during this call to submit
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	// Store buffer ptr info from user
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	// Store the user's request flags and mark this ctx as currently being processed.
+	ctx->status = (flags & HASH_LAST) ?
+	    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+	    HASH_CTX_STS_PROCESSING;
+
+	// Advance byte counter
+	ctx->total_length += len;
+
+	// If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+	// Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+	if (ctx->partial_block_buffer_length || len < MD5_BLOCK_SIZE) {
+		// Compute how many bytes to copy from user buffer into extra block
+		uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			// Copy and update relevant pointers and counters
+			memcpy_varlen(&ctx->partial_block_buffer
+				      [ctx->partial_block_buffer_length], buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+		// The extra block should never contain more than 1 block here
+		assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+		// If the extra block buffer contains exactly 1 block, it can be hashed.
+		if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;
+			// The job manager hands back a job pointer that is used as the
+			// owning context (presumably `job` is the first member of
+			// MD5_HASH_CTX -- defined in md5_mb.h, not visible here).
+			ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+		}
+	}
+
+	return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+/*
+ * Drain the manager: keep flushing the underlying job manager until a
+ * context is ready to be returned to the user.
+ *
+ * Returns a complete or idle context, or NULL once no jobs are in flight.
+ */
+MD5_HASH_CTX *md5_ctx_mgr_flush_sse(MD5_HASH_CTX_MGR * mgr)
+{
+	MD5_HASH_CTX *ctx;
+
+	while (1) {
+		ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_sse(&mgr->mgr);
+
+		// If flush returned 0, there are no more jobs in flight.
+		if (!ctx)
+			return NULL;
+
+		// If flush returned a job, verify that it is safe to return to the user.
+		// If it is not ready, resubmit the job to finish processing.
+		ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+		// If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+		if (ctx)
+			return ctx;
+
+		// Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+	}
+}
+
+/*
+ * Advance `ctx` (and any context the job manager hands back in its place)
+ * until one of them can be returned to the user.
+ *
+ * Each iteration either
+ *  - returns a context whose COMPLETE bit is set (PROCESSING is cleared),
+ *  - submits the whole blocks remaining in the user's buffer, stashing the
+ *    trailing partial block in partial_block_buffer,
+ *  - pads and submits the final block(s) when HASH_CTX_STS_LAST is set, or
+ *  - marks the context idle and returns it (more user input is needed).
+ *
+ * Returns NULL when the job manager returned no job, i.e. everything
+ * submitted so far is still in flight.
+ */
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+	while (ctx) {
+
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			ctx->status = HASH_CTX_STS_COMPLETE;	// Clear PROCESSING bit
+			return ctx;
+		}
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+		if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+
+			// Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+			uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy_varlen(ctx->partial_block_buffer,
+					      ((const char *)buffer + len), copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			// len should be a multiple of the block size now
+			assert((len % MD5_BLOCK_SIZE) == 0);
+
+			// Set len to the number of blocks to be hashed in the user's buffer
+			len >>= MD5_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr,
+									     &ctx->job);
+				continue;
+			}
+		}
+		// If the extra blocks are not empty, then we are either on the last block(s)
+		// or we need more user input before continuing.
+		if (ctx->status & HASH_CTX_STS_LAST) {
+
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+			ctx->status =
+			    (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+			continue;
+		}
+
+		// ctx is known to be non-NULL here (loop condition), so no extra check.
+		ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+/*
+ * Load the MD5 initial chaining value (MD5_INITIAL_DIGEST) into `digest`,
+ * which must have room for MD5_DIGEST_NWORDS words.
+ */
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+	static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+	    { MD5_INITIAL_DIGEST };
+	memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+/*
+ * Append the MD5 padding to the partial block held in `padblock`.
+ *
+ * `padblock` holds the (total_len mod MD5_BLOCK_SIZE) trailing message bytes
+ * at its start. One block's worth of bytes after them is cleared, the 0x80
+ * marker is written, and the message length in bits is stored in the last
+ * 8 bytes of the final padded block.
+ *
+ * The length is stored in host byte order, exactly as the previous casted
+ * store did; it is now written with memcpy() so that the code no longer
+ * accesses a uint8_t array through a uint64_t lvalue (aliasing/alignment).
+ *
+ * Returns the number of blocks in `padblock` that must be hashed.
+ */
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len)
+{
+	uint32_t i = total_len & (MD5_BLOCK_SIZE - 1);
+	const uint64_t bit_len = (uint64_t) total_len << 3;
+
+	memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	// Round up to the end of the block that has room for the length field.
+	i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+	    MD5_PADLENGTHFIELD_SIZE;
+
+	memcpy(&padblock[i - sizeof(bit_len)], &bit_len, sizeof(bit_len));
+
+	return i >> MD5_LOG2_BLOCK_SIZE;	// Number of extra blocks to hash
+}
+
+struct slver {
+	uint16_t snum;
+	uint8_t ver;
+	uint8_t core;
+};
+struct slver md5_ctx_mgr_init_sse_slver_00020180;
+struct slver md5_ctx_mgr_init_sse_slver = { 0x0180, 0x02, 0x00 };
+
+struct slver md5_ctx_mgr_submit_sse_slver_00020181;
+struct slver md5_ctx_mgr_submit_sse_slver = { 0x0181, 0x02, 0x00 };
+
+struct slver md5_ctx_mgr_flush_sse_slver_00020182;
+struct slver md5_ctx_mgr_flush_sse_slver = { 0x0182, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm
new file mode 100644
index 00000000..9f4c510c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm
@@ -0,0 +1,55 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Field offsets of the MD5 job structure, for use by the assembly job managers.
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Values written to the job's _status field.
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define JOB_MD5 structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; JOB_MD5
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 4, 4 ; job length; NOTE(review): the md5_ctx_* C layer stores a count of blocks in job.len, not bytes -- confirm against md5_mb.h
+FIELD _result_digest, 4*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+%assign _JOB_MD5_size _FIELD_OFFSET
+%assign _JOB_MD5_align _STRUCT_ALIGN
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm
new file mode 100644
index 00000000..63743cef
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm
@@ -0,0 +1,73 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define MD5 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Per-lane bookkeeping: one entry per lane in the manager's _ldata array.
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Kernel arguments, sized for 32 lanes.
+START_FIELDS ; MD5_ARGS_X32
+;;; name size align
+FIELD _digest, 4*4*32, 16 ; transposed digest
+FIELD _data_ptr, 8*32, 8 ; array of pointers to data
+END_FIELDS
+
+; The X8, X16 and X32 names below all take the size/alignment of the single
+; 32-lane layout defined above.
+%assign _MD5_ARGS_X8_size _FIELD_OFFSET
+%assign _MD5_ARGS_X8_align _STRUCT_ALIGN
+%assign _MD5_ARGS_X16_size _FIELD_OFFSET
+%assign _MD5_ARGS_X16_align _STRUCT_ALIGN
+%assign _MD5_ARGS_X32_size _FIELD_OFFSET
+%assign _MD5_ARGS_X32_align _STRUCT_ALIGN
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Manager state. _args comes first, so the manager address is also passed
+; to the kernels as their argument block (see the flush routines).
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _MD5_ARGS_X8_size, _MD5_ARGS_X8_align
+; One dword per lane. The flush routines split each value into a lane index
+; in the low bits (4 bits for avx/avx2, 6 bits for avx512) and a length in
+; the remaining bits; 0xFFFFFFFF marks a lane without a job.
+FIELD _lens, 4*32, 8
+; Stack of free lane indices: the avx/avx2 flush code pushes 4-bit entries
+; into the first qword, the avx512 flush code pushes one byte per entry.
+FIELD _unused_lanes, 8*4, 8
+FIELD _ldata, _LANE_DATA_size*32, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+; Offsets of the argument fields measured from the start of the manager.
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm
new file mode 100644
index 00000000..7b681136
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm
@@ -0,0 +1,243 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_avx
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_avx
+%define idx r8
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+; job_rax and tmp share rax: tmp is only live while lane data is copied,
+; job_rax holds the return value afterwards.
+%define job_rax rax
+%define tmp rax
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+; (presumably so that rsp is 16-byte aligned after the "sub rsp" below, as the
+; vmovdqa register saves require -- assumes the usual 8-mod-16 rsp at entry)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_avx(MB_MGR_HMAC_OOO *state)
+; arg 1 : rdi (elf64) / rcx (other output formats) : state
+;
+; Forces one in-flight job to completion and returns it in rax, or returns
+; NULL when no lane holds a job. Empty lanes are pointed at the data of a
+; busy lane and given length 0xFFFFFFFF, the minimum length over the 8 lanes
+; is found, the kernel is run for that length (skipped when it is 0), and the
+; lane that reached length 0 is retired: its job is marked STS_COMPLETED, the
+; lane is pushed back onto unused_lanes and the digest is copied to the job.
+global md5_mb_mgr_flush_avx:function
+md5_mb_mgr_flush_avx:
+	sub rsp, STACK_SPACE
+	mov [rsp + _GPR_SAVE + 8*0], rbx
+	mov [rsp + _GPR_SAVE + 8*3], rbp
+	mov [rsp + _GPR_SAVE + 8*4], r12
+	mov [rsp + _GPR_SAVE + 8*5], r13
+	mov [rsp + _GPR_SAVE + 8*6], r14
+	mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + _GPR_SAVE + 8*1], rsi
+	mov [rsp + _GPR_SAVE + 8*2], rdi
+	vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+	vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; if bit (32+3) is set, then all lanes are empty
+	mov unused_lanes, [state + _unused_lanes]
+	bt unused_lanes, 32+3
+	jc return_null
+
+	; find a lane with a non-null job
+	; (idx starts at 0 and is overwritten by every busy lane 1..7, so it ends
+	; up as the highest busy lane, or 0 if none of lanes 1..7 is busy)
+	xor idx, idx
+	cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [one]
+	cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [two]
+	cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [three]
+	cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [four]
+	cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [five]
+	cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [six]
+	cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [seven]
+
+	; copy idx to empty lanes
+	; (every lane without a job gets lane idx's data pointer and the maximum
+	; length 0xFFFFFFFF, so it can never be selected as the minimum below)
+copy_lane_data:
+	mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args + _data_ptr + 8*I], tmp
+	mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	vmovdqa xmm0, [state + _lens + 0*16]
+	vmovdqa xmm1, [state + _lens + 1*16]
+
+	vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+	vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+	vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+	vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+	vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+	; split the minimum into lane index (low 4 bits) and length (upper bits)
+	vmovd DWORD(idx), xmm2
+	mov len2, idx
+	and idx, 0xF
+	shr len2, 4
+	jz len_is_0
+
+	; subtract the minimum length (lane bits masked off) from every lane
+	vpand xmm2, xmm2, [rel clear_low_nibble]
+	vpshufd xmm2, xmm2, 0
+
+	vpsubd xmm0, xmm0, xmm2
+	vpsubd xmm1, xmm1, xmm2
+
+	vmovdqa [state + _lens + 0*16], xmm0
+	vmovdqa [state + _lens + 1*16], xmm1
+
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call md5_mb_x4x2_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]
+	mov qword [lane_data + _job_in_lane], 0
+	mov dword [job_rax + _status], STS_COMPLETED
+	; push the freed lane index onto the nibble stack of unused lanes
+	mov unused_lanes, [state + _unused_lanes]
+	shl unused_lanes, 4
+	or unused_lanes, idx
+	mov [state + _unused_lanes], unused_lanes
+
+	mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+	; gather the four digest words of lane idx (stored 32 bytes apart)
+	vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+	vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+	vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov rsi, [rsp + _GPR_SAVE + 8*1]
+	mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov rbx, [rsp + _GPR_SAVE + 8*0]
+	mov rbp, [rsp + _GPR_SAVE + 8*3]
+	mov r12, [rsp + _GPR_SAVE + 8*4]
+	mov r13, [rsp + _GPR_SAVE + 8*5]
+	mov r14, [rsp + _GPR_SAVE + 8*6]
+	mov r15, [rsp + _GPR_SAVE + 8*7]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+
+
+section .data align=16
+
+align 16
+; mask that clears the lane-index nibble of the low dword
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+; memory operands for the cmovne lane selection above
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm
new file mode 100644
index 00000000..ecc28319
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm
@@ -0,0 +1,251 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x8x2_avx2
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x8x2_avx2
+%define idx rbp
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+; job_rax and tmp share rax: tmp is only live while lane data is copied,
+; job_rax holds the return value afterwards.
+%define job_rax rax
+%define tmp rax
+
+%define num_lanes_inuse r8
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+; (presumably so that rsp is 16-byte aligned after the "sub rsp" below, as the
+; vmovdqa register saves require -- assumes the usual 8-mod-16 rsp at entry)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_avx2(MB_MGR_HMAC_OOO *state)
+; arg 1 : rdi (elf64) / rcx (other output formats) : state
+;
+; Forces one in-flight job to completion and returns it in rax, or returns
+; NULL when _num_lanes_inuse is 0. Empty lanes are pointed at the data of a
+; busy lane and given length 0xFFFFFFFF, the minimum length over the 16 lanes
+; is found, the kernel is run for that length (skipped when it is 0), and the
+; lane that reached length 0 is retired: its job is marked STS_COMPLETED, the
+; lane is pushed back onto unused_lanes, _num_lanes_inuse is decremented and
+; the digest is copied to the job.
+global md5_mb_mgr_flush_avx2:function
+md5_mb_mgr_flush_avx2:
+	sub rsp, STACK_SPACE
+	mov [rsp + _GPR_SAVE + 8*0], rbx
+	mov [rsp + _GPR_SAVE + 8*3], rbp
+	mov [rsp + _GPR_SAVE + 8*4], r12
+	mov [rsp + _GPR_SAVE + 8*5], r13
+	mov [rsp + _GPR_SAVE + 8*6], r14
+	mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + _GPR_SAVE + 8*1], rsi
+	mov [rsp + _GPR_SAVE + 8*2], rdi
+	vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+	vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; nothing to flush when no lane is in use
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	cmp num_lanes_inuse, 0
+	jz return_null
+
+	; find a lane with a non-null job
+	; (idx starts at 0 and is overwritten by every busy lane 1..15, so it ends
+	; up as the highest busy lane, or 0 if none of lanes 1..15 is busy)
+	xor idx, idx
+%assign I 1
+%rep 15
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+	; copy idx to empty lanes
+	; (every lane without a job gets lane idx's data pointer and the maximum
+	; length 0xFFFFFFFF, so it can never be selected as the minimum below)
+copy_lane_data:
+	mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args + _data_ptr + 8*I], tmp
+	mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	vmovdqu ymm0, [state + _lens + 0*32]
+	vmovdqu ymm1, [state + _lens + 1*32]
+
+	vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+	vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+	vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+	vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+	; split the minimum into lane index (low 4 bits) and length (upper bits)
+	vmovd DWORD(idx), xmm2
+	mov len2, idx
+	and idx, 0xF
+	shr len2, 4
+	jz len_is_0
+
+	; subtract the minimum length (lane bits masked off) from every lane
+	vpand ymm2, ymm2, [rel clear_low_nibble]
+	vpshufd ymm2, ymm2, 0
+
+	vpsubd ymm0, ymm0, ymm2
+	vpsubd ymm1, ymm1, ymm2
+
+	vmovdqu [state + _lens + 0*32], ymm0
+	vmovdqu [state + _lens + 1*32], ymm1
+
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call md5_mb_x8x2_avx2
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]
+	mov unused_lanes, [state + _unused_lanes]
+	mov qword [lane_data + _job_in_lane], 0
+	mov dword [job_rax + _status], STS_COMPLETED
+	; push the freed lane index onto the nibble stack of unused lanes
+	shl unused_lanes, 4
+	or unused_lanes, idx
+	mov [state + _unused_lanes], unused_lanes
+
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	sub num_lanes_inuse, 1
+	mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+	; gather the four digest words of lane idx (stored 64 bytes apart)
+	vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+
+	vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+	vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov rsi, [rsp + _GPR_SAVE + 8*1]
+	mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov rbx, [rsp + _GPR_SAVE + 8*0]
+	mov rbp, [rsp + _GPR_SAVE + 8*3]
+	mov r12, [rsp + _GPR_SAVE + 8*4]
+	mov r13, [rsp + _GPR_SAVE + 8*5]
+	mov r14, [rsp + _GPR_SAVE + 8*6]
+	mov r15, [rsp + _GPR_SAVE + 8*7]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+
+
+section .data align=16
+
+align 16
+; mask that clears the lane-index nibble of the low dword in each 128-bit half
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+; memory operands for the cmovne lane selection above
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm
new file mode 100644
index 00000000..e8d4ca03
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm
@@ -0,0 +1,313 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern md5_mb_x16x2_avx512
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x16x2_avx512
+%define idx rbp
+
+; unused_lanes is only the scratch register handed to MEM_VPSLLDDQ;
+; the low qword of the free-lane stack is updated through "lane".
+%define unused_lanes ymm7
+%define lane r9
+
+%define lane_data r10
+
+; job_rax and tmp share rax: tmp is only live while lane data is copied,
+; job_rax holds the return value afterwards.
+%define job_rax rax
+%define tmp rax
+
+%define num_lanes_inuse r8
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+; (presumably so that rsp is 16-byte aligned after the "sub rsp" below, as the
+; vmovdqa register saves require -- assumes the usual 8-mod-16 rsp at entry)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+;; Shift the 32 bytes at MEM addr down by one byte (byte 31 becomes 0);
+;; reads one extra byte at [addr+32]. Not referenced in this file.
+%macro MEM_VPSRLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+	vmovdqu %%TMP_YMM, [%%addr + 1]
+	vmovdqu [%%addr], %%TMP_YMM
+	mov [%%addr + 31], byte 0
+%endmacro
+
+;; Shift the 32 bytes at MEM addr up by one byte (byte 0 becomes 0);
+;; reads one extra byte at [addr-1]
+%macro MEM_VPSLLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+	vmovdqu %%TMP_YMM, [%%addr-1]
+	vmovdqu [%%addr], %%TMP_YMM
+	mov [%%addr], byte 0
+%endmacro
+
+align 64
+default rel
+section .text
+
+; JOB* md5_mb_mgr_flush_avx512(MB_MGR_HMAC_OOO *state)
+; arg 1 : rdi (elf64) / rcx (other output formats) : state
+;
+; Forces one in-flight job to completion and returns it in rax, or returns
+; NULL when _num_lanes_inuse is 0. Empty lanes are pointed at the data of a
+; busy lane and given length 0xFFFFFFFF, the minimum length over the 32 lanes
+; is found, the kernel is run for that length (skipped when it is 0), and the
+; lane that reached length 0 is retired: its job is marked STS_COMPLETED, the
+; lane is pushed back onto the byte-wide unused_lanes stack, _num_lanes_inuse
+; is decremented and the digest is copied to the job.
+global md5_mb_mgr_flush_avx512:function
+md5_mb_mgr_flush_avx512:
+	sub rsp, STACK_SPACE
+	mov [rsp + _GPR_SAVE + 8*0], rbx
+	mov [rsp + _GPR_SAVE + 8*3], rbp
+	mov [rsp + _GPR_SAVE + 8*4], r12
+	mov [rsp + _GPR_SAVE + 8*5], r13
+	mov [rsp + _GPR_SAVE + 8*6], r14
+	mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + _GPR_SAVE + 8*1], rsi
+	mov [rsp + _GPR_SAVE + 8*2], rdi
+	vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+	vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+	vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+	vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+	vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+	vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+	vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+	vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+	vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+	vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; nothing to flush when no lane is in use
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	cmp num_lanes_inuse, 0
+	jz return_null
+
+	; find a lane with a non-null job
+	; (idx starts at 0 and is overwritten by every busy lane 1..31, so it ends
+	; up as the highest busy lane, or 0 if none of lanes 1..31 is busy)
+	xor idx, idx
+%assign I 1
+%rep 31
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+	; copy idx to empty lanes
+	; (every lane without a job gets lane idx's data pointer and the maximum
+	; length 0xFFFFFFFF, so it can never be selected as the minimum below)
+copy_lane_data:
+	mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 32
+	cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args + _data_ptr + 8*I], tmp
+	mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length (lanes 0..15)
+	vmovdqu ymm0, [state + _lens + 0*32]
+	vmovdqu ymm1, [state + _lens + 1*32]
+
+	vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+	vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+	vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+	vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+	; Find min length (lanes 16..31)
+	vmovdqu ymm5, [state + _lens + 2*32]
+	vmovdqu ymm6, [state + _lens + 3*32]
+
+	vpminud ymm4, ymm5, ymm6 ; ymm4 has {D,C,B,A}
+	vpalignr ymm3, ymm3, ymm4, 8 ; ymm3 has {x,x,D,C}
+	vpminud ymm4, ymm4, ymm3 ; ymm4 has {x,x,E,F}
+	vpalignr ymm3, ymm3, ymm4, 4 ; ymm3 has {x,x,x,E}
+	vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+	vperm2i128 ymm3, ymm4, ymm4, 1 ; ymm3 has halves of ymm4 reversed
+	vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+
+	vpminud ymm2, ymm2, ymm4 ; ymm2 has min value in low dword
+	; split the minimum into lane index (low 6 bits) and length (upper bits)
+	vmovd DWORD(idx), xmm2
+	mov len2, idx
+	and idx, 0x3F
+	shr len2, 6
+	jz len_is_0
+
+	; subtract the minimum length (lane bits masked off) from every lane
+	vpand ymm2, ymm2, [rel clear_low_6bits]
+	vpshufd ymm2, ymm2, 0
+
+	vpsubd ymm0, ymm0, ymm2
+	vpsubd ymm1, ymm1, ymm2
+	vpsubd ymm5, ymm5, ymm2
+	vpsubd ymm6, ymm6, ymm2
+
+	vmovdqu [state + _lens + 0*32], ymm0
+	vmovdqu [state + _lens + 1*32], ymm1
+	vmovdqu [state + _lens + 2*32], ymm5
+	vmovdqu [state + _lens + 3*32], ymm6
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call md5_mb_x16x2_avx512
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]
+	mov lane, [state + _unused_lanes]
+	mov qword [lane_data + _job_in_lane], 0
+	mov dword [job_rax + _status], STS_COMPLETED
+
+	; push the freed lane index onto the byte stack of unused lanes: the whole
+	; 32-byte array is shifted up by one byte in memory, then the low qword is
+	; rewritten with (old low qword << 8) | idx
+	shl lane, 8
+	or lane, idx
+	MEM_VPSLLDDQ (state + _unused_lanes), unused_lanes
+	mov [state + _unused_lanes], lane
+
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	sub num_lanes_inuse, 1
+	mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+	; gather the four digest words of lane idx (stored 4*16*2 = 128 bytes apart)
+	vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16*2]
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3
+
+	vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+	vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+	vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+	vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+	vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+	vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+	vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+	vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+	vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+	vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov rsi, [rsp + _GPR_SAVE + 8*1]
+	mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov rbx, [rsp + _GPR_SAVE + 8*0]
+	mov rbp, [rsp + _GPR_SAVE + 8*3]
+	mov r12, [rsp + _GPR_SAVE + 8*4]
+	mov r13, [rsp + _GPR_SAVE + 8*5]
+	mov r14, [rsp + _GPR_SAVE + 8*6]
+	mov r15, [rsp + _GPR_SAVE + 8*7]
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+
+
+section .data align=16
+
+align 16
+; mask that clears the 6 lane-index bits of the low dword in each 128-bit half
+clear_low_6bits:
+	dq 0x00000000FFFFFFC0, 0x0000000000000000
+	dq 0x00000000FFFFFFC0, 0x0000000000000000
+; memory operands for the cmovne lane selection above
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+lane_16: dq 16
+lane_17: dq 17
+lane_18: dq 18
+lane_19: dq 19
+lane_20: dq 20
+lane_21: dq 21
+lane_22: dq 22
+lane_23: dq 23
+lane_24: dq 24
+lane_25: dq 25
+lane_26: dq 26
+lane_27: dq 27
+lane_28: dq 28
+lane_29: dq 29
+lane_30: dq 30
+lane_31: dq 31
+
+%else
+; Assembler without AVX512 support: on win64 a dummy global symbol is
+; emitted in place of the flush routine.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_mgr_flush_avx512
+no_md5_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm new file mode 100644 index 00000000..7ee81616 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm @@ -0,0 +1,244 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_sse
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_sse
+%define idx r8
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+; job_rax and tmp share rax: tmp is only live while copying the data
+; pointer, job_rax only after the kernel call
+%define job_rax rax
+%define tmp rax
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+; (10*16 + 8*8 + 8 = 232 bytes; the xmm save slots are written with movdqa,
+; so rsp + _XMM_SAVE has to end up 16-byte aligned after the sub)
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+; token-pasting helper used to build the per-iteration skip_N labels
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_sse(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state (win64; rdi when assembled for elf64, see arg1 above)
+;
+; Completes and returns one in-flight job without waiting for all 8 lanes to
+; be filled. Returns NULL (rax = 0) when every lane is empty.
+; Steps: pick a lane that holds a job, point every empty lane at that lane's
+; data pointer with length 0xFFFFFFFF, find the lane with the smallest packed
+; length, run md5_mb_x4x2_sse for that length (skipped when it is 0), then
+; retire that lane's job and copy its digest into the job's result_digest.
+global md5_mb_mgr_flush_sse:function
+md5_mb_mgr_flush_sse:
+	sub	rsp, STACK_SPACE
+	mov	[rsp + _GPR_SAVE + 8*0], rbx
+	mov	[rsp + _GPR_SAVE + 8*3], rbp
+	mov	[rsp + _GPR_SAVE + 8*4], r12
+	mov	[rsp + _GPR_SAVE + 8*5], r13
+	mov	[rsp + _GPR_SAVE + 8*6], r14
+	mov	[rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	; win64 only: rsi, rdi and xmm6-xmm15 are also saved here and restored at "return"
+	mov	[rsp + _GPR_SAVE + 8*1], rsi
+	mov	[rsp + _GPR_SAVE + 8*2], rdi
+	movdqa	[rsp + _XMM_SAVE + 16*0], xmm6
+	movdqa	[rsp + _XMM_SAVE + 16*1], xmm7
+	movdqa	[rsp + _XMM_SAVE + 16*2], xmm8
+	movdqa	[rsp + _XMM_SAVE + 16*3], xmm9
+	movdqa	[rsp + _XMM_SAVE + 16*4], xmm10
+	movdqa	[rsp + _XMM_SAVE + 16*5], xmm11
+	movdqa	[rsp + _XMM_SAVE + 16*6], xmm12
+	movdqa	[rsp + _XMM_SAVE + 16*7], xmm13
+	movdqa	[rsp + _XMM_SAVE + 16*8], xmm14
+	movdqa	[rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; if bit (32+3) is set, then all lanes are empty
+	; (unused_lanes holds one nibble per free lane; bit 35 is the top bit of
+	; the ninth nibble, which is only populated when all 8 lanes are free)
+	mov	unused_lanes, [state + _unused_lanes]
+	bt	unused_lanes, 32+3
+	jc	return_null
+
+	; find a lane with a non-null job
+	; idx starts at 0 and is overwritten by each occupied lane 1..7 in turn,
+	; so the highest occupied lane wins; the constants live in .data because
+	; cmov takes no immediate operand
+	xor	idx, idx
+	cmp	qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [one]
+	cmp	qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [two]
+	cmp	qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [three]
+	cmp	qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [four]
+	cmp	qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [five]
+	cmp	qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [six]
+	cmp	qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [seven]
+
+	; copy idx to empty lanes
+	; every lane without a job gets lane idx's data pointer and a length of
+	; 0xFFFFFFFF so that it cannot be chosen as the minimum below
+copy_lane_data:
+	mov	tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+	cmp	qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+	jne	APPEND(skip_,I)
+	mov	[state + _args + _data_ptr + 8*I], tmp
+	mov	dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length
+	; aligned loads: _lens must sit on a 16-byte boundary
+	movdqa	xmm0, [state + _lens + 0*16]
+	movdqa	xmm1, [state + _lens + 1*16]
+
+	movdqa	xmm2, xmm0
+	pminud	xmm2, xmm1	; xmm2 has {D,C,B,A}
+	palignr	xmm3, xmm2, 8	; xmm3 has {x,x,D,C}
+	pminud	xmm2, xmm3	; xmm2 has {x,x,E,F}
+	palignr	xmm3, xmm2, 4	; xmm3 has {x,x,x,E}
+	pminud	xmm2, xmm3	; xmm2 has min value in low dword
+
+	; a lens entry packs the lane index in its low 4 bits and the length above them
+	movd	DWORD(idx), xmm2
+	mov	len2, idx
+	and	idx, 0xF
+	shr	len2, 4
+	jz	len_is_0
+
+	; broadcast the minimum length (index nibble cleared) to every dword and
+	; subtract it from all 8 lens entries before running the kernel
+	pand	xmm2, [rel clear_low_nibble]
+	pshufd	xmm2, xmm2, 0
+
+	psubd	xmm0, xmm2
+	psubd	xmm1, xmm2
+
+	movdqa	[state + _lens + 0*16], xmm0
+	movdqa	[state + _lens + 1*16], xmm1
+
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_mb_x4x2_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+	; push idx back onto the free-lane list as its new low nibble
+	mov	unused_lanes, [state + _unused_lanes]
+	shl	unused_lanes, 4
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes], unused_lanes
+
+	; mark the lane empty again
+	mov	dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+	; gather the four digest dwords of lane idx; word k of a lane sits at
+	; _args_digest + 4*idx + k*32
+	movd	xmm0, [state + _args_digest + 4*idx + 0*32]
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 1*32], 1
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 2*32], 2
+	pinsrd	xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+	; aligned store: needs result_digest on a 16-byte boundary
+	movdqa	[job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa	xmm6, [rsp + _XMM_SAVE + 16*0]
+	movdqa	xmm7, [rsp + _XMM_SAVE + 16*1]
+	movdqa	xmm8, [rsp + _XMM_SAVE + 16*2]
+	movdqa	xmm9, [rsp + _XMM_SAVE + 16*3]
+	movdqa	xmm10, [rsp + _XMM_SAVE + 16*4]
+	movdqa	xmm11, [rsp + _XMM_SAVE + 16*5]
+	movdqa	xmm12, [rsp + _XMM_SAVE + 16*6]
+	movdqa	xmm13, [rsp + _XMM_SAVE + 16*7]
+	movdqa	xmm14, [rsp + _XMM_SAVE + 16*8]
+	movdqa	xmm15, [rsp + _XMM_SAVE + 16*9]
+	mov	rsi, [rsp + _GPR_SAVE + 8*1]
+	mov	rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+	mov	rbx, [rsp + _GPR_SAVE + 8*0]
+	mov	rbp, [rsp + _GPR_SAVE + 8*3]
+	mov	r12, [rsp + _GPR_SAVE + 8*4]
+	mov	r13, [rsp + _GPR_SAVE + 8*5]
+	mov	r14, [rsp + _GPR_SAVE + 8*6]
+	mov	r15, [rsp + _GPR_SAVE + 8*7]
+	add	rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	; no job to hand back: return NULL through the common epilogue
+	xor	job_rax, job_rax
+	jmp	return
+
+
+section .data align=16
+
+align 16
+; mask for the low dword: keeps the length bits, clears the 4 index bits
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+; lane numbers as memory operands for the cmovne chain above
+one:	dq 1
+two:	dq 2
+three:	dq 3
+four:	dq 4
+five:	dq 5
+six:	dq 6
+seven:	dq 7
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c new file mode 100644 index 00000000..b2e98336 --- /dev/null +++
b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "md5_mb.h" + +void md5_mb_mgr_init_avx2(MD5_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes[0] = 0xfedcba9876543210; + state->num_lanes_inuse = 0; + for (j = 0; j < 16; j++) { + state->lens[j] = 0xFFFFFFFF; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c new file mode 100644 index 00000000..e83b2e38 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c @@ -0,0 +1,44 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "md5_mb.h" + +void md5_mb_mgr_init_avx512(MD5_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes[0] = 0x0706050403020100; + state->unused_lanes[1] = 0x0f0e0d0c0b0a0908; + state->unused_lanes[2] = 0x1716151413121110; + state->unused_lanes[3] = 0x1f1e1d1c1b1a1918; + state->num_lanes_inuse = 0; + for (j = 0; j < 32; j++) { + state->lens[j] = 0xFFFFFFFF; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c new file mode 100644 index 00000000..049d2147 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c @@ -0,0 +1,40 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "md5_mb.h" + +void md5_mb_mgr_init_sse(MD5_MB_JOB_MGR * state) +{ + unsigned int j; + state->unused_lanes[0] = 0xF76543210; + for (j = 0; j < 8; j++) { + state->lens[j] = 0xFFFFFFFF; + state->ldata[j].job_in_lane = 0; + } +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm new file mode 100644 index 00000000..5663942b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm @@ -0,0 +1,222 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+default rel
+
+extern md5_mb_x4x2_avx
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%endif
+
+; Common definitions
+; job and len2 share arg2: job is only needed while the new job is being
+; installed, len2 only when the kernel is about to be called
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_avx
+%define idx r8
+
+%define p r9
+
+%define unused_lanes rbx
+
+; job_rax and len share rax in the same way
+%define job_rax rax
+%define len rax
+
+%define lane r10
+
+%define lane_data r11
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+; (8*8 + 16*10 + 8 = 232 bytes; the xmm save slots at rsp + 8*8 are written
+; with vmovdqa, so they have to end up 16-byte aligned after the sub)
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state (win64; rdi otherwise, see arg1 above)
+; arg 2 : rdx : job   (win64; rsi otherwise, see arg2 above)
+;
+; Installs job in the next free lane. While free lanes remain it returns
+; NULL (rax = 0). Once the free-lane list is down to its 0xF terminator it
+; finds the lane with the smallest packed length, runs md5_mb_x4x2_avx for
+; that length (skipped when it is 0), retires that lane's job, copies its
+; digest into the job's result_digest and returns that job.
+global md5_mb_mgr_submit_avx:function
+md5_mb_mgr_submit_avx:
+
+	sub	rsp, STACK_SPACE
+	; we need to save/restore all GPRs because lower layer clobbers them
+	mov	[rsp + 8*0], rbx
+	mov	[rsp + 8*1], rbp
+	mov	[rsp + 8*2], r12
+	mov	[rsp + 8*3], r13
+	mov	[rsp + 8*4], r14
+	mov	[rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	; win64 only: rsi, rdi and xmm6-xmm15 are also saved here and restored at "return"
+	mov	[rsp + 8*6], rsi
+	mov	[rsp + 8*7], rdi
+	vmovdqa	[rsp + 8*8 + 16*0], xmm6
+	vmovdqa	[rsp + 8*8 + 16*1], xmm7
+	vmovdqa	[rsp + 8*8 + 16*2], xmm8
+	vmovdqa	[rsp + 8*8 + 16*3], xmm9
+	vmovdqa	[rsp + 8*8 + 16*4], xmm10
+	vmovdqa	[rsp + 8*8 + 16*5], xmm11
+	vmovdqa	[rsp + 8*8 + 16*6], xmm12
+	vmovdqa	[rsp + 8*8 + 16*7], xmm13
+	vmovdqa	[rsp + 8*8 + 16*8], xmm14
+	vmovdqa	[rsp + 8*8 + 16*9], xmm15
+%endif
+
+	; pop the next free lane: it is the low nibble of unused_lanes
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	lane, unused_lanes
+	and	lane, 0xF
+	shr	unused_lanes, 4
+	imul	lane_data, lane, _LANE_DATA_size
+	mov	dword [job + _status], STS_BEING_PROCESSED
+	lea	lane_data, [state + _ldata + lane_data]
+	mov	[state + _unused_lanes], unused_lanes
+	mov	DWORD(len), [job + _len]
+
+	; pack the lens entry: (job length << 4) | lane index
+	shl	len, 4
+	or	len, lane
+
+	mov	[lane_data + _job_in_lane], job
+	mov	[state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest
+	; and scatter them into the lane's slots; word k of a lane sits at
+	; _args_digest + 4*lane + k*32
+	vmovdqu	xmm0, [job + _result_digest + 0*16]
+	vmovd	[state + _args_digest + 4*lane + 0*32], xmm0
+	vpextrd	[state + _args_digest + 4*lane + 1*32], xmm0, 1
+	vpextrd	[state + _args_digest + 4*lane + 2*32], xmm0, 2
+	vpextrd	[state + _args_digest + 4*lane + 3*32], xmm0, 3
+
+	mov	p, [job + _buffer]
+	mov	[state + _args_data_ptr + 8*lane], p
+
+	; only start hashing once the free-lane list is down to its 0xF terminator
+	cmp	unused_lanes, 0xF
+	jne	return_null
+
+start_loop:
+	; Find min length
+	; aligned loads: _lens must sit on a 16-byte boundary
+	vmovdqa	xmm0, [state + _lens + 0*16]
+	vmovdqa	xmm1, [state + _lens + 1*16]
+
+	vpminud	xmm2, xmm0, xmm1	; xmm2 has {D,C,B,A}
+	vpalignr	xmm3, xmm3, xmm2, 8	; xmm3 has {x,x,D,C}
+	vpminud	xmm2, xmm2, xmm3	; xmm2 has {x,x,E,F}
+	vpalignr	xmm3, xmm3, xmm2, 4	; xmm3 has {x,x,x,E}
+	vpminud	xmm2, xmm2, xmm3	; xmm2 has min value in low dword
+
+	; unpack the winner: low 4 bits = lane index, remaining bits = length
+	vmovd	DWORD(idx), xmm2
+	mov	len2, idx
+	and	idx, 0xF
+	shr	len2, 4
+	jz	len_is_0
+
+	; broadcast the minimum length (index nibble cleared) to every dword and
+	; subtract it from all 8 lens entries before running the kernel
+	vpand	xmm2, xmm2, [rel clear_low_nibble]
+	vpshufd	xmm2, xmm2, 0
+
+	vpsubd	xmm0, xmm0, xmm2
+	vpsubd	xmm1, xmm1, xmm2
+
+	vmovdqa	[state + _lens + 0*16], xmm0
+	vmovdqa	[state + _lens + 1*16], xmm1
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_mb_x4x2_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+	; push idx back onto the free-lane list as its new low nibble
+	shl	unused_lanes, 4
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes], unused_lanes
+
+	; mark the lane empty again
+	mov	dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+	; gather the four digest dwords of lane idx
+	vmovd	xmm0, [state + _args_digest + 4*idx + 0*32]
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*32], 1
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*32], 2
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+	; aligned store: needs result_digest on a 16-byte boundary
+	vmovdqa	[job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa	xmm6, [rsp + 8*8 + 16*0]
+	vmovdqa	xmm7, [rsp + 8*8 + 16*1]
+	vmovdqa	xmm8, [rsp + 8*8 + 16*2]
+	vmovdqa	xmm9, [rsp + 8*8 + 16*3]
+	vmovdqa	xmm10, [rsp + 8*8 + 16*4]
+	vmovdqa	xmm11, [rsp + 8*8 + 16*5]
+	vmovdqa	xmm12, [rsp + 8*8 + 16*6]
+	vmovdqa	xmm13, [rsp + 8*8 + 16*7]
+	vmovdqa	xmm14, [rsp + 8*8 + 16*8]
+	vmovdqa	xmm15, [rsp + 8*8 + 16*9]
+	mov	rsi, [rsp + 8*6]
+	mov	rdi, [rsp + 8*7]
+%endif
+	mov	rbx, [rsp + 8*0]
+	mov	rbp, [rsp + 8*1]
+	mov	r12, [rsp + 8*2]
+	mov	r13, [rsp + 8*3]
+	mov	r14, [rsp + 8*4]
+	mov	r15, [rsp + 8*5]
+
+	add	rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	; job queued but nothing finished yet: return NULL through the common epilogue
+	xor	job_rax, job_rax
+	jmp	return
+
+
+section .data align=16
+
+align 16
+; mask for the low dword: keeps the length bits, clears the 4 index bits
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm new file mode 100644 index 00000000..9279b855 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm @@ -0,0 +1,235 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission.
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x8x2_avx2
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define lane rsi
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%define lane rdx
+
+%endif
+
+; Common definitions
+; job and len2 share arg2: job is only needed while the new job is being
+; installed, len2 only when the kernel is about to be called
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in a register not clobbered by md5_mb_x8x2_avx2
+%define idx rbp
+
+%define p r11
+
+%define unused_lanes rbx
+
+; job_rax and len share rax in the same way
+%define job_rax rax
+%define len rax
+
+%define num_lanes_inuse r9
+
+%define lane_data r10
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+; (8*8 + 16*10 + 8 = 232 bytes; the xmm save slots at rsp + 8*8 are written
+; with vmovdqa, so they have to end up 16-byte aligned after the sub)
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state (win64; rdi otherwise, see arg1 above)
+; arg 2 : rdx : job   (win64; rsi otherwise, see arg2 above)
+;
+; Installs job in the next free lane and increments num_lanes_inuse. While
+; fewer than 16 lanes are in use it returns NULL (rax = 0). With all 16 in
+; use it finds the lane with the smallest packed length, runs
+; md5_mb_x8x2_avx2 for that length (skipped when it is 0), retires that
+; lane's job, copies its digest into the job's result_digest and returns
+; that job.
+global md5_mb_mgr_submit_avx2:function
+md5_mb_mgr_submit_avx2:
+
+	sub	rsp, STACK_SPACE
+	; we need to save/restore all GPRs because lower layer clobbers them
+	mov	[rsp + 8*0], rbx
+	mov	[rsp + 8*1], rbp
+	mov	[rsp + 8*2], r12
+	mov	[rsp + 8*3], r13
+	mov	[rsp + 8*4], r14
+	mov	[rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	; win64 only: rsi, rdi and xmm6-xmm15 are also saved here and restored at "return"
+	mov	[rsp + 8*6], rsi
+	mov	[rsp + 8*7], rdi
+	vmovdqa	[rsp + 8*8 + 16*0], xmm6
+	vmovdqa	[rsp + 8*8 + 16*1], xmm7
+	vmovdqa	[rsp + 8*8 + 16*2], xmm8
+	vmovdqa	[rsp + 8*8 + 16*3], xmm9
+	vmovdqa	[rsp + 8*8 + 16*4], xmm10
+	vmovdqa	[rsp + 8*8 + 16*5], xmm11
+	vmovdqa	[rsp + 8*8 + 16*6], xmm12
+	vmovdqa	[rsp + 8*8 + 16*7], xmm13
+	vmovdqa	[rsp + 8*8 + 16*8], xmm14
+	vmovdqa	[rsp + 8*8 + 16*9], xmm15
+%endif
+
+	; pop the next free lane: it is the low nibble of unused_lanes
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	lane, unused_lanes
+	and	lane, 0xF
+	shr	unused_lanes, 4
+	imul	lane_data, lane, _LANE_DATA_size
+	mov	dword [job + _status], STS_BEING_PROCESSED
+	lea	lane_data, [state + _ldata + lane_data]
+	mov	[state + _unused_lanes], unused_lanes
+	mov	DWORD(len), [job + _len]
+
+	; pack the lens entry: (job length << 4) | lane index
+	shl	len, 4
+	or	len, lane
+
+	mov	[lane_data + _job_in_lane], job
+	mov	[state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest
+	; and scatter them into the lane's slots; word k of a lane sits at
+	; _args_digest + 4*lane + k*64
+	vmovdqu	xmm0, [job + _result_digest + 0*16]
+	vmovd	[state + _args_digest + 4*lane + 0*64], xmm0
+	vpextrd	[state + _args_digest + 4*lane + 1*64], xmm0, 1
+	vpextrd	[state + _args_digest + 4*lane + 2*64], xmm0, 2
+	vpextrd	[state + _args_digest + 4*lane + 3*64], xmm0, 3
+
+	mov	p, [job + _buffer]
+	mov	[state + _args_data_ptr + 8*lane], p
+
+	; only start hashing once all 16 lanes hold a job
+	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	add	num_lanes_inuse, 1
+	mov	[state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+	cmp	num_lanes_inuse, 16
+	jne	return_null
+
+start_loop:
+	; Find min length
+	vmovdqu	ymm0, [state + _lens + 0*32]
+	vmovdqu	ymm1, [state + _lens + 1*32]
+
+	vpminud	ymm2, ymm0, ymm1	; ymm2 has {D,C,B,A}
+	vpalignr	ymm3, ymm3, ymm2, 8	; ymm3 has {x,x,D,C}
+	vpminud	ymm2, ymm2, ymm3	; ymm2 has {x,x,E,F}
+	vpalignr	ymm3, ymm3, ymm2, 4	; ymm3 has {x,x,x,E}
+	vpminud	ymm2, ymm2, ymm3	; ymm2 has min value in low dword
+	vperm2i128	ymm3, ymm2, ymm2, 1	; ymm3 has halves of ymm2 reversed
+	vpminud	ymm2, ymm2, ymm3	; ymm2 has min value in low dword
+
+	; unpack the winner: low 4 bits = lane index, remaining bits = length
+	vmovd	DWORD(idx), xmm2
+	mov	len2, idx
+	and	idx, 0xF
+	shr	len2, 4
+	jz	len_is_0
+
+	; broadcast the minimum length (index nibble cleared) to every dword and
+	; subtract it from all 16 lens entries before running the kernel
+	vpand	ymm2, ymm2, [rel clear_low_nibble]
+	vpshufd	ymm2, ymm2, 0
+
+	vpsubd	ymm0, ymm0, ymm2
+	vpsubd	ymm1, ymm1, ymm2
+
+	vmovdqu	[state + _lens + 0*32], ymm0
+	vmovdqu	[state + _lens + 1*32], ymm1
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_mb_x8x2_avx2
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	unused_lanes, [state + _unused_lanes]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+	; push idx back onto the free-lane list as its new low nibble
+	shl	unused_lanes, 4
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes], unused_lanes
+
+	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	sub	num_lanes_inuse, 1
+	mov	[state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	; mark the lane empty again
+	mov	dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+	; gather the four digest dwords of lane idx
+	vmovd	xmm0, [state + _args_digest + 4*idx + 0*64]
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*64], 1
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*64], 2
+	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*64], 3
+
+	; aligned store: needs result_digest on a 16-byte boundary
+	vmovdqa	[job_rax + _result_digest + 0*16], xmm0
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa	xmm6, [rsp + 8*8 + 16*0]
+	vmovdqa	xmm7, [rsp + 8*8 + 16*1]
+	vmovdqa	xmm8, [rsp + 8*8 + 16*2]
+	vmovdqa	xmm9, [rsp + 8*8 + 16*3]
+	vmovdqa	xmm10, [rsp + 8*8 + 16*4]
+	vmovdqa	xmm11, [rsp + 8*8 + 16*5]
+	vmovdqa	xmm12, [rsp + 8*8 + 16*6]
+	vmovdqa	xmm13, [rsp + 8*8 + 16*7]
+	vmovdqa	xmm14, [rsp + 8*8 + 16*8]
+	vmovdqa	xmm15, [rsp + 8*8 + 16*9]
+	mov	rsi, [rsp + 8*6]
+	mov	rdi, [rsp + 8*7]
+%endif
+	mov	rbx, [rsp + 8*0]
+	mov	rbp, [rsp + 8*1]
+	mov	r12, [rsp + 8*2]
+	mov	r13, [rsp + 8*3]
+	mov	r14, [rsp + 8*4]
+	mov	r15, [rsp + 8*5]
+
+	add	rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	; job queued but nothing finished yet: return NULL through the common epilogue
+	xor	job_rax, job_rax
+	jmp	return
+
+
+section .data align=32
+
+align 32
+; mask for the low dword of each 128-bit half: keeps the length bits, clears the 4 index bits
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
+	dq 0x00000000FFFFFFF0, 0x0000000000000000
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm new file mode 100644 index 00000000..40102ccc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm @@ -0,0 +1,280 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern md5_mb_x16x2_avx512
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define lane rsi
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%define lane rdx
+
+%endif
+
+; Common definitions (job and len2 share arg2: len2 is only written after job has been stored in the lane)
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in a register not clobbered by md5_mb_x16x2_avx512
+%define idx rbp
+
+%define p r11
+
+%define unused_lanes ymm7
+
+%define job_rax rax
+%define len rax
+
+%define num_lanes_inuse r9
+
+%define lane_data r10
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8 (8 GPR slots + 10 XMM slots + 8 bytes of padding)
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+;; Shift the 32 bytes at MEM addr down by one byte and zero byte 31; the load reads one extra byte at [addr+32]
+%macro MEM_VPSRLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+	vmovdqu %%TMP_YMM, [%%addr + 1]
+	vmovdqu [%%addr], %%TMP_YMM
+	mov [%%addr + 31], byte 0
+%endmacro
+
+;; Shift the 32 bytes at MEM addr up by one byte and zero byte 0; the load reads one extra byte at [addr-1]
+%macro MEM_VPSLLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+	vmovdqu %%TMP_YMM, [%%addr-1]
+	vmovdqu [%%addr], %%TMP_YMM
+	mov [%%addr], byte 0
+%endmacro
+
+align 64
+default rel
+section .text
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job) - returns in rax a completed job, or NULL while fewer than 32 lanes are in use
+; arg 1 : rcx (win64) / rdi (UN*X) : state
+; arg 2 : rdx (win64) / rsi (UN*X) : job
+global md5_mb_mgr_submit_avx512:function
+md5_mb_mgr_submit_avx512:
+
+	sub rsp, STACK_SPACE
+	; we need to save/restore all GPRs because lower layer clobbers them
+	mov [rsp + 8*0], rbx
+	mov [rsp + 8*1], rbp
+	mov [rsp + 8*2], r12
+	mov [rsp + 8*3], r13
+	mov [rsp + 8*4], r14
+	mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + 8*6], rsi
+	mov [rsp + 8*7], rdi
+	vmovdqa [rsp + 8*8 + 16*0], xmm6
+	vmovdqa [rsp + 8*8 + 16*1], xmm7
+	vmovdqa [rsp + 8*8 + 16*2], xmm8
+	vmovdqa [rsp + 8*8 + 16*3], xmm9
+	vmovdqa [rsp + 8*8 + 16*4], xmm10
+	vmovdqa [rsp + 8*8 + 16*5], xmm11
+	vmovdqa [rsp + 8*8 + 16*6], xmm12
+	vmovdqa [rsp + 8*8 + 16*7], xmm13
+	vmovdqa [rsp + 8*8 + 16*8], xmm14
+	vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+	mov lane, [state + _unused_lanes] ; low 8 bytes of the free-lane list (one byte per entry)
+	and lane, 0x3F ; lane = entry at the head of the list; the macro below pops it
+	MEM_VPSRLDDQ (state + _unused_lanes), unused_lanes
+	imul lane_data, lane, _LANE_DATA_size
+	mov dword [job + _status], STS_BEING_PROCESSED
+	lea lane_data, [state + _ldata + lane_data]
+	mov DWORD(len), [job + _len]
+
+	shl len, 6 ; low 6 bits hold the lane index (matches the 0x3F mask / shr 6 below)
+	or len, lane
+
+	mov [lane_data + _job_in_lane], job
+	mov [state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest and scatter them into this lane's column of args_digest
+	vmovdqu xmm0, [job + _result_digest + 0*16]
+	vmovd [state + _args_digest + 4*lane + 0*4*16*2], xmm0
+	vpextrd [state + _args_digest + 4*lane + 1*4*16*2], xmm0, 1
+	vpextrd [state + _args_digest + 4*lane + 2*4*16*2], xmm0, 2
+	vpextrd [state + _args_digest + 4*lane + 3*4*16*2], xmm0, 3
+
+	mov p, [job + _buffer]
+	mov [state + _args_data_ptr + 8*lane], p
+
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	add num_lanes_inuse, 1
+	mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+	cmp num_lanes_inuse, 32
+	jne return_null ; hash only once all 32 lanes hold a job
+
+start_loop:
+	; Find min length of lanes 0-15
+	vmovdqu ymm0, [state + _lens + 0*32]
+	vmovdqu ymm1, [state + _lens + 1*32]
+
+	vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+	vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+	vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+	vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+	vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+	; Find min length of lanes 16-31
+	vmovdqu ymm5, [state + _lens + 2*32]
+	vmovdqu ymm6, [state + _lens + 3*32]
+
+	vpminud ymm4, ymm5, ymm6 ; ymm4 has {D,C,B,A}
+	vpalignr ymm3, ymm3, ymm4, 8 ; ymm3 has {x,x,D,C}
+	vpminud ymm4, ymm4, ymm3 ; ymm4 has {x,x,E,F}
+	vpalignr ymm3, ymm3, ymm4, 4 ; ymm3 has {x,x,x,E}
+	vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+	vperm2i128 ymm3, ymm4, ymm4, 1 ; ymm3 has halves of ymm4 reversed
+	vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+
+	vpminud ymm2, ymm2, ymm4 ; ymm2 has min value over all 32 lanes in low dword
+	vmovd DWORD(idx), xmm2
+	mov len2, idx
+	and idx, 0x3F ; idx = lane that owns the minimum
+	shr len2, 6 ; len2 = that lane's remaining length
+	jz len_is_0 ; nothing left to hash for lane idx
+
+	vpand ymm2, ymm2, [rel clear_low_6bits] ; drop the lane-index bits, keep the length
+	vpshufd ymm2, ymm2, 0 ; broadcast the min length to every dword
+
+	vpsubd ymm0, ymm0, ymm2
+	vpsubd ymm1, ymm1, ymm2
+	vpsubd ymm5, ymm5, ymm2
+	vpsubd ymm6, ymm6, ymm2
+
+	vmovdqu [state + _lens + 0*32], ymm0
+	vmovdqu [state + _lens + 1*32], ymm1
+	vmovdqu [state + _lens + 2*32], ymm5
+	vmovdqu [state + _lens + 3*32], ymm6
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call md5_mb_x16x2_avx512
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]
+	mov lane, [state + _unused_lanes]
+	mov qword [lane_data + _job_in_lane], 0
+	mov dword [job_rax + _status], STS_COMPLETED
+
+	shl lane, 8
+	or lane, idx ; idx becomes the new head byte of the free-lane list
+	MEM_VPSLLDDQ (state + _unused_lanes), unused_lanes
+	mov [state + _unused_lanes], lane
+
+	mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	sub num_lanes_inuse, 1
+	mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+	mov dword [state + _lens + 4*idx], 0xFFFFFFFF ; largest dword, so the vpminud search passes over this now-empty lane
+
+	vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16*2]
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2
+	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3
+
+	vmovdqa [job_rax + _result_digest + 0*16], xmm0 ; aligned store: assumes result_digest is 16-byte aligned in the job - TODO confirm
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa xmm6, [rsp + 8*8 + 16*0]
+	vmovdqa xmm7, [rsp + 8*8 + 16*1]
+	vmovdqa xmm8, [rsp + 8*8 + 16*2]
+	vmovdqa xmm9, [rsp + 8*8 + 16*3]
+	vmovdqa xmm10, [rsp + 8*8 + 16*4]
+	vmovdqa xmm11, [rsp + 8*8 + 16*5]
+	vmovdqa xmm12, [rsp + 8*8 + 16*6]
+	vmovdqa xmm13, [rsp + 8*8 + 16*7]
+	vmovdqa xmm14, [rsp + 8*8 + 16*8]
+	vmovdqa xmm15, [rsp + 8*8 + 16*9]
+	mov rsi, [rsp + 8*6]
+	mov rdi, [rsp + 8*7]
+%endif
+	mov rbx, [rsp + 8*0]
+	mov rbp, [rsp + 8*1]
+	mov r12, [rsp + 8*2]
+	mov r13, [rsp + 8*3]
+	mov r14, [rsp + 8*4]
+	mov r15, [rsp + 8*5]
+
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax ; no completed job to hand back: return NULL
+	jmp return
+
+
+section .data align=32
+
+align 32
+clear_low_6bits:
+	dq 0x00000000FFFFFFC0, 0x0000000000000000 ; keeps bits 31:6 of the low dword of each 128-bit half
+	dq 0x00000000FFFFFFC0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_mgr_submit_avx512 ; placeholder symbol emitted when HAVE_AS_KNOWS_AVX512 is not defined (win64 only)
+no_md5_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm
new file mode 100644
index 00000000..e1511858
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm
@@ -0,0 +1,223 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_sse
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%endif
+
+; Common definitions (job and len2 share arg2: len2 is only written after job has been stored in the lane)
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_sse
+%define idx r8
+
+%define p r9
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane r10
+
+%define lane_data r11
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8 (8 GPR slots + 10 XMM slots + 8 bytes of padding)
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job) - returns in rax a completed job, or NULL while free lanes remain
+; arg 1 : rcx (win64) / rdi (UN*X) : state
+; arg 2 : rdx (win64) / rsi (UN*X) : job
+global md5_mb_mgr_submit_sse:function
+md5_mb_mgr_submit_sse:
+
+	sub rsp, STACK_SPACE
+	; we need to save/restore all GPRs because lower layer clobbers them
+	mov [rsp + 8*0], rbx
+	mov [rsp + 8*1], rbp
+	mov [rsp + 8*2], r12
+	mov [rsp + 8*3], r13
+	mov [rsp + 8*4], r14
+	mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov [rsp + 8*6], rsi
+	mov [rsp + 8*7], rdi
+	movdqa [rsp + 8*8 + 16*0], xmm6
+	movdqa [rsp + 8*8 + 16*1], xmm7
+	movdqa [rsp + 8*8 + 16*2], xmm8
+	movdqa [rsp + 8*8 + 16*3], xmm9
+	movdqa [rsp + 8*8 + 16*4], xmm10
+	movdqa [rsp + 8*8 + 16*5], xmm11
+	movdqa [rsp + 8*8 + 16*6], xmm12
+	movdqa [rsp + 8*8 + 16*7], xmm13
+	movdqa [rsp + 8*8 + 16*8], xmm14
+	movdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+	mov unused_lanes, [state + _unused_lanes] ; free-lane list packed as 4-bit entries
+	mov lane, unused_lanes
+	and lane, 0xF ; lane = entry at the head of the list
+	shr unused_lanes, 4 ; pop that entry
+	imul lane_data, lane, _LANE_DATA_size
+	mov dword [job + _status], STS_BEING_PROCESSED
+	lea lane_data, [state + _ldata + lane_data]
+	mov [state + _unused_lanes], unused_lanes
+	mov DWORD(len), [job + _len]
+
+	shl len, 4 ; low 4 bits hold the lane index (matches the 0xF mask / shr 4 below)
+	or len, lane
+
+	mov [lane_data + _job_in_lane], job
+	mov [state + _lens + 4*lane], DWORD(len)
+
+	; Load digest words from result_digest and scatter them into this lane's column of args_digest
+	movdqu xmm0, [job + _result_digest + 0*16]
+	movd [state + _args_digest + 4*lane + 0*32], xmm0
+	pextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1
+	pextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2
+	pextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3
+
+	mov p, [job + _buffer]
+	mov [state + _args_data_ptr + 8*lane], p
+
+	cmp unused_lanes, 0xF ; presumably 0xF is the end-of-list marker, i.e. no free lane is left - confirm against the mgr init code
+	jne return_null
+
+start_loop:
+	; Find min length of the 8 lanes (aligned loads: _lens must be 16-byte aligned)
+	movdqa xmm0, [state + _lens + 0*16]
+	movdqa xmm1, [state + _lens + 1*16]
+
+	movdqa xmm2, xmm0
+	pminud xmm2, xmm1 ; xmm2 has {D,C,B,A}
+	palignr xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+	pminud xmm2, xmm3 ; xmm2 has {x,x,E,F}
+	palignr xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+	pminud xmm2, xmm3 ; xmm2 has min value in low dword
+
+	movd DWORD(idx), xmm2
+	mov len2, idx
+	and idx, 0xF ; idx = lane that owns the minimum
+	shr len2, 4 ; len2 = that lane's remaining length
+	jz len_is_0 ; nothing left to hash for lane idx
+
+	pand xmm2, [rel clear_low_nibble] ; drop the lane-index bits, keep the length
+	pshufd xmm2, xmm2, 0 ; broadcast the min length to every dword
+
+	psubd xmm0, xmm2
+	psubd xmm1, xmm2
+
+	movdqa [state + _lens + 0*16], xmm0
+	movdqa [state + _lens + 1*16], xmm1
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call md5_mb_x4x2_sse
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _LANE_DATA_size
+	lea lane_data, [state + _ldata + lane_data]
+
+	mov job_rax, [lane_data + _job_in_lane]
+	mov unused_lanes, [state + _unused_lanes]
+	mov qword [lane_data + _job_in_lane], 0
+	mov dword [job_rax + _status], STS_COMPLETED
+	shl unused_lanes, 4
+	or unused_lanes, idx ; idx becomes the new head entry of the free-lane list
+	mov [state + _unused_lanes], unused_lanes
+
+	mov dword [state + _lens + 4*idx], 0xFFFFFFFF ; largest dword, so the pminud search passes over this now-empty lane
+
+	movd xmm0, [state + _args_digest + 4*idx + 0*32]
+	pinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+	pinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+	pinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+	movdqa [job_rax + _result_digest + 0*16], xmm0 ; aligned store: assumes result_digest is 16-byte aligned in the job - TODO confirm
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa xmm6, [rsp + 8*8 + 16*0]
+	movdqa xmm7, [rsp + 8*8 + 16*1]
+	movdqa xmm8, [rsp + 8*8 + 16*2]
+	movdqa xmm9, [rsp + 8*8 + 16*3]
+	movdqa xmm10, [rsp + 8*8 + 16*4]
+	movdqa xmm11, [rsp + 8*8 + 16*5]
+	movdqa xmm12, [rsp + 8*8 + 16*6]
+	movdqa xmm13, [rsp + 8*8 + 16*7]
+	movdqa xmm14, [rsp + 8*8 + 16*8]
+	movdqa xmm15, [rsp + 8*8 + 16*9]
+	mov rsi, [rsp + 8*6]
+	mov rdi, [rsp + 8*7]
+%endif
+	mov rbx, [rsp + 8*0]
+	mov rbp, [rsp + 8*1]
+	mov r12, [rsp + 8*2]
+	mov r13, [rsp + 8*3]
+	mov r14, [rsp + 8*4]
+	mov r15, [rsp + 8*5]
+
+	add rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor job_rax, job_rax ; no completed job to hand back: return NULL
+	jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+	dq 0x00000000FFFFFFF0, 0x0000000000000000 ; keeps bits 31:4 of the low dword
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c
new file mode 100644
index 00000000..5efeda71
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c
@@ -0,0 +1,151 @@
+/**********************************************************************
+  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <openssl/md5.h> +#include "md5_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 200 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * MD5_DIGEST_NWORDS]; + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + MD5_HASH_CTX_MGR *mgr = NULL; + MD5_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, fail = 0; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + + printf("multibinary_md5 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN); + + srand(TEST_SEED); + + posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR)); + md5_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + 
rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // SSL test + MD5(bufs[i], TEST_LEN, digest_ssl[i]); + + // sb_md5 test + md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (md5_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != ((uint32_t *) digest_ssl[i])[j]) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + ((uint32_t *) digest_ssl[i])[j]); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + md5_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Ramdom buffer with ramdom len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run SSL test + MD5(bufs[i], lens[i], digest_ssl[i]); + + // Run sb_md5 test + md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (md5_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != + ((uint32_t *) digest_ssl[i])[j]) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + ((uint32_t *) digest_ssl[i])[j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_md5_ssl rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c new file mode 100644 index 00000000..451bcbc1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c @@ -0,0 +1,196 @@ 
+/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include "md5_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static uint32_t digest_ref[TEST_BUFS][MD5_DIGEST_NWORDS]; + +// Compare against reference function +extern void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len); + +// Generates pseudo-random data +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + MD5_HASH_CTX_MGR *mgr = NULL; + MD5_HASH_CTX ctxpool[TEST_BUFS]; + uint32_t i, j, fail = 0; + unsigned char *bufs[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int jobs, t; + uint8_t *tmp_buf; + + printf("multibinary_md5 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN); + + posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR)); + md5_ctx_mgr_init(mgr); + + srand(TEST_SEED); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocate and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contexts + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + md5_ref(bufs[i], digest_ref[i], TEST_LEN); + + // Run sb_md5 test + md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + } + + while (md5_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d " + "fail 0x%08X <=> 0x%08X \n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + // Run 
tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + md5_ctx_mgr_init(mgr); + + for (i = 0; i < jobs; i++) { + // Use buffer with random len and contents + lens[i] = rand() % (TEST_LEN); + rand_buffer(bufs[i], lens[i]); + + // Run reference test + md5_ref(bufs[i], digest_ref[i], lens[i]); + + // Run md5_mb test + md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (md5_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail " + "0x%08X <=> 0x%08X\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + // Test at the end of buffer + jobs = rand() % TEST_BUFS; + tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs); + if (!tmp_buf) { + printf("malloc failed, end test aborted.\n"); + return 1; + } + + rand_buffer(tmp_buf, jobs); + + md5_ctx_mgr_init(mgr); + + // Extend to the end of allocated buffer to construct jobs + for (i = 0; i < jobs; i++) { + bufs[i] = (uint8_t *) & tmp_buf[i]; + lens[i] = jobs - i; + + // Reference test + md5_ref(bufs[i], digest_ref[i], lens[i]); + + // sb_md5 test + md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE); + } + + while (md5_ctx_mgr_flush(mgr)) ; + + for (i = 0; i < jobs; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("End test failed at offset %d - result: 0x%08X" + ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + + putchar('.'); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_md5 rand: Pass\n"); + + return fail; +} diff --git 
a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c new file mode 100644 index 00000000..4737a94b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c @@ -0,0 +1,291 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include "md5_mb.h" + +#define TEST_LEN (1024*1024) +#define TEST_BUFS 100 +#ifndef RANDOMS +# define RANDOMS 10 +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +#define UPDATE_SIZE 13*MD5_BLOCK_SIZE +#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*MD5_BLOCK_SIZE)) + +#ifdef DEBUG +# define debug_char(x) putchar(x) +#else +# define debug_char(x) do {} while (0) +#endif + +/* Reference digest global to reduce stack usage */ +static uint32_t digest_ref[TEST_BUFS][MD5_DIGEST_NWORDS]; + +extern void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len); + +// Generates pseudo-random data + +void rand_buffer(unsigned char *buf, const long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(void) +{ + MD5_HASH_CTX_MGR *mgr = NULL; + MD5_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL; + uint32_t i, j, fail = 0; + int len_done, len_rem, len_rand; + unsigned char *bufs[TEST_BUFS]; + unsigned char *buf_ptr[TEST_BUFS]; + uint32_t lens[TEST_BUFS]; + unsigned int joblen, jobs, t; + + printf("multibinary_md5_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, + TEST_LEN); + + srand(TEST_SEED); + + posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR)); + md5_ctx_mgr_init(mgr); + + for (i = 0; i < TEST_BUFS; i++) { + // Allocte and fill buffer + bufs[i] = (unsigned char *)malloc(TEST_LEN); + buf_ptr[i] = bufs[i]; + if (bufs[i] == NULL) { + printf("malloc failed test aborted\n"); + return 1; + } + rand_buffer(bufs[i], TEST_LEN); + + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + + // Run reference test + md5_ref(bufs[i], digest_ref[i], TEST_LEN); + } + + // Run sb_md5 tests + for (i = 0; i < TEST_BUFS;) { + len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_done == 0) + ctx = 
md5_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_FIRST); + else if (len_rem <= UPDATE_SIZE) + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST); + else + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + // Add jobs while available or finished + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + } + + // Start flushing finished jobs, end on last flushed + ctx = md5_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = md5_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] += UPDATE_SIZE; + + len_done = (int)((unsigned long)buf_ptr[i] + - (unsigned long)bufs[i]); + len_rem = TEST_LEN - len_done; + + if (len_rem <= UPDATE_SIZE) + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST); + else + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], UPDATE_SIZE, HASH_UPDATE); + + if (ctx == NULL) + ctx = md5_ctx_mgr_flush(mgr); + } + + // Check digests + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d fixed size, digest%d fail %8X <=> %8X", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + putchar('.'); + + // Run tests with random size and number of jobs + for (t = 0; t < RANDOMS; t++) { + jobs = rand() % (TEST_BUFS); + + for (i = 0; i < jobs; i++) { + joblen = rand() % (TEST_LEN); + rand_buffer(bufs[i], joblen); + lens[i] = joblen; + buf_ptr[i] = bufs[i]; + md5_ref(bufs[i], digest_ref[i], lens[i]); + } + + md5_ctx_mgr_init(mgr); + + // Run md5_sb jobs + i = 0; + while (i < jobs) { + // Submit a new job + len_rand = MD5_BLOCK_SIZE + + MD5_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS); + + if 
(lens[i] > len_rand) + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_FIRST); + else + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], lens[i], HASH_ENTIRE); + + // Returned ctx could be: + // - null context (we are just getting started and lanes aren't full yet), or + // - finished already (an ENTIRE we submitted or a previous LAST is returned), or + // - an unfinished ctx, we will resubmit + + if ((ctx == NULL) || hash_ctx_complete(ctx)) { + i++; + continue; + } else { + // unfinished ctx returned, choose another random update length and submit either + // UPDATE or LAST depending on the amount of buffer remaining + while ((ctx != NULL) && !(hash_ctx_complete(ctx))) { + j = (unsigned long)(ctx->user_data); // Get index of the returned ctx + buf_ptr[j] = bufs[j] + ctx->total_length; + len_rand = (rand() % MD5_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + len_rem = lens[j] - ctx->total_length; + + if (len_rem <= len_rand) // submit the rest of the job as LAST + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rem, HASH_LAST); + else // submit the random update length as UPDATE + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[j], + buf_ptr[j], + len_rand, + HASH_UPDATE); + } // Either continue submitting any contexts returned here as UPDATE/LAST, or + // go back to submitting new jobs using the index i. 
+ + i++; + } + } + + // Start flushing finished jobs, end on last flushed + ctx = md5_ctx_mgr_flush(mgr); + while (ctx) { + if (hash_ctx_complete(ctx)) { + debug_char('-'); + ctx = md5_ctx_mgr_flush(mgr); + continue; + } + // Resubmit unfinished job + i = (unsigned long)(ctx->user_data); + buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer + len_rem = lens[i] - ctx->total_length; + len_rand = (rand() % MD5_BLOCK_SIZE) + * (rand() % MAX_RAND_UPDATE_BLOCKS); + debug_char('+'); + if (len_rem <= len_rand) + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rem, HASH_LAST); + else + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + buf_ptr[i], len_rand, HASH_UPDATE); + + if (ctx == NULL) + ctx = md5_ctx_mgr_flush(mgr); + } + + // Check result digest + for (i = 0; i < jobs; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) { + fail++; + printf("Test%d, digest%d fail %8X <=> %8X\n", + i, j, ctxpool[i].job.result_digest[j], + digest_ref[i][j]); + } + } + } + if (fail) { + printf("Test failed function check %d\n", fail); + return fail; + } + + putchar('.'); + fflush(0); + } // random test t + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_md5_update rand: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c new file mode 100644 index 00000000..bd1ad8e0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c @@ -0,0 +1,223 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "md5_mb.h" + +typedef uint32_t DigestMD5[MD5_DIGEST_NWORDS]; + +#define MSGS 13 +#define NUM_JOBS 1000 + +#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS + +static uint8_t msg1[] = "Test vector from febooti.com"; +static uint8_t msg2[] = "12345678901234567890" "12345678901234567890" + "12345678901234567890" "12345678901234567890"; +static uint8_t msg3[] = ""; +static uint8_t msg4[] = "abcdefghijklmnopqrstuvwxyz"; +static uint8_t msg5[] = "message digest"; +static uint8_t msg6[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789"; +static uint8_t msg7[] = "abc"; +static uint8_t msg8[] = "a"; + +static uint8_t msg9[] = ""; +static uint8_t msgA[] = "abcdefghijklmnopqrstuvwxyz"; +static uint8_t msgB[] = "message digest"; +static uint8_t msgC[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789"; +static uint8_t msgD[] = "abc"; + +static DigestMD5 expResultDigest1 = { 0x61b60a50, 0xfbb76d3c, 0xf5620cd3, 0x0f3d57ff }; +static DigestMD5 expResultDigest2 = { 0xa2f4ed57, 0x55c9e32b, 0x2eda49ac, 0x7ab60721 }; +static DigestMD5 expResultDigest3 = { 0xd98c1dd4, 0x04b2008f, 0x980980e9, 0x7e42f8ec }; +static DigestMD5 expResultDigest4 = { 0xd7d3fcc3, 0x00e49261, 0x6c49fb7d, 0x3be167ca }; +static DigestMD5 expResultDigest5 = { 0x7d696bf9, 0x8d93b77c, 0x312f5a52, 0xd061f1aa }; +static DigestMD5 expResultDigest6 = { 0x98ab74d1, 0xf5d977d2, 0x2c1c61a5, 0x9f9d419f }; +static DigestMD5 expResultDigest7 = { 0x98500190, 0xb04fd23c, 0x7d3f96d6, 0x727fe128 }; +static DigestMD5 expResultDigest8 = { 0xb975c10c, 0xa8b6f1c0, 0xe299c331, 0x61267769 }; + +static DigestMD5 expResultDigest9 = { 0xd98c1dd4, 0x04b2008f, 0x980980e9, 0x7e42f8ec }; +static DigestMD5 expResultDigestA = { 0xd7d3fcc3, 0x00e49261, 0x6c49fb7d, 0x3be167ca }; +static DigestMD5 expResultDigestB = { 0x7d696bf9, 
0x8d93b77c, 0x312f5a52, 0xd061f1aa }; +static DigestMD5 expResultDigestC = { 0x98ab74d1, 0xf5d977d2, 0x2c1c61a5, 0x9f9d419f }; +static DigestMD5 expResultDigestD = { 0x98500190, 0xb04fd23c, 0x7d3f96d6, 0x727fe128 }; + +static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8, msg9, + msgA, msgB, msgC, msgD +}; + +static uint32_t *expResultDigest[MSGS] = { + expResultDigest1, expResultDigest2, expResultDigest3, + expResultDigest4, expResultDigest5, expResultDigest6, + expResultDigest7, expResultDigest8, expResultDigest9, + expResultDigestA, expResultDigestB, expResultDigestC, + expResultDigestD +}; + +int main(void) +{ + MD5_HASH_CTX_MGR *mgr = NULL; + MD5_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL; + uint32_t i, j, k, t, checked = 0; + uint32_t *good; + + posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR)); + md5_ctx_mgr_init(mgr); + + // Init contexts before first use + for (i = 0; i < MSGS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + for (i = 0; i < MSGS; i++) { + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], msgs[i], + strlen((char *)msgs[i]), HASH_ENTIRE); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." 
+ " Error code: %d", ctx->error); + return -1; + } + + } + } + + while (1) { + ctx = md5_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + good = expResultDigest[t]; + checked++; + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." + " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + // do larger test in pseudo-random order + + // Init contexts before first use + for (i = 0; i < NUM_JOBS; i++) { + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + checked = 0; + for (i = 0; i < NUM_JOBS; i++) { + j = PSEUDO_RANDOM_NUM(i); + ctx = md5_ctx_mgr_submit(mgr, + &ctxpool[i], + msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE); + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the" + " submit. Error code: %d", ctx->error); + return -1; + } + + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + } + } + while (1) { + ctx = md5_ctx_mgr_flush(mgr); + + if (ctx) { + t = (unsigned long)(ctx->user_data); + k = PSEUDO_RANDOM_NUM(t); + good = expResultDigest[k]; + checked++; + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (good[j] != ctxpool[t].job.result_digest[j]) { + printf("Test %d, digest %d is %08X, should be %08X\n", + t, j, ctxpool[t].job.result_digest[j], good[j]); + return -1; + } + } + + if (ctx->error) { + printf("Something bad happened during the submit." 
+ " Error code: %d", ctx->error); + return -1; + } + } else { + break; + } + } + + if (checked != NUM_JOBS) { + printf("only tested %d rather than %d\n", checked, NUM_JOBS); + return -1; + } + + printf(" multibinary_md5 test: Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c new file mode 100644 index 00000000..7e9acde2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c @@ -0,0 +1,123 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <openssl/md5.h> +#include "md5_mb.h" +#include "test.h" + +// Set number of outstanding jobs +#define TEST_BUFS 32 + +#ifdef CACHED_TEST +// Loop many times over same data +# define TEST_LEN 4*1024 +# define TEST_LOOPS 10000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (GT_L3_CACHE / TEST_BUFS) +# define TEST_LOOPS 100 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS + +/* Reference digest global to reduce stack usage */ +static uint8_t digest_ssl[TEST_BUFS][4 * MD5_DIGEST_NWORDS]; + +int main(void) +{ + MD5_HASH_CTX_MGR *mgr = NULL; + MD5_HASH_CTX ctxpool[TEST_BUFS]; + unsigned char *bufs[TEST_BUFS]; + uint32_t i, j, t, fail = 0; + struct perf start, stop; + + for (i = 0; i < TEST_BUFS; i++) { + bufs[i] = (unsigned char *)calloc((size_t) TEST_LEN, 1); + if (bufs[i] == NULL) { + printf("calloc failed test aborted\n"); + return 1; + } + // Init ctx contents + hash_ctx_init(&ctxpool[i]); + ctxpool[i].user_data = (void *)((uint64_t) i); + } + + posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR)); + md5_ctx_mgr_init(mgr); + + // Start OpenSSL tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + 
MD5(bufs[i], TEST_LEN, digest_ssl[i]); + } + perf_stop(&stop); + + printf("md5_openssl" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + // Start mb tests + perf_start(&start); + for (t = 0; t < TEST_LOOPS; t++) { + for (i = 0; i < TEST_BUFS; i++) + md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE); + + while (md5_ctx_mgr_flush(mgr)) ; + } + perf_stop(&stop); + + printf("multibinary_md5" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i * t); + + for (i = 0; i < TEST_BUFS; i++) { + for (j = 0; j < MD5_DIGEST_NWORDS; j++) { + if (ctxpool[i].job.result_digest[j] != ((uint32_t *) digest_ssl[i])[j]) { + fail++; + printf("Test%d, digest%d fail %08X <=> %08X\n", + i, j, ctxpool[i].job.result_digest[j], + ((uint32_t *) digest_ssl[i])[j]); + } + } + } + + printf("Multi-buffer md5 test complete %d buffers of %d B with " + "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS); + + if (fail) + printf("Test failed function check %d\n", fail); + else + printf(" multibinary_md5_ossl_perf: Pass\n"); + + return fail; +} diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm new file mode 100644 index 00000000..7ce64140 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm @@ -0,0 +1,850 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "md5_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

%ifdef HAVE_AS_KNOWS_AVX512
default rel

;; code to compute MD5 on two sets of 16 lanes (32 lanes) using AVX512

;; NOTE(review): the original header says "Stack must be aligned to 64 bytes
;; before call", but the entry code of md5_mb_x16x2_avx512 saves rsp and
;; realigns it to 64 bytes itself - confirm whether callers still rely on it.

;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
;; Windows preserves: rcx rbp
;;
;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
;; Linux preserves: rdi rbp
;;
;; vector registers named by the %defines below:
;; zmm0-zmm18, zmm20-zmm21, zmm24-zmm31

;; clobbers all GPRs other than arg1 and rbp

%ifidn __OUTPUT_FORMAT__, win64
 %define arg1 rcx ; arg0
 %define arg2 rdx ; arg1
 %define reg3 r8 ; arg2
 %define reg4 r9 ; arg3
 %define var1 rdi
 %define var2 rsi
 %define local_func_decl(func_name) global func_name
 %else
 %define arg1 rdi ; arg0
 %define arg2 rsi ; arg1
 %define var1 rdx ; arg2
 %define var2 rcx ; arg3
 %define local_func_decl(func_name) global func_name:function internal
%endif

%define state arg1
%define num_blks arg2

%define IN (state + _data_ptr)
%define DIGEST state
%define SIZE num_blks
;; These are pointers to data block1 and block2 in the stack
;; which will ping pong back and forth
%define DPTR1 rbx
%define DPTR2 var2
%define IDX var1
%define TBL rax

;; Scratch GPRs holding eight per-lane input pointers at a time
%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11
%define inp4 r12
%define inp5 r13
%define inp6 r14
%define inp7 r15

;; Transposed Digest Storage
;; A..D hold the first set of 16 lanes, A1..D1 the second set
%define A zmm0
%define B zmm1
%define C zmm2
%define D zmm3
%define A1 zmm4
%define B1 zmm5
%define C1 zmm6
%define D1 zmm7

;; Broadcast MD5 round constant
%define md5c zmm16

;; Permute masks consumed by TRANSPOSE16 (vpermi2q index operands)
%define MASK0 zmm17
%define MASK1 zmm18

%define TMP0 zmm20
%define TMP1 zmm21


;; Data are stored into the Wx after transposition
%define W0 zmm8
%define W1 zmm9
%define W2 zmm10
%define W3 zmm11
%define W4 zmm12
%define W5 zmm13
%define W6 zmm14
%define W7 zmm15

%define W8 zmm24
%define W9 zmm25
%define W10 zmm26
%define W11 zmm27
%define W12 zmm28
%define W13 zmm29
%define W14 zmm30
%define W15 zmm31

;; Token pasting helpers, e.g. APPEND3(rot, 1, 2) -> rot12
%define APPEND(a,b) a %+ b
%define APPEND3(a,b,c) a %+ b %+ c

;; Temporary registers used during data transposition

;; "RESZ n" reserves n ZMM-sized (64 byte) slots: it expands to "resb 64*n"
%define RESZ resb 64*
;; NOTE(review): original comment, kept for reference - it looks stale because
;; the function entry aligns rsp to 64 bytes and stores the old value in _RSP_SAVE:
;; Assume stack aligned to 64 bytes before call
;; Therefore FRAMESIZE mod 64 must be 64-8 = 56
struc STACK
_DATA: RESZ 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
_DIGEST: RESZ 8 ; stores Z_AA-Z_DD, Z_AA1-Z_DD1
_TMPDIGEST: RESZ 2 ; stores Z_AA, Z_BB temporarily
_RSP_SAVE: RESQ 1 ; original RSP
endstruc

%define Z_AA rsp + _DIGEST + 64*0
%define Z_BB rsp + _DIGEST + 64*1
%define Z_CC rsp + _DIGEST + 64*2
%define Z_DD rsp + _DIGEST + 64*3
%define Z_AA1 rsp + _DIGEST + 64*4
%define Z_BB1 rsp + _DIGEST + 64*5
%define Z_CC1 rsp + _DIGEST + 64*6
%define Z_DD1 rsp + _DIGEST + 64*7

;; Byte stride between digest rows in the args structure: 32 lanes * 4 bytes.
;; This file used to define the symbol twice ((16*4) here-above and (32*4));
;; only the later value was ever in force at the points of use, so the
;; misleading first definition has been dropped.
%define MD5_DIGEST_ROW_SIZE (32*4)


;;
;; MD5 left rotations (number of bits)
;;
%define rot11 7
%define rot12 12
%define rot13 17
%define rot14 22
%define rot21 5
%define rot22 9
%define rot23 14
%define rot24 20
%define rot31 4
%define rot32 11
%define rot33 16
%define rot34 23
%define rot41 6
%define rot42 10
%define rot43 15
%define rot44 21

;;
;; TRANSPOSE16 r0..r15, t0, t1
;;
;; In-place transpose of a 16x16 matrix of dwords held in sixteen ZMM
;; registers (%1..%16); %17 and %18 are scratch ZMM registers.
;; MASK0 and MASK1 must already hold the two vpermi2q index vectors.
;;
%macro TRANSPOSE16 18
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%r8 %9
%define %%r9 %10
%define %%r10 %11
%define %%r11 %12
%define %%r12 %13
%define %%r13 %14
%define %%r14 %15
%define %%r15 %16
%define %%t0 %17
%define %%t1 %18

; Input:
; r0  = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
; r1  = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
; r2  = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
; r3  = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
; r4  = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
; r5  = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
; r6  = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
; r7  = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
; r8  = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
; r9  = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}

; Output:
; r0  = {p0  o0  n0  m0  l0  k0  j0  i0  h0  g0  f0  e0  d0  c0  b0  a0}
; r1  = {p1  o1  n1  m1  l1  k1  j1  i1  h1  g1  f1  e1  d1  c1  b1  a1}
; r2  = {p2  o2  n2  m2  l2  k2  j2  i2  h2  g2  f2  e2  d2  c2  b2  a2}
; r3  = {p3  o3  n3  m3  l3  k3  j3  i3  h3  g3  f3  e3  d3  c3  b3  a3}
; r4  = {p4  o4  n4  m4  l4  k4  j4  i4  h4  g4  f4  e4  d4  c4  b4  a4}
; r5  = {p5  o5  n5  m5  l5  k5  j5  i5  h5  g5  f5  e5  d5  c5  b5  a5}
; r6  = {p6  o6  n6  m6  l6  k6  j6  i6  h6  g6  f6  e6  d6  c6  b6  a6}
; r7  = {p7  o7  n7  m7  l7  k7  j7  i7  h7  g7  f7  e7  d7  c7  b7  a7}
; r8  = {p8  o8  n8  m8  l8  k8  j8  i8  h8  g8  f8  e8  d8  c8  b8  a8}
; r9  = {p9  o9  n9  m9  l9  k9  j9  i9  h9  g9  f9  e9  d9  c9  b9  a9}
; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}


	; process top half (r0..r3) {a...d}
	vshufps	%%t0, %%r0, %%r1, 0x44	; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
	vshufps	%%r0, %%r0, %%r1, 0xEE	; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
	vshufps	%%t1, %%r2, %%r3, 0x44	; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
	vshufps	%%r2, %%r2, %%r3, 0xEE	; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}

	vshufps	%%r3, %%t0, %%t1, 0xDD	; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
	vshufps	%%r1, %%r0, %%r2, 0x88	; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
	vshufps	%%r0, %%r0, %%r2, 0xDD	; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
	vshufps	%%t0, %%t0, %%t1, 0x88	; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}

	; use r2 in place of t0
	vshufps	%%r2, %%r4, %%r5, 0x44	; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
	vshufps	%%r4, %%r4, %%r5, 0xEE	; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
	vshufps	%%t1, %%r6, %%r7, 0x44	; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
	vshufps	%%r6, %%r6, %%r7, 0xEE	; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}

	vshufps	%%r7, %%r2, %%t1, 0xDD	; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
	vshufps	%%r5, %%r4, %%r6, 0x88	; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
	vshufps	%%r4, %%r4, %%r6, 0xDD	; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
	vshufps	%%r2, %%r2, %%t1, 0x88	; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}

	; use r6 in place of t0
	vshufps	%%r6, %%r8, %%r9, 0x44	; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
	vshufps	%%r8, %%r8, %%r9, 0xEE	; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
	vshufps	%%t1, %%r10, %%r11, 0x44	; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
	vshufps	%%r10, %%r10, %%r11, 0xEE	; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}

	vshufps	%%r11, %%r6, %%t1, 0xDD	; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
	vshufps	%%r9, %%r8, %%r10, 0x88	; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
	vshufps	%%r8, %%r8, %%r10, 0xDD	; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
	vshufps	%%r6, %%r6, %%t1, 0x88	; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}

	; use r10 in place of t0
	vshufps	%%r10, %%r12, %%r13, 0x44	; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
	vshufps	%%r12, %%r12, %%r13, 0xEE	; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
	vshufps	%%t1, %%r14, %%r15, 0x44	; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
	vshufps	%%r14, %%r14, %%r15, 0xEE	; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}

	vshufps	%%r15, %%r10, %%t1, 0xDD	; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
	vshufps	%%r13, %%r12, %%r14, 0x88	; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
	vshufps	%%r12, %%r12, %%r14, 0xDD	; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
	vshufps	%%r10, %%r10, %%t1, 0x88	; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}

;; At this point, the registers that contain interesting data are:
;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
;; Can use t1 and r14 as scratch registers

	; vpermi2q overwrites its first operand (the index vector), so each
	; destination is first loaded with a copy of the mask
	vmovdqa32 %%r14, MASK0
	vpermi2q  %%r14, %%t0, %%r2	; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
	vmovdqa32 %%t1, MASK1
	vpermi2q  %%t1, %%t0, %%r2	; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}

	vmovdqa32 %%r2, MASK0
	vpermi2q  %%r2, %%r3, %%r7	; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
	vmovdqa32 %%t0, MASK1
	vpermi2q  %%t0, %%r3, %%r7	; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}

	vmovdqa32 %%r3, MASK0
	vpermi2q  %%r3, %%r1, %%r5	; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
	vmovdqa32 %%r7, MASK1
	vpermi2q  %%r7, %%r1, %%r5	; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}

	vmovdqa32 %%r1, MASK0
	vpermi2q  %%r1, %%r0, %%r4	; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
	vmovdqa32 %%r5, MASK1
	vpermi2q  %%r5, %%r0, %%r4	; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}

	vmovdqa32 %%r0, MASK0
	vpermi2q  %%r0, %%r6, %%r10	; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
	vmovdqa32 %%r4, MASK1
	vpermi2q  %%r4, %%r6, %%r10	; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}

	vmovdqa32 %%r6, MASK0
	vpermi2q  %%r6, %%r11, %%r15	; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
	vmovdqa32 %%r10, MASK1
	vpermi2q  %%r10, %%r11, %%r15	; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}

	vmovdqa32 %%r11, MASK0
	vpermi2q  %%r11, %%r9, %%r13	; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
	vmovdqa32 %%r15, MASK1
	vpermi2q  %%r15, %%r9, %%r13	; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}

	vmovdqa32 %%r9, MASK0
	vpermi2q  %%r9, %%r8, %%r12	; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
	vmovdqa32 %%r13, MASK1
	vpermi2q  %%r13, %%r8, %%r12	; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}

;; At this point r8 and r12 can be used as scratch registers

	vshuff64x2 %%r8, %%r14, %%r0, 0xEE	; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
	vshuff64x2 %%r0, %%r14, %%r0, 0x44	; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}

	vshuff64x2 %%r12, %%t1, %%r4, 0xEE	; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
	vshuff64x2 %%r4, %%t1, %%r4, 0x44	; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}

	vshuff64x2 %%r14, %%r7, %%r15, 0xEE	; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
	vshuff64x2 %%t1, %%r7, %%r15, 0x44	; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}

	vshuff64x2 %%r15, %%r5, %%r13, 0xEE	; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
	vshuff64x2 %%r7, %%r5, %%r13, 0x44	; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}

	vshuff64x2 %%r13, %%t0, %%r10, 0xEE	; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
	vshuff64x2 %%r5, %%t0, %%r10, 0x44	; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}

	vshuff64x2 %%r10, %%r3, %%r11, 0xEE	; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
	vshuff64x2 %%t0, %%r3, %%r11, 0x44	; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}

	vshuff64x2 %%r11, %%r1, %%r9, 0xEE	; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
	vshuff64x2 %%r3, %%r1, %%r9, 0x44	; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}

	vshuff64x2 %%r9, %%r2, %%r6, 0xEE	; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
	vshuff64x2 %%r1, %%r2, %%r6, 0x44	; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}

	vmovdqa32 %%r2, %%t0	; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
	vmovdqa32 %%r6, %%t1	; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}

%endmacro

;; Rotate the symbolic names of the first digest set at assembly time:
;; A <- D, B <- A, C <- B, D <- C.  No instructions are emitted.
%macro ROTATE_ARGS 0
%xdefine TMP_ D
%xdefine D C
%xdefine C B
%xdefine B A
%xdefine A TMP_
%endm

;; Same name rotation for the second digest set (A1..D1)
%macro ROTATE_ARGS1 0
%xdefine TMP_ D1
%xdefine D1 C1
%xdefine C1 B1
%xdefine B1 A1
%xdefine A1 TMP_
%endm

;;
;; single MD5 step, applied to both sets of lanes
;;
;; A = B +ROL32((A +Ft(B,C,D) +data +const), nrot)
;;eg: PROCESS_LOOP MD5constx, Mdatax, F_IMMEDx, NROTx, TMP0, TMP1
;;  %1 register holding the broadcast round constant
;;  %2 address expression of the message word for the first set of lanes;
;;     the word for the second set is read from that address + 16*64
;;  %3 vpternlogd immediate selecting the round function Ft
;;  %4 rotate count
;;  %5, %6 scratch ZMM registers
%macro PROCESS_LOOP 6
%define %%MD5const %1
%define %%data %2
%define %%F_IMMED %3
%define %%NROT %4
%define %%TMP_PR0 %5
%define %%TMP_PR1 %6
	; a=b+((a+Ft(b,c,d)+Mj+ti)<<<s)   (<<< is a left rotate)

	; Ft
	; 0-15 Ft:F(X,Y,Z)=(X&Y)|((~X)&Z) 0xca
	; 16-31 Ft:G(X,Y,Z)=(X&Z)|(Y&(~Z)) 0xe4
	; 32-47 Ft:H(X,Y,Z)=X^Y^Z 0x96
	; 48-63 Ft:I(X,Y,Z)=Y^(X|(~Z)) 0x39

	vpaddd A, A, %%MD5const
	vpaddd A1, A1, %%MD5const
	vpaddd A, A, [%%data]
	vpaddd A1, A1, [%%data + 16*64]
	vmovdqa32 %%TMP_PR0, B ; Copy B
	vmovdqa32 %%TMP_PR1, B1 ; Copy B1
	vpternlogd %%TMP_PR0, C, D, %%F_IMMED
	vpternlogd %%TMP_PR1, C1, D1, %%F_IMMED
	vpaddd A, A, %%TMP_PR0
	vpaddd A1, A1, %%TMP_PR1
	vprold A, A, %%NROT
	vprold A1, A1, %%NROT
	vpaddd A, A, B
	vpaddd A1, A1, B1

	ROTATE_ARGS
	ROTATE_ARGS1
%endmacro

; "default rel" is already in force from the top of this file; the redundant
; second directive that used to sit here was removed.
align 64
section .text
+ +; void md5_mb_x16x2_avx512(MD5_ARGS *args, UINT64 num_blks) +; arg 1 : pointer to MD5_ARGS structure +; arg 2 : number of blocks (>=1) + +local_func_decl(md5_mb_x16x2_avx512) +md5_mb_x16x2_avx512: + mov rax, rsp + sub rsp, STACK_size + and rsp, -64 + mov [rsp + _RSP_SAVE], rax + + mov DPTR1, rsp + lea DPTR2, [rsp + 64*32] + + ;; Load MD5 constant pointer to register + lea TBL, [MD5_TABLE] + vmovdqa32 MASK0, [PSHUFFLE_TRANSPOSE16_MASK1] + vmovdqa32 MASK1, [PSHUFFLE_TRANSPOSE16_MASK2] + + ;; Preload input data from 16 segments. + xor IDX, IDX + + ;; transpose input onto stack + ;; first 16 lanes read + mov inp0, [IN + 0*8] + mov inp1, [IN + 1*8] + mov inp2, [IN + 2*8] + mov inp3, [IN + 3*8] + mov inp4, [IN + 4*8] + mov inp5, [IN + 5*8] + mov inp6, [IN + 6*8] + mov inp7, [IN + 7*8] + vmovdqu32 W0,[inp0+IDX] + vmovdqu32 W1,[inp1+IDX] + vmovdqu32 W2,[inp2+IDX] + vmovdqu32 W3,[inp3+IDX] + vmovdqu32 W4,[inp4+IDX] + vmovdqu32 W5,[inp5+IDX] + vmovdqu32 W6,[inp6+IDX] + vmovdqu32 W7,[inp7+IDX] + mov inp0, [IN + 8*8] + mov inp1, [IN + 9*8] + mov inp2, [IN +10*8] + mov inp3, [IN +11*8] + mov inp4, [IN +12*8] + mov inp5, [IN +13*8] + mov inp6, [IN +14*8] + mov inp7, [IN +15*8] + vmovdqu32 W8, [inp0+IDX] + vmovdqu32 W9, [inp1+IDX] + vmovdqu32 W10,[inp2+IDX] + vmovdqu32 W11,[inp3+IDX] + vmovdqu32 W12,[inp4+IDX] + vmovdqu32 W13,[inp5+IDX] + vmovdqu32 W14,[inp6+IDX] + vmovdqu32 W15,[inp7+IDX] + ;; first 16 lanes trans&write + TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 + vmovdqa32 [DPTR1+_DATA+(0)*64],W0 + vmovdqa32 [DPTR1+_DATA+(1)*64],W1 + vmovdqa32 [DPTR1+_DATA+(2)*64],W2 + vmovdqa32 [DPTR1+_DATA+(3)*64],W3 + vmovdqa32 [DPTR1+_DATA+(4)*64],W4 + vmovdqa32 [DPTR1+_DATA+(5)*64],W5 + vmovdqa32 [DPTR1+_DATA+(6)*64],W6 + vmovdqa32 [DPTR1+_DATA+(7)*64],W7 + vmovdqa32 [DPTR1+_DATA+(8)*64],W8 + vmovdqa32 [DPTR1+_DATA+(9)*64],W9 + vmovdqa32 [DPTR1+_DATA+(10)*64],W10 + vmovdqa32 [DPTR1+_DATA+(11)*64],W11 + vmovdqa32 
[DPTR1+_DATA+(12)*64],W12 + vmovdqa32 [DPTR1+_DATA+(13)*64],W13 + vmovdqa32 [DPTR1+_DATA+(14)*64],W14 + vmovdqa32 [DPTR1+_DATA+(15)*64],W15 + + ;; second 16 lanes read + mov inp0, [IN + 16*8] + mov inp1, [IN + 17*8] + mov inp2, [IN + 18*8] + mov inp3, [IN + 19*8] + mov inp4, [IN + 20*8] + mov inp5, [IN + 21*8] + mov inp6, [IN + 22*8] + mov inp7, [IN + 23*8] + vmovdqu32 W0,[inp0+IDX] + vmovdqu32 W1,[inp1+IDX] + vmovdqu32 W2,[inp2+IDX] + vmovdqu32 W3,[inp3+IDX] + vmovdqu32 W4,[inp4+IDX] + vmovdqu32 W5,[inp5+IDX] + vmovdqu32 W6,[inp6+IDX] + vmovdqu32 W7,[inp7+IDX] + mov inp0, [IN + 24*8] + mov inp1, [IN + 25*8] + mov inp2, [IN + 26*8] + mov inp3, [IN + 27*8] + mov inp4, [IN + 28*8] + mov inp5, [IN + 29*8] + mov inp6, [IN + 30*8] + mov inp7, [IN + 31*8] + vmovdqu32 W8, [inp0+IDX] + vmovdqu32 W9, [inp1+IDX] + vmovdqu32 W10,[inp2+IDX] + vmovdqu32 W11,[inp3+IDX] + vmovdqu32 W12,[inp4+IDX] + vmovdqu32 W13,[inp5+IDX] + vmovdqu32 W14,[inp6+IDX] + vmovdqu32 W15,[inp7+IDX] + ;; second 16 lanes trans&write + TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 + vmovdqa32 [DPTR1+_DATA+(16+0)*64],W0 + vmovdqa32 [DPTR1+_DATA+(16+1)*64],W1 + vmovdqa32 [DPTR1+_DATA+(16+2)*64],W2 + vmovdqa32 [DPTR1+_DATA+(16+3)*64],W3 + vmovdqa32 [DPTR1+_DATA+(16+4)*64],W4 + vmovdqa32 [DPTR1+_DATA+(16+5)*64],W5 + vmovdqa32 [DPTR1+_DATA+(16+6)*64],W6 + vmovdqa32 [DPTR1+_DATA+(16+7)*64],W7 + vmovdqa32 [DPTR1+_DATA+(16+8)*64],W8 + vmovdqa32 [DPTR1+_DATA+(16+9)*64],W9 + vmovdqa32 [DPTR1+_DATA+(16+10)*64],W10 + vmovdqa32 [DPTR1+_DATA+(16+11)*64],W11 + vmovdqa32 [DPTR1+_DATA+(16+12)*64],W12 + vmovdqa32 [DPTR1+_DATA+(16+13)*64],W13 + vmovdqa32 [DPTR1+_DATA+(16+14)*64],W14 + vmovdqa32 [DPTR1+_DATA+(16+15)*64],W15 + + ;; Initialize digests + ;; vmovdqu32 replace vmovdqa32 + vmovdqu32 A, [DIGEST + 0 * MD5_DIGEST_ROW_SIZE] + vmovdqu32 B, [DIGEST + 1 * MD5_DIGEST_ROW_SIZE] + vmovdqu32 C, [DIGEST + 2 * MD5_DIGEST_ROW_SIZE] + vmovdqu32 D, [DIGEST + 3 * MD5_DIGEST_ROW_SIZE] 
+ ; Load the digest for each stream (9-16) + vmovdqu32 A1,[DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64] + vmovdqu32 B1,[DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64] + vmovdqu32 C1,[DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64] + vmovdqu32 D1,[DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64] + +.lloop: + ;; Increment IDX to point to next data block (64 bytes per block) + add IDX, 64 + + ; Save digests for later addition + vmovdqa32 [Z_AA], A + vmovdqa32 [Z_BB], B + vmovdqa32 [Z_CC], C + vmovdqa32 [Z_DD], D + vmovdqa32 [Z_AA1], A1 + vmovdqa32 [Z_BB1], B1 + vmovdqa32 [Z_CC1], C1 + vmovdqa32 [Z_DD1], D1 + + sub SIZE, 1 + je .LastLoop + +%assign I 0 +%assign I_fimm 0xCA +%rep 16 ; 0<=I<=15 + %assign I_rotX I/16+1 + %assign I_rotY (I % 4 + 1) + %assign I_data I + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + ;; first 16 lanes read + mov inp0, [IN + 0*8] + mov inp1, [IN + 1*8] + mov inp2, [IN + 2*8] + mov inp3, [IN + 3*8] + mov inp4, [IN + 4*8] + mov inp5, [IN + 5*8] + mov inp6, [IN + 6*8] + mov inp7, [IN + 7*8] + vmovdqu32 W0,[inp0+IDX] + vmovdqu32 W1,[inp1+IDX] + vmovdqu32 W2,[inp2+IDX] + vmovdqu32 W3,[inp3+IDX] + vmovdqu32 W4,[inp4+IDX] + vmovdqu32 W5,[inp5+IDX] + vmovdqu32 W6,[inp6+IDX] + vmovdqu32 W7,[inp7+IDX] + mov inp0, [IN + 8*8] + mov inp1, [IN + 9*8] + mov inp2, [IN +10*8] + mov inp3, [IN +11*8] + mov inp4, [IN +12*8] + mov inp5, [IN +13*8] + mov inp6, [IN +14*8] + mov inp7, [IN +15*8] + vmovdqu32 W8, [inp0+IDX] + vmovdqu32 W9, [inp1+IDX] + vmovdqu32 W10,[inp2+IDX] + vmovdqu32 W11,[inp3+IDX] + vmovdqu32 W12,[inp4+IDX] + vmovdqu32 W13,[inp5+IDX] + vmovdqu32 W14,[inp6+IDX] + vmovdqu32 W15,[inp7+IDX] + +%assign I 16 +%assign I_fimm 0xE4 +%rep 16 ; 16<=I<=31 + %assign I_data ((5*I+1) % 16) + %assign I_rotX I/16+1 + %assign I_rotY (I % 4 + 1) + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + 
+ ;; first 16 lanes trans&write + TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 + vmovdqa32 [DPTR2+_DATA+(0)*64],W0 + vmovdqa32 [DPTR2+_DATA+(1)*64],W1 + vmovdqa32 [DPTR2+_DATA+(2)*64],W2 + vmovdqa32 [DPTR2+_DATA+(3)*64],W3 + vmovdqa32 [DPTR2+_DATA+(4)*64],W4 + vmovdqa32 [DPTR2+_DATA+(5)*64],W5 + vmovdqa32 [DPTR2+_DATA+(6)*64],W6 + vmovdqa32 [DPTR2+_DATA+(7)*64],W7 + vmovdqa32 [DPTR2+_DATA+(8)*64],W8 + vmovdqa32 [DPTR2+_DATA+(9)*64],W9 + vmovdqa32 [DPTR2+_DATA+(10)*64],W10 + vmovdqa32 [DPTR2+_DATA+(11)*64],W11 + vmovdqa32 [DPTR2+_DATA+(12)*64],W12 + vmovdqa32 [DPTR2+_DATA+(13)*64],W13 + vmovdqa32 [DPTR2+_DATA+(14)*64],W14 + vmovdqa32 [DPTR2+_DATA+(15)*64],W15 + +%assign I 32 +%assign I_fimm 0x96 +%rep 16 ; 32<=I<=47 + %assign I_data ((3*I+5) % 16) + %assign I_rotX I/16+1 + %assign I_rotY (I % 4 + 1) + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + + ;; second 16 lanes read + mov inp0, [IN + 16*8] + mov inp1, [IN + 17*8] + mov inp2, [IN + 18*8] + mov inp3, [IN + 19*8] + mov inp4, [IN + 20*8] + mov inp5, [IN + 21*8] + mov inp6, [IN + 22*8] + mov inp7, [IN + 23*8] + vmovdqu32 W0,[inp0+IDX] + vmovdqu32 W1,[inp1+IDX] + vmovdqu32 W2,[inp2+IDX] + vmovdqu32 W3,[inp3+IDX] + vmovdqu32 W4,[inp4+IDX] + vmovdqu32 W5,[inp5+IDX] + vmovdqu32 W6,[inp6+IDX] + vmovdqu32 W7,[inp7+IDX] + mov inp0, [IN + 24*8] + mov inp1, [IN + 25*8] + mov inp2, [IN + 26*8] + mov inp3, [IN + 27*8] + mov inp4, [IN + 28*8] + mov inp5, [IN + 29*8] + mov inp6, [IN + 30*8] + mov inp7, [IN + 31*8] + vmovdqu32 W8, [inp0+IDX] + vmovdqu32 W9, [inp1+IDX] + vmovdqu32 W10,[inp2+IDX] + vmovdqu32 W11,[inp3+IDX] + vmovdqu32 W12,[inp4+IDX] + vmovdqu32 W13,[inp5+IDX] + vmovdqu32 W14,[inp6+IDX] + vmovdqu32 W15,[inp7+IDX] + +%assign I 48 +%assign I_fimm 0x39 +%rep 16 ; 48<=I<=63 + %assign I_rotX (I/16+1) + %assign I_rotY (I % 4 + 1) + %assign I_data ((7*I) % 16) + 
vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + + ;; second 16 lanes trans&write + TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 + vmovdqa32 [DPTR2+_DATA+(16+0)*64],W0 + vmovdqa32 [DPTR2+_DATA+(16+1)*64],W1 + vmovdqa32 [DPTR2+_DATA+(16+2)*64],W2 + vmovdqa32 [DPTR2+_DATA+(16+3)*64],W3 + vmovdqa32 [DPTR2+_DATA+(16+4)*64],W4 + vmovdqa32 [DPTR2+_DATA+(16+5)*64],W5 + vmovdqa32 [DPTR2+_DATA+(16+6)*64],W6 + vmovdqa32 [DPTR2+_DATA+(16+7)*64],W7 + vmovdqa32 [DPTR2+_DATA+(16+8)*64],W8 + vmovdqa32 [DPTR2+_DATA+(16+9)*64],W9 + vmovdqa32 [DPTR2+_DATA+(16+10)*64],W10 + vmovdqa32 [DPTR2+_DATA+(16+11)*64],W11 + vmovdqa32 [DPTR2+_DATA+(16+12)*64],W12 + vmovdqa32 [DPTR2+_DATA+(16+13)*64],W13 + vmovdqa32 [DPTR2+_DATA+(16+14)*64],W14 + vmovdqa32 [DPTR2+_DATA+(16+15)*64],W15 + + ; Add old digest + vpaddd A,A,[Z_AA] + vpaddd B,B,[Z_BB] + vpaddd C,C,[Z_CC] + vpaddd D,D,[Z_DD] + vpaddd A1,A1,[Z_AA1] + vpaddd B1,B1,[Z_BB1] + vpaddd C1,C1,[Z_CC1] + vpaddd D1,D1,[Z_DD1] + + ; Swap DPTR1 and DPTR2 + xchg DPTR1, DPTR2 + ;; Proceed to processing of next block + jmp .lloop + +.LastLoop: +%assign I 0 +%assign I_fimm 0xCA +%rep 16 ; 0<=I<=15 + %assign I_rotX I/16+1 + %assign I_rotY (I % 4 + 1) + %assign I_data I + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + +%assign I 16 +%assign I_fimm 0xE4 +%rep 16 ; 16<=I<=31 + %assign I_data ((5*I+1) % 16) + %assign I_rotX I/16+1 + %assign I_rotY (I % 4 + 1) + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + +%assign I 32 +%assign I_fimm 0x96 +%rep 16 ; 32<=I<=47 + %assign I_data ((3*I+5) % 16) + %assign I_rotX I/16+1 + %assign I_rotY (I % 4 + 1) + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ 
I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + +%assign I 48 +%assign I_fimm 0x39 +%rep 16 ; 48<=I<=63 + %assign I_rotX (I/16+1) + %assign I_rotY (I % 4 + 1) + %assign I_data ((7*I) % 16) + vpbroadcastd md5c, [TBL + I * 4] + PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1 + %assign I (I+1) +%endrep + + ; Add old digest + vpaddd A,A,[Z_AA] + vpaddd B,B,[Z_BB] + vpaddd C,C,[Z_CC] + vpaddd D,D,[Z_DD] + vpaddd A1,A1,[Z_AA1] + vpaddd B1,B1,[Z_BB1] + vpaddd C1,C1,[Z_CC1] + vpaddd D1,D1,[Z_DD1] + + ;; update into data pointers +%assign I 0 +%rep 16 + mov inp0, [IN + (2*I)*8] + mov inp1, [IN + (2*I +1)*8] + add inp0, IDX + add inp1, IDX + mov [IN + (2*I)*8], inp0 + mov [IN + (2*I+1)*8], inp1 +%assign I (I+1) +%endrep + + vmovdqu32 [DIGEST + 0*MD5_DIGEST_ROW_SIZE ], A + vmovdqu32 [DIGEST + 1*MD5_DIGEST_ROW_SIZE ], B + vmovdqu32 [DIGEST + 2*MD5_DIGEST_ROW_SIZE ], C + vmovdqu32 [DIGEST + 3*MD5_DIGEST_ROW_SIZE ], D + ; Store the digest for each stream (9-16) + vmovdqu32 [DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64], A1 + vmovdqu32 [DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64], B1 + vmovdqu32 [DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64], C1 + vmovdqu32 [DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64], D1 + + mov rsp, [rsp + _RSP_SAVE] + ret + +section .data +align 64 +MD5_TABLE: + dd 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee + dd 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 + dd 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be + dd 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 + dd 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa + dd 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 + dd 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed + dd 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a + dd 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c + dd 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 + dd 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 + dd 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 + dd 0xf4292244, 0x432aff97, 
0xab9423a7, 0xfc93a039 + dd 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 + dd 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 + dd 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 + +PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000 + dq 0x0000000000000001 + dq 0x0000000000000008 + dq 0x0000000000000009 + dq 0x0000000000000004 + dq 0x0000000000000005 + dq 0x000000000000000C + dq 0x000000000000000D + +PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002 + dq 0x0000000000000003 + dq 0x000000000000000A + dq 0x000000000000000B + dq 0x0000000000000006 + dq 0x0000000000000007 + dq 0x000000000000000E + dq 0x000000000000000F + +%else +%ifidn __OUTPUT_FORMAT__, win64 +global no_md5_mb_x16x2_avx512 +no_md5_mb_x16x2_avx512: +%endif +%endif ; HAVE_AS_KNOWS_AVX512 diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm new file mode 100644 index 00000000..1b492790 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm @@ -0,0 +1,782 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+;; code to compute octal (8 lanes, handled as two groups of 4) MD5 using AVX
+
+; clobbers all XMM registers
+; clobbers all GPRs except arg1 and r8
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" the 4x4 dword matrix held in {r0..r3} using temps {t0..t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+; Note that the result rows come back in the order t0, r1, r0, r3; r2 and t1
+; are left holding intermediate values.
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+        vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+        vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+        vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+        vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+        vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+        vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+        vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+        vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+;; Each macro writes its result to the first argument (F) and leaves X, Y
+;; and Z unmodified; F must therefore be a register distinct from X, Y, Z.
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+        vpxor %%F,%%Z, %%Y ; F = Y ^ Z
+        vpand %%F,%%F,%%X ; F = X & (Y ^ Z)
+        vpxor %%F,%%F,%%Z ; F = Z ^ (X & (Y ^ Z))
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = MAGIC_F((Z),(X),(Y)) = ((Y) ^ ((Z) & ((X) ^ (Y))))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+        MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+        vpxor %%F,%%Z, %%Y
+        vpxor %%F,%%F, %%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+; NOTE(review): ONES is defined outside this excerpt; it is presumably a
+; 16-byte all-ones constant so that the first XOR yields ~Z -- confirm.
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+        vpxor %%F,%%Z,[ONES] ; pnot %%F
+        vpor %%F,%%F,%%X
+        vpxor %%F,%%F,%%Y
+%endmacro
+
+; PROLD reg, imm, tmp
+; Rotate every 32-bit element of reg left by imm bits (reg is updated in
+; place, tmp is clobbered).
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+        vpsrld %%tmp, %%reg, (32-%%imm) ; bits that wrap around
+        vpslld %%reg, %%reg, %%imm
+        vpor %%reg, %%reg, %%tmp
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+;; The step is applied to two independent register groups: {A,B,C,D} using
+;; the data at [data] and {A2,B2,C2,D2} using the data at [data + 16*16].
+;; This variant shares one FUN and one TMP register between both groups and
+;; takes its data as memory operands, so it needs only two scratch registers.
+;; MD5const is used directly as a vpaddd source operand.
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%data %12
+%define %%MD5const %13
+%define %%nrot %14
+
+        vpaddd %%A, %%A, %%MD5const
+        vpaddd %%A2, %%A2, %%MD5const
+        vpaddd %%A, %%A, [%%data]
+        vpaddd %%A2, %%A2, [%%data + 16*16]
+        %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+        vpaddd %%A, %%A, %%FUN
+        %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+        vpaddd %%A2, %%A2, %%FUN
+        PROLD %%A,%%nrot, %%TMP
+        PROLD %%A2,%%nrot, %%TMP
+        vpaddd %%A, %%A, %%B
+        vpaddd %%A2, %%A2, %%B2
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+;; Same computation as MD5_STEP1, but each register group gets its own
+;; scratch pair (FUN/TMP and FUN2/TMP2) and the data words are first loaded
+;; into TMP/TMP2 with vmovdqa, so [data] and [data + 16*16] must be 16-byte
+;; aligned.
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+        vmovdqa %%TMP,[%%data]
+        vmovdqa %%TMP2,[%%data + 16*16]
+        vpaddd %%A, %%A, %%MD5const
+        vpaddd %%A2, %%A2, %%MD5const
+        vpaddd %%A, %%A, %%TMP
+        vpaddd %%A2, %%A2, %%TMP2
+        %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+        %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+        vpaddd %%A, %%A, %%FUN
+        vpaddd %%A2, %%A2, %%FUN2
+        PROLD %%A,%%nrot, %%TMP
+        PROLD %%A2,%%nrot, %%TMP2
+        vpaddd %%A, %%A, %%B
+        vpaddd %%A2, %%A2, %%B2
+%endmacro
+
+;;
+;; MD5 left rotations (number of bits)
+;; rotRS: R = round (1..4), S = position of the step within each group of
+;; four consecutive steps of that round (1..4)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+; Working state of the first register group
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4 ; tmp
+%define F xmm5 ; tmp
+
+; Working state of the second register group
+%define A2 xmm6
+%define B2 xmm7
+%define C2 xmm8
+%define D2 xmm9
+
+
+; Scratch registers for the MD5 step macros.
+; FUN2/TMP2 share xmm10/xmm11 with T0/T1 below.
+%define FUN E
+%define TMP F
+%define FUN2 xmm10
+%define TMP2 xmm11
+
+; Temporaries used when loading and transposing input data
+%define T0 xmm10
+%define T1 xmm11
+%define T2 xmm12
+%define T3 xmm13
+%define T4 xmm14
+%define T5 xmm15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;; Linux Registers
+%define arg1 rdi
+%define arg2 rsi
+%define inp7 rcx
+%define mem1 rdx
+%else
+;; Windows Registers
+%define arg1 rcx
+%define arg2 rdx
+%define inp7 rdi
+%define mem1 rsi
+%endif
+; r8 is not used
+
+; Common definitions
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define TBL rax
+%define IDX rbx
+%define mem2 rbp
+
+
+
+
+
+; 
Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4 \
+; ...                            \
+; 300 data2[0] for lanes 7...4    \
+; 2F0 data2[15] for lanes 3...0    > mem block 2
+; ...                             /
+; 210 data2[1] for lanes 3...0   /
+; 200 data2[0] for lanes 3...0  /
+;
+; 1F0 data1[15] for lanes 7...4 \
+; ...                            \
+; 100 data1[0] for lanes 7...4    \
+; F0 data1[15] for lanes 3...0     > mem block 1
+; ...                             /
+; 10 data1[1] for lanes 3...0    /
+; 0 data1[0] for lanes 3...0    /
+
+MEM equ 16*16*2*2 ; two blocks of data stored in stack
+; STACK_SIZE must be an odd multiple of 8 bytes in size
+; (the vmovdqa accesses below need rsp to be 16-byte aligned after the
+; "sub rsp, STACK_SIZE"; this presumes the usual x86-64 convention that rsp
+; is 8 modulo 16 on entry -- TODO confirm for all callers)
+STACK_SIZE equ MEM + 16*8 + 8
+
+; Save slots for the digest values at the start of each block
+%define AA rsp + MEM + 16*0
+%define BB rsp + MEM + 16*1
+%define CC rsp + MEM + 16*2
+%define DD rsp + MEM + 16*3
+%define AA2 rsp + MEM + 16*4
+%define BB2 rsp + MEM + 16*5
+%define CC2 rsp + MEM + 16*6
+%define DD2 rsp + MEM + 16*7
+
+;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word
+
+;#define NUM_MD5_DIGEST_WORDS 4
+;#define NUM_LANES 8
+;#define MD5_BLOCK_SIZE 64
+;
+;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES];
+;
+;typedef struct {
+; DECLARE_ALIGNED(digest_array digest, 16);
+; UINT8* data_ptr[NUM_LANES];
+;} MD5_ARGS_X8;
+
+; void md5_mb_x4x2_avx(MD5_ARGS_X8 *args, UINT64 size)
+; arg 1 : pointer to MD5_ARGS_X8 structure
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+;
+; Runs the MD5 block function over `size` consecutive 64-byte blocks of 8
+; lanes at once. The lanes are processed as two groups of four dwords:
+; A..D carry lanes 3...0 and A2..D2 carry lanes 7...4.
+;
+; On return the digest words in *args hold the updated state and each of
+; the 8 data pointers has been advanced by 64*size bytes.
+;
+; The size is not checked: it is decremented before it is tested, so a
+; value of 0 would not terminate after the first block.
+;
+; The data of the block being hashed is read from a transposed copy on the
+; stack (mem1) while the next block is loaded, transposed and stored to the
+; other stack buffer (mem2); the two buffers are swapped after every block.
+; The final block is hashed without loading any further input.
+;
+; arg1 and r8 are maintained by this function
+; All other registers named in the %define lists above (including the
+; arg2 block counter) are modified and are neither saved nor restored here.
+;
+align 32
+global md5_mb_x4x2_avx:function internal
+md5_mb_x4x2_avx:
+        sub rsp, STACK_SIZE
+
+        ;; Initialize digests
+        ;; (digest row w is 32 bytes: lanes 3...0 at +0, lanes 7...4 at +16)
+        vmovdqu A,[arg1+0*16]
+        vmovdqu B,[arg1+2*16]
+        vmovdqu C,[arg1+4*16]
+        vmovdqu D,[arg1+6*16]
+
+        vmovdqu A2,[arg1+1*16]
+        vmovdqu B2,[arg1+3*16]
+        vmovdqu C2,[arg1+5*16]
+        vmovdqu D2,[arg1+7*16]
+
+        lea TBL, [MD5_TABLE]
+
+        ;; load input pointers
+        mov inp0,[arg1 + _data_ptr + 0*8]
+        mov inp1,[arg1 + _data_ptr + 1*8]
+        mov inp2,[arg1 + _data_ptr + 2*8]
+        mov inp3,[arg1 + _data_ptr + 3*8]
+        mov inp4,[arg1 + _data_ptr + 4*8]
+        mov inp5,[arg1 + _data_ptr + 5*8]
+        mov inp6,[arg1 + _data_ptr + 6*8]
+        mov inp7,[arg1 + _data_ptr + 7*8]
+
+        xor IDX, IDX
+
+        ; Make ping-pong pointers to the two memory blocks
+        mov mem1, rsp
+        lea mem2, [rsp + 16*16*2]
+
+
+;; Load first block of data and save back to stack
+;; (each iteration handles 16 bytes = 4 message words of all 8 lanes)
+%assign I 0
+%rep 4
+        vmovdqu T2,[inp0+IDX+I*16]
+        vmovdqu T1,[inp1+IDX+I*16]
+        vmovdqu T4,[inp2+IDX+I*16]
+        vmovdqu T3,[inp3+IDX+I*16]
+        TRANSPOSE T2, T1, T4, T3, T0, T5
+        vmovdqa [mem1+(I*4+0)*16],T0
+        vmovdqa [mem1+(I*4+1)*16],T1
+        vmovdqa [mem1+(I*4+2)*16],T2
+        vmovdqa [mem1+(I*4+3)*16],T3
+
+        vmovdqu T2,[inp4+IDX+I*16]
+        vmovdqu T1,[inp5+IDX+I*16]
+        vmovdqu T4,[inp6+IDX+I*16]
+        vmovdqu T3,[inp7+IDX+I*16]
+        TRANSPOSE T2, T1, T4, T3, T0, T5
+        vmovdqa [mem1+(I*4+0)*16 + 16*16],T0
+        vmovdqa [mem1+(I*4+1)*16 + 16*16],T1
+        vmovdqa [mem1+(I*4+2)*16 + 16*16],T2
+        vmovdqa [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
+
+lloop:
+
+        ; save old digests (lanes 3...0)
+        vmovdqa [AA], A
+        vmovdqa [BB], B
+        vmovdqa [CC], C
+        vmovdqa [DD], D
+        ; save old digests (lanes 7...4)
+        vmovdqa [AA2], A2
+        vmovdqa [BB2], B2
+        vmovdqa [CC2], C2
+        vmovdqa [DD2], D2
+
+        ; IDX now addresses the block after the one held in mem1
+        add IDX, 4*16
+        sub arg2, 1
+        je lastblock
+
+        ;; Round 1 (MAGIC_F), steps 0..7
+        MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 0*16, [TBL+ 0*16], rot11
+        MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 1*16, [TBL+ 1*16], rot12
+        MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 2*16, [TBL+ 2*16], rot13
+        MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 3*16, [TBL+ 3*16], rot14
+        MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 4*16, [TBL+ 4*16], rot11
+        MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 5*16, [TBL+ 5*16], rot12
+        MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 6*16, [TBL+ 6*16], rot13
+        MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 7*16, [TBL+ 7*16], rot14
+
+        ;; next block: words 0..3, lanes 3...0 -> mem2
+%assign I 0
+        vmovdqu T2,[inp0+IDX+I*16]
+        vmovdqu T1,[inp1+IDX+I*16]
+        vmovdqu T4,[inp2+IDX+I*16]
+        vmovdqu T3,[inp3+IDX+I*16]
+        TRANSPOSE T2, T1, T4, T3, T0, T5
+        vmovdqa [mem2+(I*4+0)*16],T0
+        vmovdqa [mem2+(I*4+1)*16],T1
+        vmovdqa [mem2+(I*4+2)*16],T2
+        vmovdqa [mem2+(I*4+3)*16],T3
+
+        ;; Round 1 (MAGIC_F), steps 8..15
+        MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 8*16, [TBL+ 8*16], rot11
+        MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 9*16, [TBL+ 9*16], rot12
+        MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+10*16, [TBL+10*16], rot13
+        MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+11*16, [TBL+11*16], rot14
+        MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+12*16, [TBL+12*16], rot11
+        MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+13*16, [TBL+13*16], rot12
+        MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+14*16, [TBL+14*16], rot13
+        MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+15*16, [TBL+15*16], rot14
+
+
+        ;; next block: words 0..3, lanes 7...4 -> mem2
+        vmovdqu T2,[inp4+IDX+I*16]
+        vmovdqu T1,[inp5+IDX+I*16]
+        vmovdqu T4,[inp6+IDX+I*16]
+        vmovdqu T3,[inp7+IDX+I*16]
+        TRANSPOSE T2, T1, T4, T3, T0, T5
+        vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+        vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+        vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+        vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+        ;; Round 2 (MAGIC_G), steps 16..23
+        MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 1*16, [TBL+16*16], rot21
+        MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 6*16, [TBL+17*16], rot22
+        MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+11*16, [TBL+18*16], rot23
+        MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 0*16, [TBL+19*16], rot24
+        MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 5*16, [TBL+20*16], rot21
+        MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+10*16, [TBL+21*16], rot22
+        MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+15*16, [TBL+22*16], rot23
+        MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 4*16, [TBL+23*16], rot24
+
+        ;; next block: words 4..7, lanes 3...0 -> mem2
+        vmovdqu T2,[inp0+IDX+I*16]
+        vmovdqu T1,[inp1+IDX+I*16]
+        vmovdqu T4,[inp2+IDX+I*16]
+        vmovdqu T3,[inp3+IDX+I*16]
+        TRANSPOSE T2, T1, T4, T3, T0, T5
+        vmovdqa [mem2+(I*4+0)*16],T0
+        vmovdqa [mem2+(I*4+1)*16],T1
+        vmovdqa [mem2+(I*4+2)*16],T2
+        vmovdqa [mem2+(I*4+3)*16],T3
+
+        ;; Round 2 (MAGIC_G), steps 24..31
+        MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 9*16, [TBL+24*16], rot21
+        MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+14*16, [TBL+25*16], rot22
+        MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 3*16, [TBL+26*16], rot23
+        MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 8*16, [TBL+27*16], rot24
+        MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+13*16, [TBL+28*16], rot21
+        MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 2*16, [TBL+29*16], rot22
+        MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 7*16, [TBL+30*16], rot23
+        MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+12*16, [TBL+31*16], rot24
+
+        ;; next block: words 4..7, lanes 7...4 -> mem2
+        vmovdqu T2,[inp4+IDX+I*16]
+        vmovdqu T1,[inp5+IDX+I*16]
+        vmovdqu T4,[inp6+IDX+I*16]
+        vmovdqu T3,[inp7+IDX+I*16]
+        TRANSPOSE T2, T1, T4, T3, T0, T5
+        vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+        vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+        vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+        vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+        ;; Round 3 (MAGIC_H), steps 32..39
+        MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 5*16, [TBL+32*16], rot31
+        MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 8*16, [TBL+33*16], rot32
+        MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+11*16, [TBL+34*16], rot33
+        MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+14*16, [TBL+35*16], rot34
+        MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 1*16, [TBL+36*16], rot31
+        MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 4*16, [TBL+37*16], rot32
+        MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 7*16, [TBL+38*16], rot33
+        MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+10*16, [TBL+39*16], rot34
+
+        ;; next block: words 8..11, lanes 3...0 -> mem2
+        vmovdqu T2,[inp0+IDX+I*16]
+        vmovdqu T1,[inp1+IDX+I*16]
+        vmovdqu T4,[inp2+IDX+I*16]
+        vmovdqu T3,[inp3+IDX+I*16]
+        TRANSPOSE T2, T1, T4, T3, T0, T5
+        vmovdqa [mem2+(I*4+0)*16],T0
+        vmovdqa [mem2+(I*4+1)*16],T1
+        vmovdqa [mem2+(I*4+2)*16],T2
+        vmovdqa [mem2+(I*4+3)*16],T3
+
+        ;; Round 3 (MAGIC_H), steps 40..47
+        MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+13*16, [TBL+40*16], rot31
+        MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 0*16, [TBL+41*16], rot32
+        MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 3*16, [TBL+42*16], rot33
+        MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 6*16, [TBL+43*16], rot34
+        MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 9*16, [TBL+44*16], rot31
+        MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+12*16, [TBL+45*16], rot32
+        MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+15*16, [TBL+46*16], rot33
+        MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 2*16, [TBL+47*16], rot34
+
+        ;; next block: words 8..11, lanes 7...4 -> mem2
+        vmovdqu T2,[inp4+IDX+I*16]
+        vmovdqu T1,[inp5+IDX+I*16]
+        vmovdqu T4,[inp6+IDX+I*16]
+        vmovdqu T3,[inp7+IDX+I*16]
+        TRANSPOSE T2, T1, T4, T3, T0, T5
+        vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+        vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+        vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+        vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+        ;; Round 4 (MAGIC_I), steps 48..55
+        MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 0*16, [TBL+48*16], rot41
+        MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 7*16, [TBL+49*16], rot42
+        MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+14*16, [TBL+50*16], rot43
+        MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 5*16, [TBL+51*16], rot44
+        MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+12*16, [TBL+52*16], rot41
+        MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 3*16, [TBL+53*16], rot42
+        MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+10*16, [TBL+54*16], rot43
+        MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 1*16, [TBL+55*16], rot44
+
+        ;; next block: words 12..15, lanes 3...0 -> mem2
+        vmovdqu T2,[inp0+IDX+I*16]
+        vmovdqu T1,[inp1+IDX+I*16]
+        vmovdqu T4,[inp2+IDX+I*16]
+        vmovdqu T3,[inp3+IDX+I*16]
+        TRANSPOSE T2, T1, T4, T3, T0, T5
+        vmovdqa [mem2+(I*4+0)*16],T0
+        vmovdqa [mem2+(I*4+1)*16],T1
+        vmovdqa [mem2+(I*4+2)*16],T2
+        vmovdqa [mem2+(I*4+3)*16],T3
+
+        ;; Round 4 (MAGIC_I), steps 56..63
+        MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 8*16, [TBL+56*16], rot41
+        MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+15*16, [TBL+57*16], rot42
+        MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 6*16, [TBL+58*16], rot43
+        MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+13*16, [TBL+59*16], rot44
+        MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 4*16, [TBL+60*16], rot41
+        MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+11*16, [TBL+61*16], rot42
+        MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 2*16, [TBL+62*16], rot43
+        MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 9*16, [TBL+63*16], rot44
+
+        ;; next block: words 12..15, lanes 7...4 -> mem2
+        vmovdqu T2,[inp4+IDX+I*16]
+        vmovdqu T1,[inp5+IDX+I*16]
+        vmovdqu T4,[inp6+IDX+I*16]
+        vmovdqu T3,[inp7+IDX+I*16]
+        TRANSPOSE T2, T1, T4, T3, T0, T5
+        vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+        vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+        vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+        vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+        ; add the digest values saved at the top of the loop
+        vpaddd A,A,[AA]
+        vpaddd B,B,[BB]
+        vpaddd C,C,[CC]
+        vpaddd D,D,[DD]
+
+        vpaddd A2,A2,[AA2]
+        vpaddd B2,B2,[BB2]
+        vpaddd C2,C2,[CC2]
+        vpaddd D2,D2,[DD2]
+
+        ; swap mem1 and mem2
+        xchg mem1, mem2
+
+        jmp lloop
+
+lastblock:
+
+        ;; Final block: same 64 steps on the data in mem1, but no further
+        ;; input is loaded, so xmm10/xmm11 are free to serve as FUN2/TMP2.
+
+        ;; Round 1 (MAGIC_F)
+        MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+ 0*16], rot11
+        MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+ 1*16], rot12
+        MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+ 2*16], rot13
+        MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+ 3*16], rot14
+        MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+ 4*16], rot11
+        MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+ 5*16], rot12
+        MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+ 6*16], rot13
+        MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+ 7*16], rot14
+        MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+ 8*16], rot11
+        MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+ 9*16], rot12
+        MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+10*16], rot13
+        MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+11*16], rot14
+        MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+12*16], rot11
+        MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+13*16], rot12
+        MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+14*16], rot13
+        MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+15*16], rot14
+
+        ;; Round 2 (MAGIC_G)
+        MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+16*16], rot21
+        MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+17*16], rot22
+        MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+18*16], rot23
+        MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+19*16], rot24
+        MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+20*16], rot21
+        MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+21*16], rot22
+        MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+22*16], rot23
+        MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+23*16], rot24
+        MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+24*16], rot21
+        MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+25*16], rot22
+        MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+26*16], rot23
+        MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+27*16], rot24
+        MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+28*16], rot21
+        MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+29*16], rot22
+        MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+30*16], rot23
+        MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+31*16], rot24
+
+        ;; Round 3 (MAGIC_H)
+        MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+32*16], rot31
+        MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+33*16], rot32
+        MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+34*16], rot33
+        MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+35*16], rot34
+        MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+36*16], rot31
+        MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+37*16], rot32
+        MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+38*16], rot33
+        MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+39*16], rot34
+        MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+40*16], rot31
+        MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+41*16], rot32
+        MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+42*16], rot33
+        MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+43*16], rot34
+        MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+44*16], rot31
+        MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+45*16], rot32
+        MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+46*16], rot33
+        MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+47*16], rot34
+
+        ;; Round 4 (MAGIC_I)
+        MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+48*16], rot41
+        MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+49*16], rot42
+        MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+50*16], rot43
+        MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+51*16], rot44
+        MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+52*16], rot41
+        MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+53*16], rot42
+        MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+54*16], rot43
+        MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+55*16], rot44
+        MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+56*16], rot41
+        MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+57*16], rot42
+        MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+58*16], rot43
+        MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+59*16], rot44
+        MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+60*16], rot41
+        MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+61*16], rot42
+        MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+62*16], rot43
+        MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+63*16], rot44
+
+        ; add the digest values saved before this block
+        vpaddd A,A,[AA]
+        vpaddd B,B,[BB]
+        vpaddd C,C,[CC]
+        vpaddd D,D,[DD]
+
+        vpaddd A2,A2,[AA2]
+        vpaddd B2,B2,[BB2]
+        vpaddd C2,C2,[CC2]
+        vpaddd D2,D2,[DD2]
+
+        ; write out digests
+        vmovdqu [arg1+0*16], A
+        vmovdqu [arg1+2*16], B
+        vmovdqu [arg1+4*16], C
+        vmovdqu [arg1+6*16], D
+
+        vmovdqu [arg1+1*16], A2
+        vmovdqu [arg1+3*16], B2
+        vmovdqu [arg1+5*16], C2
+        vmovdqu [arg1+7*16], D2
+
+        ;; update input pointers
+        ;; (IDX = 64 * number of blocks processed)
+        add inp0, IDX
+        add inp1, IDX
+        add inp2, IDX
+        add inp3, IDX
+        add inp4, IDX
+        add inp5, IDX
+        add inp6, IDX
+        add inp7, IDX
+        mov [arg1 + _data_ptr + 0*8], inp0
+        mov [arg1 + _data_ptr + 1*8], inp1
+        mov [arg1 + _data_ptr + 2*8], inp2
+        mov [arg1 + _data_ptr + 3*8], inp3
+        mov [arg1 + _data_ptr + 4*8], inp4
+        mov [arg1 + _data_ptr + 5*8], inp5
+        mov [arg1 + _data_ptr + 6*8], inp6
+        mov [arg1 + _data_ptr + 7*8], inp7
+
+        ;;;;;;;;;;;;;;;;
+        ;; Postamble
+        add rsp, STACK_SIZE
+
+        ret
+
+section .data align=64
+
+; MD5 step constants. Every constant is stored four times in a row so that
+; a single 16-byte memory operand [TBL + n*16] supplies it to all four
+; dword lanes of an xmm register.
+align 64
+MD5_TABLE:
+        dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+        dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+        dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+        dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+        dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+        dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+        dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+        dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+        dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+        dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+        dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+        dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+        dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+        dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+        dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+        dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+        dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+        dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+        dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+        dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+        dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+        dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+        dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+        dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+        dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+        dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+        dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+        dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+        dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+        dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+        dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+        dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+        dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+        dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+        dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+        dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 + dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 + dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 + dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 + dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 + dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa + dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085 + dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 + dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 + dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 + dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 + dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 + dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 + dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 + dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 + dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 + dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 + dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 + dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d + dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 + dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f + dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 + dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 + dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 + dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 + dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 + dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb + dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 + +ONES: + dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm new file mode 100644 index 00000000..f3fc29ec --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm @@ -0,0 +1,778 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "md5_mb_mgr_datastruct.asm" +%include "reg_sizes.asm" +default rel + +; clobbers all XMM registers +; clobbers all GPRs except arg1 and r8 + +;; code to compute octal (8-lane) MD5 using SSE + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t1} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +; (r2 and t1 are left holding intermediate values) +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + movdqa %%t0, %%r0 + shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + movdqa %%t1, %%r2 + shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + movdqa %%r1, %%t0 + shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + movdqa %%r3, %%r0 + shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + +;; +;; Magic functions defined in RFC 1321 +;; +; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z)))) +%macro MAGIC_F 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + movdqa %%F,%%Z + pxor %%F,%%Y + pand %%F,%%X + pxor %%F,%%Z +%endmacro + +; macro MAGIC_G F,X,Y,Z ;; F = MAGIC_F((Z),(X),(Y)) = ((Y) ^ ((Z) & ((X) ^ (Y)))) +%macro MAGIC_G 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + MAGIC_F %%F,%%Z,%%X,%%Y +%endmacro + +; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z)) +%macro MAGIC_H 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + movdqa %%F,%%Z + pxor %%F,%%Y + pxor %%F,%%X +%endmacro + +; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z))) +%macro MAGIC_I 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + movdqa %%F,%%Z + pxor
%%F,[ONES] ; pnot %%F + por %%F,%%X + pxor %%F,%%Y +%endmacro + +; PROLD reg, imm, tmp ;; rotate each 32-bit lane of reg left by imm bits (tmp is clobbered) +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psrld %%tmp, (32-%%imm) + pslld %%reg, %%imm + por %%reg, %%tmp +%endmacro + +;; +;; single MD5 step, applied to both register groups (A..D and A2..D2) +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +;; data for the second group is read from [data + 16*16]; one FUN/TMP +;; scratch pair is shared by both groups +;; +; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot +%macro MD5_STEP1 14 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%data %12 +%define %%MD5const %13 +%define %%nrot %14 + + paddd %%A, %%MD5const + paddd %%A2, %%MD5const + paddd %%A, [%%data] + paddd %%A2, [%%data + 16*16] + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + paddd %%A, %%FUN + %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2 + paddd %%A2, %%FUN + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP + paddd %%A, %%B + paddd %%A2, %%B2 +%endmacro + +;; +;; single MD5 step, same computation as MD5_STEP1 but with a separate +;; scratch pair (FUN2/TMP2) for the second register group +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data, +; MD5const, nrot +%macro MD5_STEP 16 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%FUN2 %12 +%define %%TMP2 %13 +%define %%data %14 +%define %%MD5const %15 +%define %%nrot %16 + + paddd %%A, %%MD5const + paddd %%A2, %%MD5const + paddd %%A, [%%data] + paddd %%A2, [%%data + 16*16] + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2 + paddd %%A, %%FUN + paddd %%A2, %%FUN2 + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP2 + paddd %%A, %%B + paddd %%A2, %%B2 +%endmacro + +;; +;; MD5 left rotations (number of bits) +;; +rot11 equ 7 +rot12 equ 12 +rot13 equ 17 +rot14 equ 22 +rot21 equ 5 +rot22 equ 9 +rot23 equ 14 +rot24 equ 20
+rot31 equ 4 +rot32 equ 11 +rot33 equ 16 +rot34 equ 23 +rot41 equ 6 +rot42 equ 10 +rot43 equ 15 +rot44 equ 21 + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 ; tmp +%define F xmm5 ; tmp + +%define A2 xmm6 +%define B2 xmm7 +%define C2 xmm8 +%define D2 xmm9 + + +%define FUN E +%define TMP F +%define FUN2 xmm10 +%define TMP2 xmm11 + +%define T0 xmm10 +%define T1 xmm11 +%define T2 xmm12 +%define T3 xmm13 +%define T4 xmm14 +%define T5 xmm15 +; NOTE: FUN2/TMP2 are the same registers as T0/T1 (xmm10/xmm11) + +%ifidn __OUTPUT_FORMAT__, elf64 +;; Linux Registers +%define arg1 rdi +%define arg2 rsi +%define inp7 rcx +%define mem1 rdx +%else +;; Windows Registers +%define arg1 rcx +%define arg2 rdx +%define inp7 rdi +%define mem1 rsi +%endif +; r8 is not used + +; Common definitions +%define inp0 r9 +%define inp1 r10 +%define inp2 r11 +%define inp3 r12 +%define inp4 r13 +%define inp5 r14 +%define inp6 r15 + +%define TBL rax +%define IDX rbx +%define mem2 rbp + + +; Stack Layout (offsets from rsp, in hex) +; +; 470 DD2 +; 460 CC2 +; 450 BB2 +; 440 AA2 +; 430 DD +; 420 CC +; 410 BB +; 400 AA +; +; 3F0 data2[15] for lanes 7...4 \ +; ... \ +; 300 data2[0] for lanes 7...4 \ +; 2F0 data2[15] for lanes 3...0 > mem block 2 +; ... / +; 210 data2[1] for lanes 3...0 / +; 200 data2[0] for lanes 3...0 / +; +; 1F0 data1[15] for lanes 7...4 \ +; ... \ +; 100 data1[0] for lanes 7...4 \ +; F0 data1[15] for lanes 3...0 > mem block 1 +; ...
/ +; 10 data1[1] for lanes 3...0 / +; 0 data1[0] for lanes 3...0 / + +MEM equ 16*16*2*2 ; two blocks of data stored in stack +; STACK_SIZE must be an odd multiple of 8 bytes in size +; (presumably so that rsp is 16-byte aligned after "sub rsp, STACK_SIZE" when the +; call site was 16-byte aligned; the movdqa accesses to the stack need that +; alignment -- TODO confirm against the callers) +STACK_SIZE equ MEM + 16*8 + 8 + +%define AA rsp + MEM + 16*0 +%define BB rsp + MEM + 16*1 +%define CC rsp + MEM + 16*2 +%define DD rsp + MEM + 16*3 +%define AA2 rsp + MEM + 16*4 +%define BB2 rsp + MEM + 16*5 +%define CC2 rsp + MEM + 16*6 +%define DD2 rsp + MEM + 16*7 + +;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word + +;#define NUM_MD5_DIGEST_WORDS 4 +;#define NUM_LANES 8 +;#define MD5_BLOCK_SIZE 64 +; +;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES]; +; +;typedef struct { +; DECLARE_ALIGNED(digest_array digest, 16); +; UINT8* data_ptr[NUM_LANES]; +;} MD5_ARGS_X8; + +; void md5_mb_x4x2_sse(MD5_ARGS_X8 *args, UINT64 size) +; arg 1 : pointer to MD5_ARGS_X8 structure +; arg 2 : size (in blocks) ;; assumed to be >= 1 +; +; arg1 and r8 are maintained by this function +; +; NOTE: rbx, rbp and r12-r15 (IDX, mem2, inp3-inp6) are overwritten and are not +; saved/restored by this routine; arg2 is consumed as the block counter. +; +align 32 +global md5_mb_x4x2_sse:function internal +md5_mb_x4x2_sse: + sub rsp, STACK_SIZE + + ;; Initialize digests (lanes 0..3 of each digest word) + movdqu A,[arg1+0*16] + movdqu B,[arg1+2*16] + movdqu C,[arg1+4*16] + movdqu D,[arg1+6*16] + + ;; Initialize digests (lanes 4..7 of each digest word) + movdqu A2,[arg1+1*16] + movdqu B2,[arg1+3*16] + movdqu C2,[arg1+5*16] + movdqu D2,[arg1+7*16] + + lea TBL, [MD5_TABLE] + + ;; load input pointers + mov inp0,[arg1 + _data_ptr + 0*8] + mov inp1,[arg1 + _data_ptr + 1*8] + mov inp2,[arg1 + _data_ptr + 2*8] + mov inp3,[arg1 + _data_ptr + 3*8] + mov inp4,[arg1 + _data_ptr + 4*8] + mov inp5,[arg1 + _data_ptr + 5*8] + mov inp6,[arg1 + _data_ptr + 6*8] + mov inp7,[arg1 + _data_ptr + 7*8] + xor IDX, IDX + + ; Make ping-pong pointers to the two memory blocks + mov mem1, rsp + lea mem2, [rsp + 16*16*2] + + +;; Load first block of data and save back to stack +%assign I 0 +%rep 4 + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] +
TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem1+(I*4+0)*16],T0 + movdqa [mem1+(I*4+1)*16],T1 + movdqa [mem1+(I*4+2)*16],T2 + movdqa [mem1+(I*4+3)*16],T3 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem1+(I*4+0)*16 + 16*16],T0 + movdqa [mem1+(I*4+1)*16 + 16*16],T1 + movdqa [mem1+(I*4+2)*16 + 16*16],T2 + movdqa [mem1+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) +%endrep + +lloop: + ; save old digests + movdqa [AA], A + movdqa [BB], B + movdqa [CC], C + movdqa [DD], D + ; save old digests + movdqa [AA2], A2 + movdqa [BB2], B2 + movdqa [CC2], C2 + movdqa [DD2], D2 + + add IDX, 4*16 + sub arg2, 1 + je lastblock + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14 + +%assign I 0 + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, 
[TBL+11*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14 + + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24 + + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, 
FUN,TMP, mem1 +13*16, [TBL+28*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34 + + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31 + MD5_STEP1 MAGIC_H, 
D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44 + + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42 + 
MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + + paddd A,[AA] + paddd B,[BB] + paddd C,[CC] + paddd D,[DD] + + paddd A2,[AA2] + paddd B2,[BB2] + paddd C2,[CC2] + paddd D2,[DD2] + + ; swap mem1 and mem2 + xchg mem1, mem2 + + jmp lloop + +lastblock: + + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, 
[TBL+13*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14 + + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24 + + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33 + 
MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34 + + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44 + MD5_STEP MAGIC_I, 
A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44 + + paddd A,[AA] + paddd B,[BB] + paddd C,[CC] + paddd D,[DD] + + paddd A2,[AA2] + paddd B2,[BB2] + paddd C2,[CC2] + paddd D2,[DD2] + + ; write out digests + movdqu [arg1+0*16], A + movdqu [arg1+2*16], B + movdqu [arg1+4*16], C + movdqu [arg1+6*16], D + movdqu [arg1+1*16], A2 + movdqu [arg1+3*16], B2 + movdqu [arg1+5*16], C2 + movdqu [arg1+7*16], D2 + + ;; update input pointers + add inp0, IDX + add inp1, IDX + add inp2, IDX + add inp3, IDX + add inp4, IDX + add inp5, IDX + add inp6, IDX + add inp7, IDX + mov [arg1 + _data_ptr + 0*8], inp0 + mov [arg1 + _data_ptr + 1*8], inp1 + mov [arg1 + _data_ptr + 2*8], inp2 + mov [arg1 + _data_ptr + 3*8], inp3 + mov [arg1 + _data_ptr + 4*8], inp4 + mov [arg1 + _data_ptr + 5*8], inp5 + mov [arg1 + _data_ptr + 6*8], inp6 + mov [arg1 + _data_ptr + 7*8], inp7 + + ;;;;;;;;;;;;;;;; + ;; Postamble + add rsp, STACK_SIZE + + ret + +section .data align=64 + +align 64 +MD5_TABLE: + dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478 + dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756 + dd 0x242070db, 0x242070db, 0x242070db, 0x242070db + dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee + dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf + dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a + dd 0xa8304613, 0xa8304613, 
0xa8304613, 0xa8304613 + dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501 + dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8 + dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af + dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1 + dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be + dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122 + dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193 + dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e + dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821 + dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562 + dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340 + dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51 + dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa + dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d + dd 0x02441453, 0x02441453, 0x02441453, 0x02441453 + dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681 + dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8 + dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6 + dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6 + dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87 + dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed + dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905 + dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8 + dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9 + dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a + dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942 + dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681 + dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122 + dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c + dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 + dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 + dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 + dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 + dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 + dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa + dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085 + dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 + dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 + dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 + dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 + dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 + dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 + dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 + dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 + dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 + dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 + dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d + dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 + dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f + dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 + dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 + dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 + dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 + dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 + dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb + dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 + +ONES: + dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm new file mode 100644 index 00000000..818c0ebb --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm @@ -0,0 +1,917 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+;; Computes MD5 for 16 lanes at once (two sets of 8 dword lanes, one YMM register per set) using AVX2
+
+;; Stack must be aligned to 32 bytes before call (NOTE(review): the prologue also re-aligns rsp itself with "and rsp, -32" -- confirm this is still required)
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers ymm0-15
+
+;; clobbers all GPRs other than arg1 and rbp (rsp is restored on exit; none of the clobbered registers is saved by this routine)
+
+%ifidn __OUTPUT_FORMAT__, win64
+  %define arg1 rcx    ; 1st integer argument (Windows x64 convention)
+  %define arg2 rdx    ; 2nd integer argument
+  %define reg3 rdi    ; scratch, used as DPTR2
+  %define reg4 rsi    ; scratch, used as IDX
+%else
+  %define arg1 rdi    ; 1st integer argument (System V convention)
+  %define arg2 rsi    ; 2nd integer argument
+  %define reg3 rcx    ; scratch, used as DPTR2
+  %define reg4 rdx    ; scratch, used as IDX
+%endif
+
+;; rbp is not clobbered
+
+%define state arg1      ; pointer to the MD5_ARGS structure (digests + per-lane data pointers)
+%define num_blks arg2   ; number of 64-byte blocks to process per lane; decremented in place
+
+%define inp0 r8         ; inp0..inp7: input pointers of the 8 lanes currently being loaded
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+;; These are pointers to data block1 and block2 in the stack
+; which will ping pong back and forth (swapped with xchg after each non-final block)
+%define DPTR1 rbx       ; transposed data of the block being hashed
+%define DPTR2 reg3      ; destination for the transposed data of the next block
+
+%define TBL rax         ; address of MD5_TABLE
+%define IDX reg4        ; byte offset into every lane's input, advanced by 64 per block
+
+;; Transposed Digest Storage (Y_A..Y_D: first 8 lanes, Y_A2..Y_D2: second 8 lanes)
+%define Y_A ymm0
+%define Y_B ymm1
+%define Y_C ymm2
+%define Y_D ymm3
+%define Y_A2 ymm4
+%define Y_B2 ymm5
+%define Y_C2 ymm6
+%define Y_D2 ymm7
+
+;; Temp YMM registers corresponding to the Temp XMM registers
+;; used during the transposition of the digests
+%define Y_KTMP1 ymm12
+%define Y_KTMP2 ymm13
+;; Temporary registers used during MD5 round operations
+%define Y_FUN ymm8
+%define Y_TMP ymm9
+%define Y_FUN2 ymm10
+%define Y_TMP2 ymm11
+
+
+;; YMM registers used during data fetching.
+;; Data are stored into the stack after transposition
+%define Y_DAT0 ymm8     ; same register as Y_FUN
+%define Y_DAT1 ymm9     ; same register as Y_TMP
+%define Y_DAT2 ymm10    ; same register as Y_FUN2
+%define Y_DAT3 ymm11    ; same register as Y_TMP2
+%define Y_DAT4 ymm12
+%define Y_DAT5 ymm13
+%define Y_DAT6 ymm14
+%define Y_DAT7 ymm15
+
+;; Temporary registers used during data transposition
+%define Y_DTMP1 ymm0    ; same register as Y_A: Y_A must be saved around TRANSPOSE8
+%define Y_DTMP2 ymm1    ; same register as Y_B: Y_B must be saved around TRANSPOSE8
+
+
+%define RESY resb 32*
+;; "RESY n" reserves n 32-byte (YMM-sized) slots. The prologue rounds rsp down to a 32-byte
+;; boundary (and rsp, -32), so every slot is 32-byte aligned; the caller's rsp is kept in _RSP_SAVE
+struc STACK
+_DATA: RESY 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST: RESY 8 ; stores Y_AA-Y_DD, Y_AA2-Y_DD2
+_TMPDIGEST: RESY 2 ; stores Y_A, Y_B temporarily (while TRANSPOSE8 uses ymm0/ymm1 as temps)
+_RSP_SAVE: RESQ 1 ; original RSP
+endstruc
+
+
+%define Y_AA rsp + _DIGEST + 32*0   ; Y_AA..Y_DD2: stack addresses (used inside []) of the digests saved at block start
+%define Y_BB rsp + _DIGEST + 32*1
+%define Y_CC rsp + _DIGEST + 32*2
+%define Y_DD rsp + _DIGEST + 32*3
+%define Y_AA2 rsp + _DIGEST + 32*4
+%define Y_BB2 rsp + _DIGEST + 32*5
+%define Y_CC2 rsp + _DIGEST + 32*6
+%define Y_DD2 rsp + _DIGEST + 32*7
+
+%define MD5_DIGEST_ROW_SIZE (16*4)   ; bytes per digest row: one dword for each of the 16 lanes
+
+;;
+;; MD5 left rotations (number of bits); rotNM = M-th rotation amount of round N
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in
{r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+
+; All ten arguments are YMM registers; t0 and t1 are overwritten (clobbered)
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+
+        ; process top half (r0..r3) {a...d}: 4x4 dword transpose inside each 128-bit half
+        vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+        vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+        vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+        vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+        vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+        vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+        vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+        vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+
+        ; use r2 in place of t0 (its old contents were consumed above; t0 must stay live)
+        ; process bottom half (r4..r7) {e...h}
+        vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+        vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+        vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+        vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+        vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+        vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+        vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+        vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+        ; combine 128-bit halves: imm 0x13 pairs the two high halves, imm 0x02 the two low halves
+        vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+        vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+        vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+        vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+        vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+        vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+        vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+        vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
+
+
+;;
+;; Magic functions defined in RFC 1321 (each writes its result to F; X, Y, Z are left unchanged)
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+        vpxor %%F,%%Z, %%Y     ; F = Y ^ Z
+        vpand %%F,%%F,%%X      ; F = X & (Y ^ Z)
+        vpxor %%F,%%F,%%Z      ; F = Z ^ (X & (Y ^ Z))
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y)), i.e. MAGIC_F with its inputs rotated
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+        MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+        vpxor %%F,%%Z, %%Y
+        vpxor %%F,%%F, %%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+        vpxor %%F,%%Z,[ONES] ; F = ~Z (xor with the all-ones constant at ONES)
+        vpor %%F,%%F,%%X     ; F = X | ~Z
+        vpxor %%F,%%F,%%Y    ; F = Y ^ (X | ~Z)
+%endmacro
+
+; PROLD reg, imm, tmp ;; rotate every dword of reg left by imm bits; tmp is clobbered
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+        vpsrld %%tmp, %%reg, (32-%%imm)  ; tmp = bits that wrap around to the bottom
+        vpslld %%reg, %%reg, %%imm
+        vpor %%reg, %%reg, %%tmp
+%endmacro
+
+;;
+;; single MD5 step, applied to both lane sets (A..D and A2..D2)
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%rA %2
+%define %%rB %3
+%define %%rC %4
+%define %%rD %5
+%define %%rA2 %6
+%define %%rB2 %7
+%define %%rC2 %8
+%define %%rD2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+        vpaddd %%rA, %%rA,
%%MD5const + vpaddd %%rA2, %%rA2, %%MD5const + vpaddd %%rA, %%rA, [%%data] + vpaddd %%rA2, %%rA2, [%%data + 16*32] + %%MAGIC_FUN %%FUN, %%rB,%%rC,%%rD + %%MAGIC_FUN %%FUN2, %%rB2,%%rC2,%%rD2 + vpaddd %%rA, %%rA, %%FUN + vpaddd %%rA2, %%rA2, %%FUN2 + PROLD %%rA,%%nrot, %%TMP + PROLD %%rA2,%%nrot, %%TMP2 + vpaddd %%rA, %%rA, %%rB + vpaddd %%rA2, %%rA2, %%rB2 +%endmacro + +align 32 + +; void md5_mb_x8x2_avx2(MD5_ARGS *args, UINT64 num_blks) +; arg 1 : pointer to MD5_ARGS structure +; arg 2 : number of blocks (>=1) + +global md5_mb_x8x2_avx2:function internal +md5_mb_x8x2_avx2: + mov rax, rsp + sub rsp, STACK_size + and rsp, -32 + mov [rsp + _RSP_SAVE], rax + + mov DPTR1, rsp + lea DPTR2, [rsp + 32*32] + + ;; Load MD5 constant pointer to register + lea TBL, [MD5_TABLE] + + ; Initialize index for data retrieval + xor IDX, IDX + + ;; Fetch Pointers to Data Stream 1 to 8 + mov inp0,[state + _data_ptr + 0*8] + mov inp1,[state + _data_ptr + 1*8] + mov inp2,[state + _data_ptr + 2*8] + mov inp3,[state + _data_ptr + 3*8] + mov inp4,[state + _data_ptr + 4*8] + mov inp5,[state + _data_ptr + 5*8] + mov inp6,[state + _data_ptr + 6*8] + mov inp7,[state + _data_ptr + 7*8] + +%assign I 0 +%rep 2 + vmovdqu Y_DAT0,[inp0+IDX+I*32] + vmovdqu Y_DAT1,[inp1+IDX+I*32] + vmovdqu Y_DAT2,[inp2+IDX+I*32] + vmovdqu Y_DAT3,[inp3+IDX+I*32] + vmovdqu Y_DAT4,[inp4+IDX+I*32] + vmovdqu Y_DAT5,[inp5+IDX+I*32] + vmovdqu Y_DAT6,[inp6+IDX+I*32] + vmovdqu Y_DAT7,[inp7+IDX+I*32] + TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 + vmovdqa [DPTR1+_DATA+(I*8+0)*32],Y_DAT0 + vmovdqa [DPTR1+_DATA+(I*8+1)*32],Y_DAT1 + vmovdqa [DPTR1+_DATA+(I*8+2)*32],Y_DAT2 + vmovdqa [DPTR1+_DATA+(I*8+3)*32],Y_DAT3 + vmovdqa [DPTR1+_DATA+(I*8+4)*32],Y_DAT4 + vmovdqa [DPTR1+_DATA+(I*8+5)*32],Y_DAT5 + vmovdqa [DPTR1+_DATA+(I*8+6)*32],Y_DAT6 + vmovdqa [DPTR1+_DATA+(I*8+7)*32],Y_DAT7 + +%assign I (I+1) +%endrep + + ;; Fetch Pointers to Data Stream 9 to 16 + mov inp0,[state + _data_ptr + 8*8] 
+ mov inp1,[state + _data_ptr + 9*8] + mov inp2,[state + _data_ptr + 10*8] + mov inp3,[state + _data_ptr + 11*8] + mov inp4,[state + _data_ptr + 12*8] + mov inp5,[state + _data_ptr + 13*8] + mov inp6,[state + _data_ptr + 14*8] + mov inp7,[state + _data_ptr + 15*8] + +%assign I 0 +%rep 2 + + vmovdqu Y_DAT0,[inp0+IDX+I*32] + vmovdqu Y_DAT1,[inp1+IDX+I*32] + vmovdqu Y_DAT2,[inp2+IDX+I*32] + vmovdqu Y_DAT3,[inp3+IDX+I*32] + vmovdqu Y_DAT4,[inp4+IDX+I*32] + vmovdqu Y_DAT5,[inp5+IDX+I*32] + vmovdqu Y_DAT6,[inp6+IDX+I*32] + vmovdqu Y_DAT7,[inp7+IDX+I*32] + TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 + vmovdqa [DPTR1+_DATA+((I+2)*8+0)*32],Y_DAT0 + vmovdqa [DPTR1+_DATA+((I+2)*8+1)*32],Y_DAT1 + vmovdqa [DPTR1+_DATA+((I+2)*8+2)*32],Y_DAT2 + vmovdqa [DPTR1+_DATA+((I+2)*8+3)*32],Y_DAT3 + vmovdqa [DPTR1+_DATA+((I+2)*8+4)*32],Y_DAT4 + vmovdqa [DPTR1+_DATA+((I+2)*8+5)*32],Y_DAT5 + vmovdqa [DPTR1+_DATA+((I+2)*8+6)*32],Y_DAT6 + vmovdqa [DPTR1+_DATA+((I+2)*8+7)*32],Y_DAT7 + +%assign I (I+1) +%endrep + ;; digests are already transposed + vmovdqu Y_A,[state + 0 * MD5_DIGEST_ROW_SIZE ] + vmovdqu Y_B,[state + 1 * MD5_DIGEST_ROW_SIZE ] + vmovdqu Y_C,[state + 2 * MD5_DIGEST_ROW_SIZE ] + vmovdqu Y_D,[state + 3 * MD5_DIGEST_ROW_SIZE ] + + ; Load the digest for each stream (9-16) + vmovdqu Y_A2,[state + 0 * MD5_DIGEST_ROW_SIZE + 32] + vmovdqu Y_B2,[state + 1 * MD5_DIGEST_ROW_SIZE + 32] + vmovdqu Y_C2,[state + 2 * MD5_DIGEST_ROW_SIZE + 32] + vmovdqu Y_D2,[state + 3 * MD5_DIGEST_ROW_SIZE + 32] + +lloop: + + ; save old digests to stack + vmovdqa [Y_AA], Y_A + vmovdqa [Y_BB], Y_B + vmovdqa [Y_CC], Y_C + vmovdqa [Y_DD], Y_D + + vmovdqa [Y_AA2], Y_A2 + vmovdqa [Y_BB2], Y_B2 + vmovdqa [Y_CC2], Y_C2 + vmovdqa [Y_DD2], Y_D2 + + ;; Increment IDX to point to next data block (64 bytes per block) + add IDX, 64 + + ;; Update size of remaining blocks to process + sub num_blks, 1 + je lastblock + + ; Perform the 64 rounds of processing ... 
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14 + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14 + + + ;; Fetch Pointers to Data Stream 1 to 8 ?? + mov inp0,[state + _data_ptr + 0*8] + mov inp1,[state + _data_ptr + 1*8] + mov inp2,[state + _data_ptr + 2*8] + mov inp3,[state + _data_ptr + 3*8] + mov inp4,[state + _data_ptr + 4*8] + mov inp5,[state + _data_ptr + 5*8] + mov inp6,[state + _data_ptr + 6*8] + mov inp7,[state + _data_ptr + 7*8] + + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14 + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12 
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14 + +%assign I 0 + + ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2 + ; Therefore we need to save these to stack and restore after transpose + vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A + vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B + + vmovdqu Y_DAT0,[inp0+IDX+I*32] + vmovdqu Y_DAT1,[inp1+IDX+I*32] + vmovdqu Y_DAT2,[inp2+IDX+I*32] + vmovdqu Y_DAT3,[inp3+IDX+I*32] + vmovdqu Y_DAT4,[inp4+IDX+I*32] + vmovdqu Y_DAT5,[inp5+IDX+I*32] + vmovdqu Y_DAT6,[inp6+IDX+I*32] + vmovdqu Y_DAT7,[inp7+IDX+I*32] + TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 + vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0 + vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1 + vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2 + vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3 + vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4 + vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5 + vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6 + vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7 + + ; Restore Y_A and Y_B + vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32] + vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32] + + + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24 + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22 + 
MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24 + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24 + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24 + +%assign I (I+1) + + ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2 + ; Therefore we need to save these to stack and restore after transpose + vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A + vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B + + vmovdqu Y_DAT0,[inp0+IDX+I*32] + vmovdqu Y_DAT1,[inp1+IDX+I*32] + vmovdqu Y_DAT2,[inp2+IDX+I*32] + vmovdqu Y_DAT3,[inp3+IDX+I*32] + vmovdqu Y_DAT4,[inp4+IDX+I*32] + vmovdqu Y_DAT5,[inp5+IDX+I*32] + vmovdqu Y_DAT6,[inp6+IDX+I*32] + vmovdqu Y_DAT7,[inp7+IDX+I*32] + TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 + vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0 + vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1 + vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2 + vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3 + vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4 + vmovdqa 
[DPTR2+_DATA+(I*8+5)*32],Y_DAT5 + vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6 + vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7 + + ; Restore Y_A and Y_B + vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32] + vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32] + + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34 + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34 + + ;; Fetch Pointers to Data Stream 9 to 16 + mov inp0,[state + _data_ptr + 8*8] + mov inp1,[state + _data_ptr + 9*8] + mov inp2,[state + _data_ptr + 10*8] + mov inp3,[state + _data_ptr + 11*8] + mov inp4,[state + _data_ptr + 12*8] + mov inp5,[state + _data_ptr + 13*8] + mov inp6,[state + _data_ptr + 14*8] + mov inp7,[state + _data_ptr + 15*8] + + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34 + 
MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34 + +%assign I 0 + + ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2 + ; Therefore we need to save these to stack and restore after transpose + vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A + vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B + + vmovdqu Y_DAT0,[inp0+IDX+I*32] + vmovdqu Y_DAT1,[inp1+IDX+I*32] + vmovdqu Y_DAT2,[inp2+IDX+I*32] + vmovdqu Y_DAT3,[inp3+IDX+I*32] + vmovdqu Y_DAT4,[inp4+IDX+I*32] + vmovdqu Y_DAT5,[inp5+IDX+I*32] + vmovdqu Y_DAT6,[inp6+IDX+I*32] + vmovdqu Y_DAT7,[inp7+IDX+I*32] + TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 + vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0 + vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1 + vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2 + vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3 + vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4 + vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5 + vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6 + vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7 + + ; Restore Y_A and Y_B + vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32] + vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32] + + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, 
[TBL+51*32], rot44 + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44 + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44 + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44 + +%assign I (I+1) + + ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2 + ; Therefore we need to save these to stack and restore after transpose + vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A + vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B + + vmovdqu Y_DAT0,[inp0+IDX+I*32] + vmovdqu Y_DAT1,[inp1+IDX+I*32] + vmovdqu Y_DAT2,[inp2+IDX+I*32] + vmovdqu Y_DAT3,[inp3+IDX+I*32] + vmovdqu Y_DAT4,[inp4+IDX+I*32] + vmovdqu Y_DAT5,[inp5+IDX+I*32] + vmovdqu Y_DAT6,[inp6+IDX+I*32] + vmovdqu Y_DAT7,[inp7+IDX+I*32] + TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, 
Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 + vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0 + vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1 + vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2 + vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3 + vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4 + vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5 + vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6 + vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7 + + ; Restore Y_A and Y_B + vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32] + vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32] + + ; Add results to old digest values + + vpaddd Y_A,Y_A,[Y_AA] + vpaddd Y_B,Y_B,[Y_BB] + vpaddd Y_C,Y_C,[Y_CC] + vpaddd Y_D,Y_D,[Y_DD] + + vpaddd Y_A2,Y_A2,[Y_AA2] + vpaddd Y_B2,Y_B2,[Y_BB2] + vpaddd Y_C2,Y_C2,[Y_CC2] + vpaddd Y_D2,Y_D2,[Y_DD2] + + ; Swap DPTR1 and DPTR2 + xchg DPTR1, DPTR2 + + ;; Proceed to processing of next block + jmp lloop + +lastblock: + + ; Perform the 64 rounds of processing ... + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14 + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14 + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11 + MD5_STEP 
MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14 + MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11 + MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12 + MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13 + MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14 + + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24 + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24 + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22 + 
MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24 + MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21 + MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22 + MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23 + MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24 + + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34 + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34 + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], 
rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34 + MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31 + MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32 + MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33 + MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34 + + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44 + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44 + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, 
[TBL+59*32], rot44 + MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41 + MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42 + MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43 + MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44 + + ;; update into data pointers +%assign I 0 +%rep 8 + mov inp0, [state + _data_ptr + (2*I)*8] + mov inp1, [state + _data_ptr + (2*I +1)*8] + add inp0, IDX + add inp1, IDX + mov [state + _data_ptr + (2*I)*8], inp0 + mov [state + _data_ptr + (2*I+1)*8], inp1 +%assign I (I+1) +%endrep + + vpaddd Y_A,Y_A,[Y_AA] + vpaddd Y_B,Y_B,[Y_BB] + vpaddd Y_C,Y_C,[Y_CC] + vpaddd Y_D,Y_D,[Y_DD] + + vpaddd Y_A2,Y_A2,[Y_AA2] + vpaddd Y_B2,Y_B2,[Y_BB2] + vpaddd Y_C2,Y_C2,[Y_CC2] + vpaddd Y_D2,Y_D2,[Y_DD2] + + + + vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE ],Y_A + vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE ],Y_B + vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE ],Y_C + vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE ],Y_D + + + vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 32 ],Y_A2 ;; 32 is YMM width + vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 32 ],Y_B2 + vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 32 ],Y_C2 + vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 32 ],Y_D2 + + + ;;;;;;;;;;;;;;;; + ;; Postamble + + + + mov rsp, [rsp + _RSP_SAVE] + + ret + +section .data +align 64 +MD5_TABLE: + dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478 + dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478 + dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756 + dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756 + dd 0x242070db, 0x242070db, 0x242070db, 0x242070db + dd 0x242070db, 0x242070db, 0x242070db, 0x242070db + dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee + dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee + dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 
0xf57c0faf + dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf + dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a + dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a + dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613 + dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613 + dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501 + dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501 + dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8 + dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8 + dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af + dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af + dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1 + dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1 + dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be + dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be + dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122 + dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122 + dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193 + dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193 + dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e + dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e + dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821 + dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821 + dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562 + dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562 + dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340 + dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340 + dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51 + dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51 + dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa + dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa + dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d + dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d + dd 0x02441453, 0x02441453, 0x02441453, 0x02441453 + dd 0x02441453, 0x02441453, 0x02441453, 0x02441453 + dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681 + dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681 + dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8 + dd 
0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8 + dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6 + dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6 + dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6 + dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6 + dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87 + dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87 + dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed + dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed + dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905 + dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905 + dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8 + dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8 + dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9 + dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9 + dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a + dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a + dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942 + dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942 + dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681 + dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681 + dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122 + dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122 + dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c + dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c + dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 + dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 + dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 + dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 + dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 + dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 + dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 + dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 + dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 + dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 + dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa + dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa + dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085 + dd 0xd4ef3085, 0xd4ef3085, 
0xd4ef3085, 0xd4ef3085 + dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 + dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 + dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 + dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 + dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 + dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 + dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 + dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 + dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 + dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 + dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 + dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 + dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 + dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 + dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 + dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 + dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 + dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 + dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 + dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 + dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 + dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 + dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d + dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d + dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 + dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 + dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f + dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f + dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 + dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 + dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 + dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 + dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 + dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 + dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 + dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 + dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 + dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb + dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb + dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 + dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 +ONES: dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff + dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm new file mode 100644 index 00000000..2c821285 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm @@ -0,0 +1,83 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 +%define WRT_OPT wrt ..plt +%else +%define WRT_OPT +%endif + +%include "reg_sizes.asm" +%include "multibinary.asm" +default rel +[bits 64] + +; declare the L3 ctx level symbols (these will then call the appropriate +; L2 symbols) +extern md5_ctx_mgr_init_sse +extern md5_ctx_mgr_submit_sse +extern md5_ctx_mgr_flush_sse + +extern md5_ctx_mgr_init_avx +extern md5_ctx_mgr_submit_avx +extern md5_ctx_mgr_flush_avx + +extern md5_ctx_mgr_init_avx2 +extern md5_ctx_mgr_submit_avx2 +extern md5_ctx_mgr_flush_avx2 + +%ifdef HAVE_AS_KNOWS_AVX512 + extern md5_ctx_mgr_init_avx512 + extern md5_ctx_mgr_submit_avx512 + extern md5_ctx_mgr_flush_avx512 +%endif + +;;; *_mbinit are initial values for *_dispatched; is updated on first call. +;;; Therefore, *_dispatch_init is only executed on first call. 
+ +; Initialise symbols +mbin_interface md5_ctx_mgr_init +mbin_interface md5_ctx_mgr_submit +mbin_interface md5_ctx_mgr_flush + +%ifdef HAVE_AS_KNOWS_AVX512 + ; Reuse mbin_dispatch_init6 through replacing base by sse version + mbin_dispatch_init6 md5_ctx_mgr_init, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_avx, md5_ctx_mgr_init_avx2, md5_ctx_mgr_init_avx512 + mbin_dispatch_init6 md5_ctx_mgr_submit, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_avx, md5_ctx_mgr_submit_avx2, md5_ctx_mgr_submit_avx512 + mbin_dispatch_init6 md5_ctx_mgr_flush, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_avx, md5_ctx_mgr_flush_avx2, md5_ctx_mgr_flush_avx512 +%else + mbin_dispatch_init md5_ctx_mgr_init, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_avx, md5_ctx_mgr_init_avx2 + mbin_dispatch_init md5_ctx_mgr_submit, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_avx, md5_ctx_mgr_submit_avx2 + mbin_dispatch_init md5_ctx_mgr_flush, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_avx, md5_ctx_mgr_flush_avx2 +%endif + +;; func core, ver, snum +slversion md5_ctx_mgr_init, 00, 03, 0189 +slversion md5_ctx_mgr_submit, 00, 03, 018a +slversion md5_ctx_mgr_flush, 00, 03, 018b diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c new file mode 100644 index 00000000..9cb1fd64 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c @@ -0,0 +1,193 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* (end of license header) ********************************************/

#include <stdint.h>
#include <string.h>

////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// Reference MD5 Functions
//
// Plain, portable C implementation of MD5 (RFC 1321).  The digest is
// kept as the four 32-bit chaining words A, B, C, D held as ordinary
// integers.
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////

void md5_single(const uint8_t * data, uint32_t digest[4]);

// Initial chaining values A, B, C, D.
#define H0 0x67452301
#define H1 0xefcdab89
#define H2 0x98badcfe
#define H3 0x10325476

enum {
	MD5_REF_BLOCK_BYTES = 64,	// size of one MD5 input block
	MD5_REF_LENGTH_BYTES = 8	// size of the trailing bit-length field
};

/**
 * Compute the MD5 digest of a complete message.
 *
 * @param input_data  message bytes (not modified); may have any alignment
 * @param digest      output, four 32-bit words (A, B, C, D)
 * @param len         message length in bytes
 */
void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len)
{
	uint8_t buf[2 * MD5_REF_BLOCK_BYTES];
	// Widen before scaling: 8 * len in 32 bits wraps for len >= 2^29.
	const uint64_t bit_len = (uint64_t) len << 3;
	uint32_t rem = len;
	uint32_t padded;
	uint32_t k;

	digest[0] = H0;
	digest[1] = H1;
	digest[2] = H2;
	digest[3] = H3;

	// Consume all whole blocks straight from the caller's buffer.
	while (rem >= MD5_REF_BLOCK_BYTES) {
		md5_single(input_data, digest);
		input_data += MD5_REF_BLOCK_BYTES;
		rem -= MD5_REF_BLOCK_BYTES;
	}
	// 0 <= rem < 64

	// Build the final one or two blocks: tail bytes, 0x80, zero fill,
	// then the 64-bit message length in bits.
	memset(buf, 0, sizeof(buf));
	if (rem != 0)
		memcpy(buf, input_data, rem);
	buf[rem] = 0x80;

	// The 0x80 marker plus the length field must fit; otherwise spill
	// into a second block.
	if (rem + 1 > MD5_REF_BLOCK_BYTES - MD5_REF_LENGTH_BYTES)
		padded = 2 * MD5_REF_BLOCK_BYTES;
	else
		padded = MD5_REF_BLOCK_BYTES;

	// Store the bit length least-significant byte first, independent of
	// the host's byte order.
	for (k = 0; k < MD5_REF_LENGTH_BYTES; k++)
		buf[padded - MD5_REF_LENGTH_BYTES + k] = (uint8_t) (bit_len >> (8 * k));

	md5_single(buf, digest);
	if (padded == 2 * MD5_REF_BLOCK_BYTES)
		md5_single(buf + MD5_REF_BLOCK_BYTES, digest);
}

// Round functions, in the reduced-operation forms.
#define F1(b,c,d) ((d) ^ ((b) & ((c) ^ (d))))
#define F2(b,c,d) ((c) ^ ((d) & ((b) ^ (c))))
#define F3(b,c,d) ((b) ^ (c) ^ (d))
#define F4(b,c,d) ((c) ^ ((b) | ~(d)))

// Rotate a 32-bit value left by r bits (0 < r < 32).
#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))

// One MD5 step.  i selects the round function, k is the additive
// constant, w the message word and r the rotation.  Updates a; f is
// scratch.
#define step(i,a,b,c,d,f,k,w,r) \
	do { \
		if ((i) < 16) { f = F1(b,c,d); } \
		else if ((i) < 32) { f = F2(b,c,d); } \
		else if ((i) < 48) { f = F3(b,c,d); } \
		else { f = F4(b,c,d); } \
		f = (a) + f + (k) + (w); \
		a = (b) + rol32(f, r); \
	} while (0)

// Read a 32-bit little-endian word from a byte pointer of any alignment.
static uint32_t md5_ref_load_le32(const uint8_t * p)
{
	return (uint32_t) p[0] | ((uint32_t) p[1] << 8) |
	    ((uint32_t) p[2] << 16) | ((uint32_t) p[3] << 24);
}

/**
 * Run the MD5 compression function over one 64-byte block.
 *
 * @param data    64 bytes of input; any alignment
 * @param digest  chaining words A, B, C, D; updated in place
 */
void md5_single(const uint8_t * data, uint32_t digest[4])
{
	uint32_t a, b, c, d;
	uint32_t f;
	uint32_t w[16];
	unsigned int t;

	// Decode the block byte-wise instead of casting the pointer, which
	// would be an aliasing/alignment violation and host-endian.
	for (t = 0; t < 16; t++)
		w[t] = md5_ref_load_le32(data + 4 * t);

	a = digest[0];
	b = digest[1];
	c = digest[2];
	d = digest[3];

	step(0, a, b, c, d, f, 0xd76aa478, w[0], 7);
	step(1, d, a, b, c, f, 0xe8c7b756, w[1], 12);
	step(2, c, d, a, b, f, 0x242070db, w[2], 17);
	step(3, b, c, d, a, f, 0xc1bdceee, w[3], 22);
	step(4, a, b, c, d, f, 0xf57c0faf, w[4], 7);
	step(5, d, a, b, c, f, 0x4787c62a, w[5], 12);
	step(6, c, d, a, b, f, 0xa8304613, w[6], 17);
	step(7, b, c, d, a, f, 0xfd469501, w[7], 22);
	step(8, a, b, c, d, f, 0x698098d8, w[8], 7);
	step(9, d, a, b, c, f, 0x8b44f7af, w[9], 12);
	step(10, c, d, a, b, f, 0xffff5bb1, w[10], 17);
	step(11, b, c, d, a, f, 0x895cd7be, w[11], 22);
	step(12, a, b, c, d, f, 0x6b901122, w[12], 7);
	step(13, d, a, b, c, f, 0xfd987193, w[13], 12);
	step(14, c, d, a, b, f, 0xa679438e, w[14], 17);
	step(15, b, c, d, a, f, 0x49b40821, w[15], 22);

	step(16, a, b, c, d, f, 0xf61e2562, w[1], 5);
	step(17, d, a, b, c, f, 0xc040b340, w[6], 9);
	step(18, c, d, a, b, f, 0x265e5a51, w[11], 14);
	step(19, b, c, d, a, f, 0xe9b6c7aa, w[0], 20);
	step(20, a, b, c, d, f, 0xd62f105d, w[5], 5);
	step(21, d, a, b, c, f, 0x02441453, w[10], 9);
	step(22, c, d, a, b, f, 0xd8a1e681, w[15], 14);
	step(23, b, c, d, a, f, 0xe7d3fbc8, w[4], 20);
	step(24, a, b, c, d, f, 0x21e1cde6, w[9], 5);
	step(25, d, a, b, c, f, 0xc33707d6, w[14], 9);
	step(26, c, d, a, b, f, 0xf4d50d87, w[3], 14);
	step(27, b, c, d, a, f, 0x455a14ed, w[8], 20);
	step(28, a, b, c, d, f, 0xa9e3e905, w[13], 5);
	step(29, d, a, b, c, f, 0xfcefa3f8, w[2], 9);
	step(30, c, d, a, b, f, 0x676f02d9, w[7], 14);
	step(31, b, c, d, a, f, 0x8d2a4c8a, w[12], 20);

	step(32, a, b, c, d, f, 0xfffa3942, w[5], 4);
	step(33, d, a, b, c, f, 0x8771f681, w[8], 11);
	step(34, c, d, a, b, f, 0x6d9d6122, w[11], 16);
	step(35, b, c, d, a, f, 0xfde5380c, w[14], 23);
	step(36, a, b, c, d, f, 0xa4beea44, w[1], 4);
	step(37, d, a, b, c, f, 0x4bdecfa9, w[4], 11);
	step(38, c, d, a, b, f, 0xf6bb4b60, w[7], 16);
	step(39, b, c, d, a, f, 0xbebfbc70, w[10], 23);
	step(40, a, b, c, d, f, 0x289b7ec6, w[13], 4);
	step(41, d, a, b, c, f, 0xeaa127fa, w[0], 11);
	step(42, c, d, a, b, f, 0xd4ef3085, w[3], 16);
	step(43, b, c, d, a, f, 0x04881d05, w[6], 23);
	step(44, a, b, c, d, f, 0xd9d4d039, w[9], 4);
	step(45, d, a, b, c, f, 0xe6db99e5, w[12], 11);
	step(46, c, d, a, b, f, 0x1fa27cf8, w[15], 16);
	step(47, b, c, d, a, f, 0xc4ac5665, w[2], 23);

	step(48, a, b, c, d, f, 0xf4292244, w[0], 6);
	step(49, d, a, b, c, f, 0x432aff97, w[7], 10);
	step(50, c, d, a, b, f, 0xab9423a7, w[14], 15);
	step(51, b, c, d, a, f, 0xfc93a039, w[5], 21);
	step(52, a, b, c, d, f, 0x655b59c3, w[12], 6);
	step(53, d, a, b, c, f, 0x8f0ccc92, w[3], 10);
	step(54, c, d, a, b, f, 0xffeff47d, w[10], 15);
	step(55, b, c, d, a, f, 0x85845dd1, w[1], 21);
	step(56, a, b, c, d, f, 0x6fa87e4f, w[8], 6);
	step(57, d, a, b, c, f, 0xfe2ce6e0, w[15], 10);
	step(58, c, d, a, b, f, 0xa3014314, w[6], 15);
	step(59, b, c, d, a, f, 0x4e0811a1, w[13], 21);
	step(60, a, b, c, d, f, 0xf7537e82, w[4], 6);
	step(61, d, a, b, c, f, 0xbd3af235, w[11], 10);
	step(62, c, d, a, b, f, 0x2ad7d2bb, w[2], 15);
	step(63, b, c, d, a, f, 0xeb86d391, w[9], 21);

	// Feed-forward: add this block's result into the chaining value.
	digest[0] += a;
	digest[1] += b;
	digest[2] += c;
	digest[3] += d;
}