diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 01:02:30 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 01:02:30 +0000 |
commit | 76cb841cb886eef6b3bee341a2266c76578724ad (patch) | |
tree | f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /arch/x86/crypto/sha1-mb | |
parent | Initial commit. (diff) | |
download | linux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip |
Adding upstream version 4.19.249.upstream/4.19.249
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/crypto/sha1-mb')
-rw-r--r-- | arch/x86/crypto/sha1-mb/Makefile | 14 | ||||
-rw-r--r-- | arch/x86/crypto/sha1-mb/sha1_mb.c | 1011 | ||||
-rw-r--r-- | arch/x86/crypto/sha1-mb/sha1_mb_ctx.h | 134 | ||||
-rw-r--r-- | arch/x86/crypto/sha1-mb/sha1_mb_mgr.h | 110 | ||||
-rw-r--r-- | arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S | 287 | ||||
-rw-r--r-- | arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S | 304 | ||||
-rw-r--r-- | arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c | 64 | ||||
-rw-r--r-- | arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S | 209 | ||||
-rw-r--r-- | arch/x86/crypto/sha1-mb/sha1_x8_avx2.S | 492 |
9 files changed, 2625 insertions, 0 deletions
diff --git a/arch/x86/crypto/sha1-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile new file mode 100644 index 000000000..815ded3ba --- /dev/null +++ b/arch/x86/crypto/sha1-mb/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Arch-specific CryptoAPI modules. +# + +OBJECT_FILES_NON_STANDARD := y + +avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ + $(comma)4)$(comma)%ymm2,yes,no) +ifeq ($(avx2_supported),yes) + obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb.o + sha1-mb-y := sha1_mb.o sha1_mb_mgr_flush_avx2.o \ + sha1_mb_mgr_init_avx2.o sha1_mb_mgr_submit_avx2.o sha1_x8_avx2.o +endif diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c new file mode 100644 index 000000000..b93805664 --- /dev/null +++ b/arch/x86/crypto/sha1-mb/sha1_mb.c @@ -0,0 +1,1011 @@ +/* + * Multi buffer SHA1 algorithm Glue Code + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <linux/list.h> +#include <crypto/scatterwalk.h> +#include <crypto/sha.h> +#include <crypto/mcryptd.h> +#include <crypto/crypto_wq.h> +#include <asm/byteorder.h> +#include <linux/hardirq.h> +#include <asm/fpu/api.h> +#include "sha1_mb_ctx.h" + +#define FLUSH_INTERVAL 1000 /* in usec */ + +static struct mcryptd_alg_state sha1_mb_alg_state; + +struct sha1_mb_ctx { + struct mcryptd_ahash *mcryptd_tfm; +}; + +static inline struct mcryptd_hash_request_ctx + *cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx) +{ + struct ahash_request *areq; + + areq = container_of((void *) hash_ctx, struct ahash_request, __ctx); + return container_of(areq, struct mcryptd_hash_request_ctx, areq); +} + +static inline struct ahash_request + *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx) +{ + return container_of((void *) ctx, struct ahash_request, __ctx); +} + +static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx, + struct ahash_request *areq) +{ + rctx->flag = HASH_UPDATE; +} + +static asmlinkage void (*sha1_job_mgr_init)(struct sha1_mb_mgr *state); +static asmlinkage struct job_sha1* (*sha1_job_mgr_submit) + (struct sha1_mb_mgr *state, struct job_sha1 *job); +static asmlinkage struct job_sha1* (*sha1_job_mgr_flush) + (struct sha1_mb_mgr *state); +static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job) + (struct sha1_mb_mgr *state); + +static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], + uint64_t total_len) +{ + uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1); + + memset(&padblock[i], 0, SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((SHA1_BLOCK_SIZE - 1) & + (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) &padblock[i - 16]) = 0; +#endif + + *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3); + + /* Number of extra blocks to hash */ + return i >> SHA1_LOG2_BLOCK_SIZE; +} + +static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, + struct sha1_hash_ctx *ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + /* Clear PROCESSING bit */ + ctx->status = HASH_CTX_STS_COMPLETE; + return ctx; + } + + /* + * If the extra blocks are empty, begin hashing what remains + * in the user's buffer. + */ + if (ctx->partial_block_buffer_length == 0 && + ctx->incoming_buffer_length) { + + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + uint32_t copy_len; + + /* + * Only entire blocks can be hashed. + * Copy remainder to extra blocks buffer. + */ + copy_len = len & (SHA1_BLOCK_SIZE-1); + + if (copy_len) { + len -= copy_len; + memcpy(ctx->partial_block_buffer, + ((const char *) buffer + len), + copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + /* len should be a multiple of the block size now */ + assert((len % SHA1_BLOCK_SIZE) == 0); + + /* Set len to the number of blocks to be hashed */ + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (struct sha1_hash_ctx *)sha1_job_mgr_submit(&mgr->mgr, + &ctx->job); + continue; + } + } + + /* + * If the extra blocks are not empty, then we are + * either on the last block(s) or we need more + * user input before continuing. + */ + if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = + sha1_pad(buf, ctx->total_length); + + ctx->status = (HASH_CTX_STS_PROCESSING | + HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (struct sha1_hash_ctx *) + sha1_job_mgr_submit(&mgr->mgr, &ctx->job); + continue; + } + + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static struct sha1_hash_ctx + *sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr) +{ + /* + * If get_comp_job returns NULL, there are no jobs complete. + * If get_comp_job returns a job, verify that it is safe to return to + * the user. + * If it is not ready, resubmit the job to finish processing. + * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + * Otherwise, all jobs currently being managed by the hash_ctx_mgr + * still need processing. + */ + struct sha1_hash_ctx *ctx; + + ctx = (struct sha1_hash_ctx *) sha1_job_mgr_get_comp_job(&mgr->mgr); + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +static void sha1_ctx_mgr_init(struct sha1_ctx_mgr *mgr) +{ + sha1_job_mgr_init(&mgr->mgr); +} + +static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr, + struct sha1_hash_ctx *ctx, + const void *buffer, + uint32_t len, + int flags) +{ + if (flags & ~(HASH_UPDATE | HASH_LAST)) { + /* User should not pass anything other than UPDATE or LAST */ + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + /* Cannot submit to a currently processing job. */ + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_COMPLETE) { + /* Cannot update a finished job. */ + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + /* + * If we made it here, there were no errors during this call to + * submit + */ + ctx->error = HASH_CTX_ERROR_NONE; + + /* Store buffer ptr info from user */ + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + /* + * Store the user's request flags and mark this ctx as currently + * being processed. + */ + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + /* Advance byte counter */ + ctx->total_length += len; + + /* + * If there is anything currently buffered in the extra blocks, + * append to it until it contains a whole block. + * Or if the user's buffer contains less than a whole block, + * append as much as possible to the extra block. + */ + if (ctx->partial_block_buffer_length || len < SHA1_BLOCK_SIZE) { + /* + * Compute how many bytes to copy from user buffer into + * extra block + */ + uint32_t copy_len = SHA1_BLOCK_SIZE - + ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + /* Copy and update relevant pointers and counters */ + memcpy(&ctx->partial_block_buffer[ctx->partial_block_buffer_length], + buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *) + ((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + + /* + * The extra block should never contain more than 1 block + * here + */ + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + /* + * If the extra block buffer contains exactly 1 block, it can + * be hashed. + */ + if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (struct sha1_hash_ctx *) + sha1_job_mgr_submit(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr) +{ + struct sha1_hash_ctx *ctx; + + while (1) { + ctx = (struct sha1_hash_ctx *) sha1_job_mgr_flush(&mgr->mgr); + + /* If flush returned 0, there are no more jobs in flight. */ + if (!ctx) + return NULL; + + /* + * If flush returned a job, resubmit the job to finish + * processing. + */ + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + /* + * If sha1_ctx_mgr_resubmit returned a job, it is ready to be + * returned. Otherwise, all jobs currently being managed by the + * sha1_ctx_mgr still need processing. Loop. + */ + if (ctx) + return ctx; + } +} + +static int sha1_mb_init(struct ahash_request *areq) +{ + struct sha1_hash_ctx *sctx = ahash_request_ctx(areq); + + hash_ctx_init(sctx); + sctx->job.result_digest[0] = SHA1_H0; + sctx->job.result_digest[1] = SHA1_H1; + sctx->job.result_digest[2] = SHA1_H2; + sctx->job.result_digest[3] = SHA1_H3; + sctx->job.result_digest[4] = SHA1_H4; + sctx->total_length = 0; + sctx->partial_block_buffer_length = 0; + sctx->status = HASH_CTX_STS_IDLE; + + return 0; +} + +static int sha1_mb_set_results(struct mcryptd_hash_request_ctx *rctx) +{ + int i; + struct sha1_hash_ctx *sctx = ahash_request_ctx(&rctx->areq); + __be32 *dst = (__be32 *) rctx->out; + + for (i = 0; i < 5; ++i) + dst[i] = cpu_to_be32(sctx->job.result_digest[i]); + + return 0; +} + +static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx, + struct mcryptd_alg_cstate *cstate, bool flush) +{ + int flag = HASH_UPDATE; + int nbytes, err = 0; + struct mcryptd_hash_request_ctx *rctx = *ret_rctx; + struct sha1_hash_ctx *sha_ctx; + + /* more work ? */ + while (!(rctx->flag & HASH_DONE)) { + nbytes = crypto_ahash_walk_done(&rctx->walk, 0); + if (nbytes < 0) { + err = nbytes; + goto out; + } + /* check if the walk is done */ + if (crypto_ahash_walk_last(&rctx->walk)) { + rctx->flag |= HASH_DONE; + if (rctx->flag & HASH_FINAL) + flag |= HASH_LAST; + + } + sha_ctx = (struct sha1_hash_ctx *) + ahash_request_ctx(&rctx->areq); + kernel_fpu_begin(); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, + rctx->walk.data, nbytes, flag); + if (!sha_ctx) { + if (flush) + sha_ctx = sha1_ctx_mgr_flush(cstate->mgr); + } + kernel_fpu_end(); + if (sha_ctx) + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + else { + rctx = NULL; + goto out; + } + } + + /* copy the results */ + if (rctx->flag & HASH_FINAL) + sha1_mb_set_results(rctx); + +out: + *ret_rctx = rctx; + return err; +} + +static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, + struct mcryptd_alg_cstate *cstate, + int err) +{ + struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx); + struct sha1_hash_ctx *sha_ctx; + struct mcryptd_hash_request_ctx *req_ctx; + int ret; + + /* remove from work list */ + spin_lock(&cstate->work_lock); + list_del(&rctx->waiter); + spin_unlock(&cstate->work_lock); + + if (irqs_disabled()) + rctx->complete(&req->base, err); + else { + local_bh_disable(); + rctx->complete(&req->base, err); + local_bh_enable(); + } + + /* check to see if there are other jobs that are done */ + sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr); + while (sha_ctx) { + req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&req_ctx, cstate, false); + if (req_ctx) { + spin_lock(&cstate->work_lock); + list_del(&req_ctx->waiter); + spin_unlock(&cstate->work_lock); + + req = cast_mcryptd_ctx_to_req(req_ctx); + if (irqs_disabled()) + req_ctx->complete(&req->base, ret); + else { + local_bh_disable(); + req_ctx->complete(&req->base, ret); + local_bh_enable(); + } + } + sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr); + } + + return 0; +} + +static void sha1_mb_add_list(struct mcryptd_hash_request_ctx *rctx, + struct mcryptd_alg_cstate *cstate) +{ + unsigned long next_flush; + unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL); + + /* initialize tag */ + rctx->tag.arrival = jiffies; /* tag the arrival time */ + rctx->tag.seq_num = cstate->next_seq_num++; + next_flush = rctx->tag.arrival + delay; + rctx->tag.expire = next_flush; + + spin_lock(&cstate->work_lock); + list_add_tail(&rctx->waiter, &cstate->work_list); + spin_unlock(&cstate->work_lock); + + mcryptd_arm_flusher(cstate, delay); +} + +static int sha1_mb_update(struct ahash_request *areq) +{ + struct mcryptd_hash_request_ctx *rctx = + container_of(areq, struct mcryptd_hash_request_ctx, areq); + struct mcryptd_alg_cstate *cstate = + this_cpu_ptr(sha1_mb_alg_state.alg_cstate); + + struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx); + struct sha1_hash_ctx *sha_ctx; + int ret = 0, nbytes; + + + /* sanity check */ + if (rctx->tag.cpu != smp_processor_id()) { + pr_err("mcryptd error: cpu clash\n"); + goto done; + } + + /* need to init context */ + req_ctx_init(rctx, areq); + + nbytes = crypto_ahash_walk_first(req, &rctx->walk); + + if (nbytes < 0) { + ret = nbytes; + goto done; + } + + if (crypto_ahash_walk_last(&rctx->walk)) + rctx->flag |= HASH_DONE; + + /* submit */ + sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq); + sha1_mb_add_list(rctx, cstate); + kernel_fpu_begin(); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + nbytes, HASH_UPDATE); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha1_mb_finup(struct ahash_request *areq) +{ + struct mcryptd_hash_request_ctx *rctx = + container_of(areq, struct mcryptd_hash_request_ctx, areq); + struct mcryptd_alg_cstate *cstate = + this_cpu_ptr(sha1_mb_alg_state.alg_cstate); + + struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx); + struct sha1_hash_ctx *sha_ctx; + int ret = 0, flag = HASH_UPDATE, nbytes; + + /* sanity check */ + if (rctx->tag.cpu != smp_processor_id()) { + pr_err("mcryptd error: cpu clash\n"); + goto done; + } + + /* need to init context */ + req_ctx_init(rctx, areq); + + nbytes = crypto_ahash_walk_first(req, &rctx->walk); + + if (nbytes < 0) { + ret = nbytes; + goto done; + } + + if (crypto_ahash_walk_last(&rctx->walk)) { + rctx->flag |= HASH_DONE; + flag = HASH_LAST; + } + + /* submit */ + rctx->flag |= HASH_FINAL; + sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq); + sha1_mb_add_list(rctx, cstate); + + kernel_fpu_begin(); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, + nbytes, flag); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha1_mb_final(struct ahash_request *areq) +{ + struct mcryptd_hash_request_ctx *rctx = + container_of(areq, struct mcryptd_hash_request_ctx, areq); + struct mcryptd_alg_cstate *cstate = + this_cpu_ptr(sha1_mb_alg_state.alg_cstate); + + struct sha1_hash_ctx *sha_ctx; + int ret = 0; + u8 data; + + /* sanity check */ + if (rctx->tag.cpu != smp_processor_id()) { + pr_err("mcryptd error: cpu clash\n"); + goto done; + } + + /* need to init context */ + req_ctx_init(rctx, areq); + + rctx->flag |= HASH_DONE | HASH_FINAL; + + sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq); + /* flag HASH_FINAL and 0 data size */ + sha1_mb_add_list(rctx, cstate); + kernel_fpu_begin(); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, + HASH_LAST); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha1_mb_export(struct ahash_request *areq, void *out) +{ + struct sha1_hash_ctx *sctx = ahash_request_ctx(areq); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha1_mb_import(struct ahash_request *areq, const void *in) +{ + struct sha1_hash_ctx *sctx = ahash_request_ctx(areq); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm) +{ + struct mcryptd_ahash *mcryptd_tfm; + struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); + struct mcryptd_hash_ctx *mctx; + + mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(mcryptd_tfm)) + return PTR_ERR(mcryptd_tfm); + mctx = crypto_ahash_ctx(&mcryptd_tfm->base); + mctx->alg_state = &sha1_mb_alg_state; + ctx->mcryptd_tfm = mcryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&mcryptd_tfm->base)); + + return 0; +} + +static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static int sha1_mb_areq_init_tfm(struct crypto_tfm *tfm) +{ + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + sizeof(struct sha1_hash_ctx)); + + return 0; +} + +static void sha1_mb_areq_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static struct ahash_alg sha1_mb_areq_alg = { + .init = sha1_mb_init, + .update = sha1_mb_update, + .final = sha1_mb_final, + .finup = sha1_mb_finup, + .export = sha1_mb_export, + .import = sha1_mb_import, + .halg = { + .digestsize = SHA1_DIGEST_SIZE, + .statesize = sizeof(struct sha1_hash_ctx), + .base = { + .cra_name = "__sha1-mb", + .cra_driver_name = "__intel_sha1-mb", + .cra_priority = 100, + /* + * use ASYNC flag as some buffers in multi-buffer + * algo may not have completed before hashing thread + * sleep + */ + .cra_flags = CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT + (sha1_mb_areq_alg.halg.base.cra_list), + .cra_init = sha1_mb_areq_init_tfm, + .cra_exit = sha1_mb_areq_exit_tfm, + .cra_ctxsize = sizeof(struct sha1_hash_ctx), + } + } +}; + +static int sha1_mb_async_init(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_init(mcryptd_req); +} + +static int sha1_mb_async_update(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_update(mcryptd_req); +} + +static int sha1_mb_async_finup(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_finup(mcryptd_req); +} + +static int sha1_mb_async_final(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_final(mcryptd_req); +} + +static int sha1_mb_async_digest(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_digest(mcryptd_req); +} + +static int sha1_mb_async_export(struct ahash_request *req, void *out) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_export(mcryptd_req, out); +} + +static int sha1_mb_async_import(struct ahash_request *req, const void *in) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm); + struct mcryptd_hash_request_ctx *rctx; + struct ahash_request *areq; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + rctx = ahash_request_ctx(mcryptd_req); + areq = &rctx->areq; + + ahash_request_set_tfm(areq, child); + ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP, + rctx->complete, req); + + return crypto_ahash_import(mcryptd_req, in); +} + +static struct ahash_alg sha1_mb_async_alg = { + .init = sha1_mb_async_init, + .update = sha1_mb_async_update, + .final = sha1_mb_async_final, + .finup = sha1_mb_async_finup, + .digest = sha1_mb_async_digest, + .export = sha1_mb_async_export, + .import = sha1_mb_async_import, + .halg = { + .digestsize = SHA1_DIGEST_SIZE, + .statesize = sizeof(struct sha1_hash_ctx), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1_mb", + /* + * Low priority, since with few concurrent hash requests + * this is extremely slow due to the flush delay. Users + * whose workloads would benefit from this can request + * it explicitly by driver name, or can increase its + * priority at runtime using NETLINK_CRYPTO. + */ + .cra_priority = 50, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(sha1_mb_async_alg.halg.base.cra_list), + .cra_init = sha1_mb_async_init_tfm, + .cra_exit = sha1_mb_async_exit_tfm, + .cra_ctxsize = sizeof(struct sha1_mb_ctx), + .cra_alignmask = 0, + }, + }, +}; + +static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate) +{ + struct mcryptd_hash_request_ctx *rctx; + unsigned long cur_time; + unsigned long next_flush = 0; + struct sha1_hash_ctx *sha_ctx; + + + cur_time = jiffies; + + while (!list_empty(&cstate->work_list)) { + rctx = list_entry(cstate->work_list.next, + struct mcryptd_hash_request_ctx, waiter); + if (time_before(cur_time, rctx->tag.expire)) + break; + kernel_fpu_begin(); + sha_ctx = (struct sha1_hash_ctx *) + sha1_ctx_mgr_flush(cstate->mgr); + kernel_fpu_end(); + if (!sha_ctx) { + pr_err("sha1_mb error: nothing got flushed for non-empty list\n"); + break; + } + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + sha_finish_walk(&rctx, cstate, true); + sha_complete_job(rctx, cstate, 0); + } + + if (!list_empty(&cstate->work_list)) { + rctx = list_entry(cstate->work_list.next, + struct mcryptd_hash_request_ctx, waiter); + /* get the hash context and then flush time */ + next_flush = rctx->tag.expire; + mcryptd_arm_flusher(cstate, get_delay(next_flush)); + } + return next_flush; +} + +static int __init sha1_mb_mod_init(void) +{ + + int cpu; + int err; + struct mcryptd_alg_cstate *cpu_state; + + /* check for dependent cpu features */ + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_BMI2)) + return -ENODEV; + + /* initialize multibuffer structures */ + sha1_mb_alg_state.alg_cstate = alloc_percpu(struct mcryptd_alg_cstate); + + sha1_job_mgr_init = sha1_mb_mgr_init_avx2; + sha1_job_mgr_submit = sha1_mb_mgr_submit_avx2; + sha1_job_mgr_flush = sha1_mb_mgr_flush_avx2; + sha1_job_mgr_get_comp_job = sha1_mb_mgr_get_comp_job_avx2; + + if (!sha1_mb_alg_state.alg_cstate) + return -ENOMEM; + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu); + cpu_state->next_flush = 0; + cpu_state->next_seq_num = 0; + cpu_state->flusher_engaged = false; + INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher); + cpu_state->cpu = cpu; + cpu_state->alg_state = &sha1_mb_alg_state; + cpu_state->mgr = kzalloc(sizeof(struct sha1_ctx_mgr), + GFP_KERNEL); + if (!cpu_state->mgr) + goto err2; + sha1_ctx_mgr_init(cpu_state->mgr); + INIT_LIST_HEAD(&cpu_state->work_list); + spin_lock_init(&cpu_state->work_lock); + } + sha1_mb_alg_state.flusher = &sha1_mb_flusher; + + err = crypto_register_ahash(&sha1_mb_areq_alg); + if (err) + goto err2; + err = crypto_register_ahash(&sha1_mb_async_alg); + if (err) + goto err1; + + + return 0; +err1: + crypto_unregister_ahash(&sha1_mb_areq_alg); +err2: + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu); + kfree(cpu_state->mgr); + } + free_percpu(sha1_mb_alg_state.alg_cstate); + return -ENODEV; +} + +static void __exit sha1_mb_mod_fini(void) +{ + int cpu; + struct mcryptd_alg_cstate *cpu_state; + + crypto_unregister_ahash(&sha1_mb_async_alg); + crypto_unregister_ahash(&sha1_mb_areq_alg); + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu); + kfree(cpu_state->mgr); + } + free_percpu(sha1_mb_alg_state.alg_cstate); +} + +module_init(sha1_mb_mod_init); +module_exit(sha1_mb_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, multi buffer accelerated"); + +MODULE_ALIAS_CRYPTO("sha1"); diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h new file mode 100644 index 000000000..9454bd16f --- /dev/null +++ b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h @@ -0,0 +1,134 @@ +/* + * Header file for multi buffer SHA context + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SHA_MB_CTX_INTERNAL_H +#define _SHA_MB_CTX_INTERNAL_H + +#include "sha1_mb_mgr.h" + +#define HASH_UPDATE 0x00 +#define HASH_LAST 0x01 +#define HASH_DONE 0x02 +#define HASH_FINAL 0x04 + +#define HASH_CTX_STS_IDLE 0x00 +#define HASH_CTX_STS_PROCESSING 0x01 +#define HASH_CTX_STS_LAST 0x02 +#define HASH_CTX_STS_COMPLETE 0x04 + +enum hash_ctx_error { + HASH_CTX_ERROR_NONE = 0, + HASH_CTX_ERROR_INVALID_FLAGS = -1, + HASH_CTX_ERROR_ALREADY_PROCESSING = -2, + HASH_CTX_ERROR_ALREADY_COMPLETED = -3, + +#ifdef HASH_CTX_DEBUG + HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4, +#endif +}; + + +#define hash_ctx_user_data(ctx) ((ctx)->user_data) +#define hash_ctx_digest(ctx) ((ctx)->job.result_digest) +#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING) +#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE) +#define hash_ctx_status(ctx) ((ctx)->status) +#define hash_ctx_error(ctx) ((ctx)->error) +#define hash_ctx_init(ctx) \ + do { \ + (ctx)->error = HASH_CTX_ERROR_NONE; \ + (ctx)->status = HASH_CTX_STS_COMPLETE; \ + } while (0) + + +/* Hash Constants and Typedefs */ +#define SHA1_DIGEST_LENGTH 5 +#define SHA1_LOG2_BLOCK_SIZE 6 + +#define SHA1_PADLENGTHFIELD_SIZE 8 + +#ifdef SHA_MB_DEBUG +#define assert(expr) \ +do { \ + if (unlikely(!(expr))) { \ + printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \ + #expr, __FILE__, __func__, __LINE__); \ + } \ +} while (0) +#else +#define assert(expr) do {} while (0) +#endif + +struct sha1_ctx_mgr { + struct sha1_mb_mgr mgr; +}; + +/* typedef struct sha1_ctx_mgr sha1_ctx_mgr; */ + +struct sha1_hash_ctx { + /* Must be at struct offset 0 */ + struct job_sha1 job; + /* status flag */ + int status; + /* error flag */ + int error; + + uint64_t total_length; + const void *incoming_buffer; + uint32_t incoming_buffer_length; + uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2]; + uint32_t partial_block_buffer_length; + void *user_data; +}; + +#endif diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h new file mode 100644 index 000000000..08ad1a9ac --- /dev/null +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h @@ -0,0 +1,110 @@ +/* + * Header file for multi buffer SHA1 algorithm manager + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * James Guilford <james.guilford@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef __SHA_MB_MGR_H +#define __SHA_MB_MGR_H + + +#include <linux/types.h> + +#define NUM_SHA1_DIGEST_WORDS 5 + +enum job_sts { STS_UNKNOWN = 0, + STS_BEING_PROCESSED = 1, + STS_COMPLETED = 2, + STS_INTERNAL_ERROR = 3, + STS_ERROR = 4 +}; + +struct job_sha1 { + u8 *buffer; + u32 len; + u32 result_digest[NUM_SHA1_DIGEST_WORDS] __aligned(32); + enum job_sts status; + void *user_data; +}; + +/* SHA1 out-of-order scheduler */ + +/* typedef uint32_t sha1_digest_array[5][8]; */ + +struct sha1_args_x8 { + uint32_t digest[5][8]; + uint8_t *data_ptr[8]; +}; + +struct sha1_lane_data { + struct job_sha1 *job_in_lane; +}; + +struct sha1_mb_mgr { + struct sha1_args_x8 args; + + uint32_t lens[8]; + + /* each byte is index (0...7) of unused lanes */ + uint64_t unused_lanes; + /* byte 4 is set to FF as a flag */ + struct sha1_lane_data ldata[8]; +}; + + +#define SHA1_MB_MGR_NUM_LANES_AVX2 8 + +void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state); +struct job_sha1 *sha1_mb_mgr_submit_avx2(struct sha1_mb_mgr *state, + struct job_sha1 *job); +struct job_sha1 *sha1_mb_mgr_flush_avx2(struct sha1_mb_mgr *state); +struct job_sha1 *sha1_mb_mgr_get_comp_job_avx2(struct sha1_mb_mgr *state); + +#endif diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S new file mode 100644 index 000000000..86688c6e7 --- /dev/null +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S @@ -0,0 +1,287 @@ +/* + * Header file for multi buffer SHA1 algorithm data structure + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * James Guilford <james.guilford@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +# Macros for defining data structures + +# Usage example + +#START_FIELDS # JOB_AES +### name size align +#FIELD _plaintext, 8, 8 # pointer to plaintext +#FIELD _ciphertext, 8, 8 # pointer to ciphertext +#FIELD _IV, 16, 8 # IV +#FIELD _keys, 8, 8 # pointer to keys +#FIELD _len, 4, 4 # length in bytes +#FIELD _status, 4, 4 # status enumeration +#FIELD _user_data, 8, 8 # pointer to user data +#UNION _union, size1, align1, \ +# size2, align2, \ +# size3, align3, \ +# ... +#END_FIELDS +#%assign _JOB_AES_size _FIELD_OFFSET +#%assign _JOB_AES_align _STRUCT_ALIGN + +######################################################################### + +# Alternate "struc-like" syntax: +# STRUCT job_aes2 +# RES_Q .plaintext, 1 +# RES_Q .ciphertext, 1 +# RES_DQ .IV, 1 +# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN +# RES_U .union, size1, align1, \ +# size2, align2, \ +# ... +# ENDSTRUCT +# # Following only needed if nesting +# %assign job_aes2_size _FIELD_OFFSET +# %assign job_aes2_align _STRUCT_ALIGN +# +# RES_* macros take a name, a count and an optional alignment. +# The count in in terms of the base size of the macro, and the +# default alignment is the base size. +# The macros are: +# Macro Base size +# RES_B 1 +# RES_W 2 +# RES_D 4 +# RES_Q 8 +# RES_DQ 16 +# RES_Y 32 +# RES_Z 64 +# +# RES_U defines a union. It's arguments are a name and two or more +# pairs of "size, alignment" +# +# The two assigns are only needed if this structure is being nested +# within another. Even if the assigns are not done, one can still use +# STRUCT_NAME_size as the size of the structure. +# +# Note that for nesting, you still need to assign to STRUCT_NAME_size. +# +# The differences between this and using "struc" directly are that each +# type is implicitly aligned to its natural length (although this can be +# over-ridden with an explicit third parameter), and that the structure +# is padded at the end to its overall alignment. +# + +######################################################################### + +#ifndef _SHA1_MB_MGR_DATASTRUCT_ASM_ +#define _SHA1_MB_MGR_DATASTRUCT_ASM_ + +## START_FIELDS +.macro START_FIELDS + _FIELD_OFFSET = 0 + _STRUCT_ALIGN = 0 +.endm + +## FIELD name size align +.macro FIELD name size align + _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1)) + \name = _FIELD_OFFSET + _FIELD_OFFSET = _FIELD_OFFSET + (\size) +.if (\align > _STRUCT_ALIGN) + _STRUCT_ALIGN = \align +.endif +.endm + +## END_FIELDS +.macro END_FIELDS + _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1)) +.endm + +######################################################################## + +.macro STRUCT p1 +START_FIELDS +.struc \p1 +.endm + +.macro ENDSTRUCT + tmp = _FIELD_OFFSET + END_FIELDS + tmp = (_FIELD_OFFSET - %%tmp) +.if (tmp > 0) + .lcomm tmp +.endif +.endstruc +.endm + +## RES_int name size align +.macro RES_int p1 p2 p3 + name = \p1 + size = \p2 + align = .\p3 + + _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1)) +.align align +.lcomm name size + _FIELD_OFFSET = _FIELD_OFFSET + (size) +.if (align > _STRUCT_ALIGN) + _STRUCT_ALIGN = align +.endif +.endm + + + +# macro RES_B name, size [, align] +.macro RES_B _name, _size, _align=1 +RES_int _name _size _align +.endm + +# macro RES_W name, size [, align] +.macro RES_W _name, _size, _align=2 +RES_int _name 2*(_size) _align +.endm + +# macro RES_D name, size [, align] +.macro RES_D _name, _size, _align=4 +RES_int _name 4*(_size) _align +.endm + +# macro RES_Q name, size [, align] +.macro RES_Q _name, _size, _align=8 +RES_int _name 8*(_size) _align +.endm + +# macro RES_DQ name, size [, align] +.macro RES_DQ _name, _size, _align=16 +RES_int _name 16*(_size) _align +.endm + +# macro RES_Y name, size [, align] +.macro RES_Y _name, _size, _align=32 +RES_int _name 32*(_size) _align +.endm + +# macro RES_Z name, size [, align] +.macro RES_Z _name, _size, _align=64 +RES_int _name 64*(_size) _align +.endm + + +#endif + +######################################################################## +#### Define constants +######################################################################## + +######################################################################## +#### Define SHA1 Out Of Order Data Structures +######################################################################## + +START_FIELDS # LANE_DATA +### name size align +FIELD _job_in_lane, 8, 8 # pointer to job object +END_FIELDS + +_LANE_DATA_size = _FIELD_OFFSET +_LANE_DATA_align = _STRUCT_ALIGN + +######################################################################## + +START_FIELDS # SHA1_ARGS_X8 +### name size align +FIELD _digest, 4*5*8, 16 # transposed digest +FIELD _data_ptr, 8*8, 8 # array of pointers to data +END_FIELDS + +_SHA1_ARGS_X4_size = _FIELD_OFFSET +_SHA1_ARGS_X4_align = _STRUCT_ALIGN +_SHA1_ARGS_X8_size = _FIELD_OFFSET +_SHA1_ARGS_X8_align = _STRUCT_ALIGN + +######################################################################## + +START_FIELDS # MB_MGR +### name size align +FIELD _args, _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align +FIELD _lens, 4*8, 8 +FIELD _unused_lanes, 8, 8 +FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align +END_FIELDS + +_MB_MGR_size = _FIELD_OFFSET +_MB_MGR_align = _STRUCT_ALIGN + +_args_digest = _args + _digest +_args_data_ptr = _args + _data_ptr + + +######################################################################## +#### Define constants +######################################################################## + +#define STS_UNKNOWN 0 +#define STS_BEING_PROCESSED 1 +#define STS_COMPLETED 2 + +######################################################################## +#### Define JOB_SHA1 structure +######################################################################## + +START_FIELDS # JOB_SHA1 + +### name size align +FIELD _buffer, 8, 8 # pointer to buffer +FIELD _len, 4, 4 # length in bytes +FIELD _result_digest, 5*4, 32 # Digest (output) +FIELD _status, 4, 4 +FIELD _user_data, 8, 8 +END_FIELDS + +_JOB_SHA1_size = _FIELD_OFFSET +_JOB_SHA1_align = _STRUCT_ALIGN diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S new file mode 100644 index 000000000..7cfba738f --- /dev/null +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S @@ -0,0 +1,304 @@ +/* + * Flush routine for SHA1 multibuffer + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * James Guilford <james.guilford@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <linux/linkage.h> +#include <asm/frame.h> +#include "sha1_mb_mgr_datastruct.S" + + +.extern sha1_x8_avx2 + +# LINUX register definitions +#define arg1 %rdi +#define arg2 %rsi + +# Common definitions +#define state arg1 +#define job arg2 +#define len2 arg2 + +# idx must be a register not clobbered by sha1_x8_avx2 +#define idx %r8 +#define DWORD_idx %r8d + +#define unused_lanes %rbx +#define lane_data %rbx +#define tmp2 %rbx +#define tmp2_w %ebx + +#define job_rax %rax +#define tmp1 %rax +#define size_offset %rax +#define tmp %rax +#define start_offset %rax + +#define tmp3 %arg1 + +#define extra_blocks %arg2 +#define p %arg2 + +.macro LABEL prefix n +\prefix\n\(): +.endm + +.macro JNE_SKIP i +jne skip_\i +.endm + +.altmacro +.macro SET_OFFSET _offset +offset = \_offset +.endm +.noaltmacro + +# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state) +# arg 1 : rcx : state +ENTRY(sha1_mb_mgr_flush_avx2) + FRAME_BEGIN + push %rbx + + # If bit (32+3) is set, then all lanes are empty + mov _unused_lanes(state), unused_lanes + bt $32+3, unused_lanes + jc return_null + + # find a lane with a non-null job + xor idx, idx + offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne one(%rip), idx + offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne two(%rip), idx + offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne three(%rip), idx + offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne four(%rip), idx + offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne five(%rip), idx + offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne six(%rip), idx + offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne seven(%rip), idx + + # copy idx to empty lanes +copy_lane_data: + offset = (_args + _data_ptr) + mov offset(state,idx,8), tmp + + I = 0 +.rep 8 + offset = (_ldata + I * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) +.altmacro + JNE_SKIP %I + offset = (_args + _data_ptr + 8*I) + mov tmp, offset(state) + offset = (_lens + 4*I) + movl $0xFFFFFFFF, offset(state) +LABEL skip_ %I + I = (I+1) +.noaltmacro +.endr + + # Find min length + vmovdqu _lens+0*16(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 + + vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} + vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F} + vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword + + vmovd %xmm2, DWORD_idx + mov idx, len2 + and $0xF, idx + shr $4, len2 + jz len_is_0 + + vpand clear_low_nibble(%rip), %xmm2, %xmm2 + vpshufd $0, %xmm2, %xmm2 + + vpsubd %xmm2, %xmm0, %xmm0 + vpsubd %xmm2, %xmm1, %xmm1 + + vmovdqu %xmm0, _lens+0*16(state) + vmovdqu %xmm1, _lens+1*16(state) + + # "state" and "args" are the same address, arg1 + # len is arg2 + call sha1_x8_avx2 + # state and idx are intact + + +len_is_0: + # process completed job "idx" + imul $_LANE_DATA_size, idx, lane_data + lea _ldata(state, lane_data), lane_data + + mov _job_in_lane(lane_data), job_rax + movq $0, _job_in_lane(lane_data) + movl $STS_COMPLETED, _status(job_rax) + mov _unused_lanes(state), unused_lanes + shl $4, unused_lanes + or idx, unused_lanes + mov unused_lanes, _unused_lanes(state) + + movl $0xFFFFFFFF, _lens(state, idx, 4) + + vmovd _args_digest(state , idx, 4) , %xmm0 + vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0 + vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0 + vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0 + movl _args_digest+4*32(state, idx, 4), tmp2_w + + vmovdqu %xmm0, _result_digest(job_rax) + offset = (_result_digest + 1*16) + mov tmp2_w, offset(job_rax) + +return: + pop %rbx + FRAME_END + ret + +return_null: + xor job_rax, job_rax + jmp return +ENDPROC(sha1_mb_mgr_flush_avx2) + + +################################################################# + +.align 16 +ENTRY(sha1_mb_mgr_get_comp_job_avx2) + push %rbx + + ## if bit 32+3 is set, then all lanes are empty + mov _unused_lanes(state), unused_lanes + bt $(32+3), unused_lanes + jc .return_null + + # Find min length + vmovdqu _lens(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 + + vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} + vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F} + vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword + + vmovd %xmm2, DWORD_idx + test $~0xF, idx + jnz .return_null + + # process completed job "idx" + imul $_LANE_DATA_size, idx, lane_data + lea _ldata(state, lane_data), lane_data + + mov _job_in_lane(lane_data), job_rax + movq $0, _job_in_lane(lane_data) + movl $STS_COMPLETED, _status(job_rax) + mov _unused_lanes(state), unused_lanes + shl $4, unused_lanes + or idx, unused_lanes + mov unused_lanes, _unused_lanes(state) + + movl $0xFFFFFFFF, _lens(state, idx, 4) + + vmovd _args_digest(state, idx, 4), %xmm0 + vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0 + vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0 + vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0 + movl _args_digest+4*32(state, idx, 4), tmp2_w + + vmovdqu %xmm0, _result_digest(job_rax) + movl tmp2_w, _result_digest+1*16(job_rax) + + pop %rbx + + ret + +.return_null: + xor job_rax, job_rax + pop %rbx + ret +ENDPROC(sha1_mb_mgr_get_comp_job_avx2) + +.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16 +.align 16 +clear_low_nibble: +.octa 0x000000000000000000000000FFFFFFF0 + +.section .rodata.cst8, "aM", @progbits, 8 +.align 8 +one: +.quad 1 +two: +.quad 2 +three: +.quad 3 +four: +.quad 4 +five: +.quad 5 +six: +.quad 6 +seven: +.quad 7 diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c new file mode 100644 index 000000000..d2add0d35 --- /dev/null +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c @@ -0,0 +1,64 @@ +/* + * Initialization code for multi buffer SHA1 algorithm for AVX2 + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "sha1_mb_mgr.h" + +void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state) +{ + unsigned int j; + state->unused_lanes = 0xF76543210ULL; + for (j = 0; j < 8; j++) { + state->lens[j] = 0xFFFFFFFF; + state->ldata[j].job_in_lane = NULL; + } +} diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S new file mode 100644 index 000000000..7a93b1c0d --- /dev/null +++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S @@ -0,0 +1,209 @@ +/* + * Buffer submit code for multi buffer SHA1 algorithm + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * James Guilford <james.guilford@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> +#include "sha1_mb_mgr_datastruct.S" + + +.extern sha1_x8_avx + +# LINUX register definitions +arg1 = %rdi +arg2 = %rsi +size_offset = %rcx +tmp2 = %rcx +extra_blocks = %rdx + +# Common definitions +#define state arg1 +#define job %rsi +#define len2 arg2 +#define p2 arg2 + +# idx must be a register not clobberred by sha1_x8_avx2 +idx = %r8 +DWORD_idx = %r8d +last_len = %r8 + +p = %r11 +start_offset = %r11 + +unused_lanes = %rbx +BYTE_unused_lanes = %bl + +job_rax = %rax +len = %rax +DWORD_len = %eax + +lane = %r12 +tmp3 = %r12 + +tmp = %r9 +DWORD_tmp = %r9d + +lane_data = %r10 + +# JOB* submit_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job) +# arg 1 : rcx : state +# arg 2 : rdx : job +ENTRY(sha1_mb_mgr_submit_avx2) + FRAME_BEGIN + push %rbx + push %r12 + + mov _unused_lanes(state), unused_lanes + mov unused_lanes, lane + and $0xF, lane + shr $4, unused_lanes + imul $_LANE_DATA_size, lane, lane_data + movl $STS_BEING_PROCESSED, _status(job) + lea _ldata(state, lane_data), lane_data + mov unused_lanes, _unused_lanes(state) + movl _len(job), DWORD_len + + mov job, _job_in_lane(lane_data) + shl $4, len + or lane, len + + movl DWORD_len, _lens(state , lane, 4) + + # Load digest words from result_digest + vmovdqu _result_digest(job), %xmm0 + mov _result_digest+1*16(job), DWORD_tmp + vmovd %xmm0, _args_digest(state, lane, 4) + vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4) + vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4) + vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4) + movl DWORD_tmp, _args_digest+4*32(state , lane, 4) + + mov _buffer(job), p + mov p, _args_data_ptr(state, lane, 8) + + cmp $0xF, unused_lanes + jne return_null + +start_loop: + # Find min length + vmovdqa _lens(state), %xmm0 + vmovdqa _lens+1*16(state), %xmm1 + + vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} + vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F} + vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword + + vmovd %xmm2, DWORD_idx + mov idx, len2 + and $0xF, idx + shr $4, len2 + jz len_is_0 + + vpand clear_low_nibble(%rip), %xmm2, %xmm2 + vpshufd $0, %xmm2, %xmm2 + + vpsubd %xmm2, %xmm0, %xmm0 + vpsubd %xmm2, %xmm1, %xmm1 + + vmovdqa %xmm0, _lens + 0*16(state) + vmovdqa %xmm1, _lens + 1*16(state) + + + # "state" and "args" are the same address, arg1 + # len is arg2 + call sha1_x8_avx2 + + # state and idx are intact + +len_is_0: + # process completed job "idx" + imul $_LANE_DATA_size, idx, lane_data + lea _ldata(state, lane_data), lane_data + + mov _job_in_lane(lane_data), job_rax + mov _unused_lanes(state), unused_lanes + movq $0, _job_in_lane(lane_data) + movl $STS_COMPLETED, _status(job_rax) + shl $4, unused_lanes + or idx, unused_lanes + mov unused_lanes, _unused_lanes(state) + + movl $0xFFFFFFFF, _lens(state, idx, 4) + + vmovd _args_digest(state, idx, 4), %xmm0 + vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0 + vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0 + vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0 + movl _args_digest+4*32(state, idx, 4), DWORD_tmp + + vmovdqu %xmm0, _result_digest(job_rax) + movl DWORD_tmp, _result_digest+1*16(job_rax) + +return: + pop %r12 + pop %rbx + FRAME_END + ret + +return_null: + xor job_rax, job_rax + jmp return + +ENDPROC(sha1_mb_mgr_submit_avx2) + +.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16 +.align 16 +clear_low_nibble: + .octa 0x000000000000000000000000FFFFFFF0 diff --git a/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S new file mode 100644 index 000000000..20f77aa63 --- /dev/null +++ b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S @@ -0,0 +1,492 @@ +/* + * Multi-buffer SHA1 algorithm hash compute routine + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * James Guilford <james.guilford@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/linkage.h> +#include "sha1_mb_mgr_datastruct.S" + +## code to compute oct SHA1 using SSE-256 +## outer calling routine takes care of save and restore of XMM registers + +## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15# ymm0-15 +## +## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 +## Linux preserves: rdi rbp r8 +## +## clobbers ymm0-15 + + +# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 +# "transpose" data in {r0...r7} using temps {t0...t1} +# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +# r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +# r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +# r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +# r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +# r4 = {e7 e6 e5 e4 e3 e2 e1 e0} +# r5 = {f7 f6 f5 f4 f3 f2 f1 f0} +# r6 = {g7 g6 g5 g4 g3 g2 g1 g0} +# r7 = {h7 h6 h5 h4 h3 h2 h1 h0} +# +# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +# r0 = {h0 g0 f0 e0 d0 c0 b0 a0} +# r1 = {h1 g1 f1 e1 d1 c1 b1 a1} +# r2 = {h2 g2 f2 e2 d2 c2 b2 a2} +# r3 = {h3 g3 f3 e3 d3 c3 b3 a3} +# r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +# r5 = {h5 g5 f5 e5 d5 c5 b5 a5} +# r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +# r7 = {h7 g7 f7 e7 d7 c7 b7 a7} +# + +.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1 + # process top half (r0..r3) {a...d} + vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1} + vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2} + vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3} + vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0} + + # use r2 in place of t0 + # process bottom half (r4..r7) {e...h} + vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0} + vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2} + vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0} + vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2} + vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1} + vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2} + vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3} + vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0} + + vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6 + vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2 + vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5 + vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1 + vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7 + vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3 + vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4 + vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0 + +.endm +## +## Magic functions defined in FIPS 180-1 +## +# macro MAGIC_F0 F,B,C,D,T ## F = (D ^ (B & (C ^ D))) +.macro MAGIC_F0 regF regB regC regD regT + vpxor \regD, \regC, \regF + vpand \regB, \regF, \regF + vpxor \regD, \regF, \regF +.endm + +# macro MAGIC_F1 F,B,C,D,T ## F = (B ^ C ^ D) +.macro MAGIC_F1 regF regB regC regD regT + vpxor \regC, \regD, \regF + vpxor \regB, \regF, \regF +.endm + +# macro MAGIC_F2 F,B,C,D,T ## F = ((B & C) | (B & D) | (C & D)) +.macro MAGIC_F2 regF regB regC regD regT + vpor \regC, \regB, \regF + vpand \regC, \regB, \regT + vpand \regD, \regF, \regF + vpor \regT, \regF, \regF +.endm + +# macro MAGIC_F3 F,B,C,D,T ## F = (B ^ C ^ D) +.macro MAGIC_F3 regF regB regC regD regT + MAGIC_F1 \regF,\regB,\regC,\regD,\regT +.endm + +# PROLD reg, imm, tmp +.macro PROLD reg imm tmp + vpsrld $(32-\imm), \reg, \tmp + vpslld $\imm, \reg, \reg + vpor \tmp, \reg, \reg +.endm + +.macro PROLD_nd reg imm tmp src + vpsrld $(32-\imm), \src, \tmp + vpslld $\imm, \src, \reg + vpor \tmp, \reg, \reg +.endm + +.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC + vpaddd \immCNT, \regE, \regE + vpaddd \memW*32(%rsp), \regE, \regE + PROLD_nd \regT, 5, \regF, \regA + vpaddd \regT, \regE, \regE + \MAGIC \regF, \regB, \regC, \regD, \regT + PROLD \regB, 30, \regT + vpaddd \regF, \regE, \regE +.endm + +.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC + vpaddd \immCNT, \regE, \regE + offset = ((\memW - 14) & 15) * 32 + vmovdqu offset(%rsp), W14 + vpxor W14, W16, W16 + offset = ((\memW - 8) & 15) * 32 + vpxor offset(%rsp), W16, W16 + offset = ((\memW - 3) & 15) * 32 + vpxor offset(%rsp), W16, W16 + vpsrld $(32-1), W16, \regF + vpslld $1, W16, W16 + vpor W16, \regF, \regF + + ROTATE_W + + offset = ((\memW - 0) & 15) * 32 + vmovdqu \regF, offset(%rsp) + vpaddd \regF, \regE, \regE + PROLD_nd \regT, 5, \regF, \regA + vpaddd \regT, \regE, \regE + \MAGIC \regF,\regB,\regC,\regD,\regT ## FUN = MAGIC_Fi(B,C,D) + PROLD \regB,30, \regT + vpaddd \regF, \regE, \regE +.endm + +######################################################################## +######################################################################## +######################################################################## + +## FRAMESZ plus pushes must be an odd multiple of 8 +YMM_SAVE = (15-15)*32 +FRAMESZ = 32*16 + YMM_SAVE +_YMM = FRAMESZ - YMM_SAVE + +#define VMOVPS vmovups + +IDX = %rax +inp0 = %r9 +inp1 = %r10 +inp2 = %r11 +inp3 = %r12 +inp4 = %r13 +inp5 = %r14 +inp6 = %r15 +inp7 = %rcx +arg1 = %rdi +arg2 = %rsi +RSP_SAVE = %rdx + +# ymm0 A +# ymm1 B +# ymm2 C +# ymm3 D +# ymm4 E +# ymm5 F AA +# ymm6 T0 BB +# ymm7 T1 CC +# ymm8 T2 DD +# ymm9 T3 EE +# ymm10 T4 TMP +# ymm11 T5 FUN +# ymm12 T6 K +# ymm13 T7 W14 +# ymm14 T8 W15 +# ymm15 T9 W16 + + +A = %ymm0 +B = %ymm1 +C = %ymm2 +D = %ymm3 +E = %ymm4 +F = %ymm5 +T0 = %ymm6 +T1 = %ymm7 +T2 = %ymm8 +T3 = %ymm9 +T4 = %ymm10 +T5 = %ymm11 +T6 = %ymm12 +T7 = %ymm13 +T8 = %ymm14 +T9 = %ymm15 + +AA = %ymm5 +BB = %ymm6 +CC = %ymm7 +DD = %ymm8 +EE = %ymm9 +TMP = %ymm10 +FUN = %ymm11 +K = %ymm12 +W14 = %ymm13 +W15 = %ymm14 +W16 = %ymm15 + +.macro ROTATE_ARGS + TMP_ = E + E = D + D = C + C = B + B = A + A = TMP_ +.endm + +.macro ROTATE_W +TMP_ = W16 +W16 = W15 +W15 = W14 +W14 = TMP_ +.endm + +# 8 streams x 5 32bit words per digest x 4 bytes per word +#define DIGEST_SIZE (8*5*4) + +.align 32 + +# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size) +# arg 1 : pointer to array[4] of pointer to input data +# arg 2 : size (in blocks) ;; assumed to be >= 1 +# +ENTRY(sha1_x8_avx2) + + # save callee-saved clobbered registers to comply with C function ABI + push %r12 + push %r13 + push %r14 + push %r15 + + #save rsp + mov %rsp, RSP_SAVE + sub $FRAMESZ, %rsp + + #align rsp to 32 Bytes + and $~0x1F, %rsp + + ## Initialize digests + vmovdqu 0*32(arg1), A + vmovdqu 1*32(arg1), B + vmovdqu 2*32(arg1), C + vmovdqu 3*32(arg1), D + vmovdqu 4*32(arg1), E + + ## transpose input onto stack + mov _data_ptr+0*8(arg1),inp0 + mov _data_ptr+1*8(arg1),inp1 + mov _data_ptr+2*8(arg1),inp2 + mov _data_ptr+3*8(arg1),inp3 + mov _data_ptr+4*8(arg1),inp4 + mov _data_ptr+5*8(arg1),inp5 + mov _data_ptr+6*8(arg1),inp6 + mov _data_ptr+7*8(arg1),inp7 + + xor IDX, IDX +lloop: + vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), F + I=0 +.rep 2 + VMOVPS (inp0, IDX), T0 + VMOVPS (inp1, IDX), T1 + VMOVPS (inp2, IDX), T2 + VMOVPS (inp3, IDX), T3 + VMOVPS (inp4, IDX), T4 + VMOVPS (inp5, IDX), T5 + VMOVPS (inp6, IDX), T6 + VMOVPS (inp7, IDX), T7 + + TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 + vpshufb F, T0, T0 + vmovdqu T0, (I*8)*32(%rsp) + vpshufb F, T1, T1 + vmovdqu T1, (I*8+1)*32(%rsp) + vpshufb F, T2, T2 + vmovdqu T2, (I*8+2)*32(%rsp) + vpshufb F, T3, T3 + vmovdqu T3, (I*8+3)*32(%rsp) + vpshufb F, T4, T4 + vmovdqu T4, (I*8+4)*32(%rsp) + vpshufb F, T5, T5 + vmovdqu T5, (I*8+5)*32(%rsp) + vpshufb F, T6, T6 + vmovdqu T6, (I*8+6)*32(%rsp) + vpshufb F, T7, T7 + vmovdqu T7, (I*8+7)*32(%rsp) + add $32, IDX + I = (I+1) +.endr + # save old digests + vmovdqu A,AA + vmovdqu B,BB + vmovdqu C,CC + vmovdqu D,DD + vmovdqu E,EE + +## +## perform 0-79 steps +## + vmovdqu K00_19(%rip), K +## do rounds 0...15 + I = 0 +.rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS + I = (I+1) +.endr + +## do rounds 16...19 + vmovdqu ((16 - 16) & 15) * 32 (%rsp), W16 + vmovdqu ((16 - 15) & 15) * 32 (%rsp), W15 +.rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS + I = (I+1) +.endr + +## do rounds 20...39 + vmovdqu K20_39(%rip), K +.rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 + ROTATE_ARGS + I = (I+1) +.endr + +## do rounds 40...59 + vmovdqu K40_59(%rip), K +.rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 + ROTATE_ARGS + I = (I+1) +.endr + +## do rounds 60...79 + vmovdqu K60_79(%rip), K +.rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 + ROTATE_ARGS + I = (I+1) +.endr + + vpaddd AA,A,A + vpaddd BB,B,B + vpaddd CC,C,C + vpaddd DD,D,D + vpaddd EE,E,E + + sub $1, arg2 + jne lloop + + # write out digests + vmovdqu A, 0*32(arg1) + vmovdqu B, 1*32(arg1) + vmovdqu C, 2*32(arg1) + vmovdqu D, 3*32(arg1) + vmovdqu E, 4*32(arg1) + + # update input pointers + add IDX, inp0 + add IDX, inp1 + add IDX, inp2 + add IDX, inp3 + add IDX, inp4 + add IDX, inp5 + add IDX, inp6 + add IDX, inp7 + mov inp0, _data_ptr (arg1) + mov inp1, _data_ptr + 1*8(arg1) + mov inp2, _data_ptr + 2*8(arg1) + mov inp3, _data_ptr + 3*8(arg1) + mov inp4, _data_ptr + 4*8(arg1) + mov inp5, _data_ptr + 5*8(arg1) + mov inp6, _data_ptr + 6*8(arg1) + mov inp7, _data_ptr + 7*8(arg1) + + ################ + ## Postamble + + mov RSP_SAVE, %rsp + + # restore callee-saved clobbered registers + pop %r15 + pop %r14 + pop %r13 + pop %r12 + + ret +ENDPROC(sha1_x8_avx2) + + +.section .rodata.cst32.K00_19, "aM", @progbits, 32 +.align 32 +K00_19: +.octa 0x5A8279995A8279995A8279995A827999 +.octa 0x5A8279995A8279995A8279995A827999 + +.section .rodata.cst32.K20_39, "aM", @progbits, 32 +.align 32 +K20_39: +.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 +.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + +.section .rodata.cst32.K40_59, "aM", @progbits, 32 +.align 32 +K40_59: +.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC +.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + +.section .rodata.cst32.K60_79, "aM", @progbits, 32 +.align 32 +K60_79: +.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 +.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 +PSHUFFLE_BYTE_FLIP_MASK: +.octa 0x0c0d0e0f08090a0b0405060700010203 +.octa 0x0c0d0e0f08090a0b0405060700010203 |