summaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto/sha1-mb
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 01:02:30 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 01:02:30 +0000
commit76cb841cb886eef6b3bee341a2266c76578724ad (patch)
treef5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /arch/x86/crypto/sha1-mb
parentInitial commit. (diff)
downloadlinux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz
linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip
Adding upstream version 4.19.249.upstream/4.19.249
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/crypto/sha1-mb')
-rw-r--r--arch/x86/crypto/sha1-mb/Makefile14
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb.c1011
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_ctx.h134
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_mgr.h110
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S287
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S304
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c64
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S209
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_x8_avx2.S492
9 files changed, 2625 insertions, 0 deletions
diff --git a/arch/x86/crypto/sha1-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile
new file mode 100644
index 000000000..815ded3ba
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Arch-specific CryptoAPI modules.
+#
+
+OBJECT_FILES_NON_STANDARD := y
+
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+ $(comma)4)$(comma)%ymm2,yes,no)
+ifeq ($(avx2_supported),yes)
+ obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb.o
+ sha1-mb-y := sha1_mb.o sha1_mb_mgr_flush_avx2.o \
+ sha1_mb_mgr_init_avx2.o sha1_mb_mgr_submit_avx2.o sha1_x8_avx2.o
+endif
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c
new file mode 100644
index 000000000..b93805664
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb.c
@@ -0,0 +1,1011 @@
+/*
+ * Multi buffer SHA1 algorithm Glue Code
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <crypto/mcryptd.h>
+#include <crypto/crypto_wq.h>
+#include <asm/byteorder.h>
+#include <linux/hardirq.h>
+#include <asm/fpu/api.h>
+#include "sha1_mb_ctx.h"
+
+#define FLUSH_INTERVAL 1000 /* in usec */
+
+static struct mcryptd_alg_state sha1_mb_alg_state;
+
+struct sha1_mb_ctx {
+ struct mcryptd_ahash *mcryptd_tfm;
+};
+
+static inline struct mcryptd_hash_request_ctx
+ *cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx)
+{
+ struct ahash_request *areq;
+
+ areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
+ return container_of(areq, struct mcryptd_hash_request_ctx, areq);
+}
+
+static inline struct ahash_request
+ *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+{
+ return container_of((void *) ctx, struct ahash_request, __ctx);
+}
+
+static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
+ struct ahash_request *areq)
+{
+ rctx->flag = HASH_UPDATE;
+}
+
+static asmlinkage void (*sha1_job_mgr_init)(struct sha1_mb_mgr *state);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)
+ (struct sha1_mb_mgr *state, struct job_sha1 *job);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)
+ (struct sha1_mb_mgr *state);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)
+ (struct sha1_mb_mgr *state);
+
+static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2],
+ uint64_t total_len)
+{
+ uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1);
+
+ memset(&padblock[i], 0, SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((SHA1_BLOCK_SIZE - 1) &
+ (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1)))
+ + 1 + SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) &padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3);
+
+ /* Number of extra blocks to hash */
+ return i >> SHA1_LOG2_BLOCK_SIZE;
+}
+
+static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr,
+ struct sha1_hash_ctx *ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ /* Clear PROCESSING bit */
+ ctx->status = HASH_CTX_STS_COMPLETE;
+ return ctx;
+ }
+
+ /*
+ * If the extra blocks are empty, begin hashing what remains
+ * in the user's buffer.
+ */
+ if (ctx->partial_block_buffer_length == 0 &&
+ ctx->incoming_buffer_length) {
+
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+ uint32_t copy_len;
+
+ /*
+ * Only entire blocks can be hashed.
+ * Copy remainder to extra blocks buffer.
+ */
+ copy_len = len & (SHA1_BLOCK_SIZE-1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy(ctx->partial_block_buffer,
+ ((const char *) buffer + len),
+ copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ /* len should be a multiple of the block size now */
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ /* Set len to the number of blocks to be hashed */
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (struct sha1_hash_ctx *)sha1_job_mgr_submit(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+
+ /*
+ * If the extra blocks are not empty, then we are
+ * either on the last block(s) or we need more
+ * user input before continuing.
+ */
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks =
+ sha1_pad(buf, ctx->total_length);
+
+ ctx->status = (HASH_CTX_STS_PROCESSING |
+ HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (struct sha1_hash_ctx *)
+ sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static struct sha1_hash_ctx
+ *sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr)
+{
+ /*
+ * If get_comp_job returns NULL, there are no jobs complete.
+ * If get_comp_job returns a job, verify that it is safe to return to
+ * the user.
+ * If it is not ready, resubmit the job to finish processing.
+ * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ * Otherwise, all jobs currently being managed by the hash_ctx_mgr
+ * still need processing.
+ */
+ struct sha1_hash_ctx *ctx;
+
+ ctx = (struct sha1_hash_ctx *) sha1_job_mgr_get_comp_job(&mgr->mgr);
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static void sha1_ctx_mgr_init(struct sha1_ctx_mgr *mgr)
+{
+ sha1_job_mgr_init(&mgr->mgr);
+}
+
+static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
+ struct sha1_hash_ctx *ctx,
+ const void *buffer,
+ uint32_t len,
+ int flags)
+{
+ if (flags & ~(HASH_UPDATE | HASH_LAST)) {
+ /* User should not pass anything other than UPDATE or LAST */
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ /* Cannot submit to a currently processing job. */
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ /* Cannot update a finished job. */
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ /*
+ * If we made it here, there were no errors during this call to
+ * submit
+ */
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ /* Store buffer ptr info from user */
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ /*
+ * Store the user's request flags and mark this ctx as currently
+ * being processed.
+ */
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ /* Advance byte counter */
+ ctx->total_length += len;
+
+ /*
+ * If there is anything currently buffered in the extra blocks,
+ * append to it until it contains a whole block.
+ * Or if the user's buffer contains less than a whole block,
+ * append as much as possible to the extra block.
+ */
+ if (ctx->partial_block_buffer_length || len < SHA1_BLOCK_SIZE) {
+ /*
+ * Compute how many bytes to copy from user buffer into
+ * extra block
+ */
+ uint32_t copy_len = SHA1_BLOCK_SIZE -
+ ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ /* Copy and update relevant pointers and counters */
+ memcpy(&ctx->partial_block_buffer[ctx->partial_block_buffer_length],
+ buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)
+ ((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+
+ /*
+ * The extra block should never contain more than 1 block
+ * here
+ */
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ /*
+ * If the extra block buffer contains exactly 1 block, it can
+ * be hashed.
+ */
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (struct sha1_hash_ctx *)
+ sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr)
+{
+ struct sha1_hash_ctx *ctx;
+
+ while (1) {
+ ctx = (struct sha1_hash_ctx *) sha1_job_mgr_flush(&mgr->mgr);
+
+ /* If flush returned 0, there are no more jobs in flight. */
+ if (!ctx)
+ return NULL;
+
+ /*
+ * If flush returned a job, resubmit the job to finish
+ * processing.
+ */
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ /*
+ * If sha1_ctx_mgr_resubmit returned a job, it is ready to be
+ * returned. Otherwise, all jobs currently being managed by the
+ * sha1_ctx_mgr still need processing. Loop.
+ */
+ if (ctx)
+ return ctx;
+ }
+}
+
+static int sha1_mb_init(struct ahash_request *areq)
+{
+ struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ hash_ctx_init(sctx);
+ sctx->job.result_digest[0] = SHA1_H0;
+ sctx->job.result_digest[1] = SHA1_H1;
+ sctx->job.result_digest[2] = SHA1_H2;
+ sctx->job.result_digest[3] = SHA1_H3;
+ sctx->job.result_digest[4] = SHA1_H4;
+ sctx->total_length = 0;
+ sctx->partial_block_buffer_length = 0;
+ sctx->status = HASH_CTX_STS_IDLE;
+
+ return 0;
+}
+
+static int sha1_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
+{
+ int i;
+ struct sha1_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
+ __be32 *dst = (__be32 *) rctx->out;
+
+ for (i = 0; i < 5; ++i)
+ dst[i] = cpu_to_be32(sctx->job.result_digest[i]);
+
+ return 0;
+}
+
+static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
+ struct mcryptd_alg_cstate *cstate, bool flush)
+{
+ int flag = HASH_UPDATE;
+ int nbytes, err = 0;
+ struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+ struct sha1_hash_ctx *sha_ctx;
+
+ /* more work ? */
+ while (!(rctx->flag & HASH_DONE)) {
+ nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+ if (nbytes < 0) {
+ err = nbytes;
+ goto out;
+ }
+ /* check if the walk is done */
+ if (crypto_ahash_walk_last(&rctx->walk)) {
+ rctx->flag |= HASH_DONE;
+ if (rctx->flag & HASH_FINAL)
+ flag |= HASH_LAST;
+
+ }
+ sha_ctx = (struct sha1_hash_ctx *)
+ ahash_request_ctx(&rctx->areq);
+ kernel_fpu_begin();
+ sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx,
+ rctx->walk.data, nbytes, flag);
+ if (!sha_ctx) {
+ if (flush)
+ sha_ctx = sha1_ctx_mgr_flush(cstate->mgr);
+ }
+ kernel_fpu_end();
+ if (sha_ctx)
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ else {
+ rctx = NULL;
+ goto out;
+ }
+ }
+
+ /* copy the results */
+ if (rctx->flag & HASH_FINAL)
+ sha1_mb_set_results(rctx);
+
+out:
+ *ret_rctx = rctx;
+ return err;
+}
+
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+ struct mcryptd_alg_cstate *cstate,
+ int err)
+{
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha1_hash_ctx *sha_ctx;
+ struct mcryptd_hash_request_ctx *req_ctx;
+ int ret;
+
+ /* remove from work list */
+ spin_lock(&cstate->work_lock);
+ list_del(&rctx->waiter);
+ spin_unlock(&cstate->work_lock);
+
+ if (irqs_disabled())
+ rctx->complete(&req->base, err);
+ else {
+ local_bh_disable();
+ rctx->complete(&req->base, err);
+ local_bh_enable();
+ }
+
+ /* check to see if there are other jobs that are done */
+ sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr);
+ while (sha_ctx) {
+ req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&req_ctx, cstate, false);
+ if (req_ctx) {
+ spin_lock(&cstate->work_lock);
+ list_del(&req_ctx->waiter);
+ spin_unlock(&cstate->work_lock);
+
+ req = cast_mcryptd_ctx_to_req(req_ctx);
+ if (irqs_disabled())
+ req_ctx->complete(&req->base, ret);
+ else {
+ local_bh_disable();
+ req_ctx->complete(&req->base, ret);
+ local_bh_enable();
+ }
+ }
+ sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr);
+ }
+
+ return 0;
+}
+
+static void sha1_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+ struct mcryptd_alg_cstate *cstate)
+{
+ unsigned long next_flush;
+ unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+
+ /* initialize tag */
+ rctx->tag.arrival = jiffies; /* tag the arrival time */
+ rctx->tag.seq_num = cstate->next_seq_num++;
+ next_flush = rctx->tag.arrival + delay;
+ rctx->tag.expire = next_flush;
+
+ spin_lock(&cstate->work_lock);
+ list_add_tail(&rctx->waiter, &cstate->work_list);
+ spin_unlock(&cstate->work_lock);
+
+ mcryptd_arm_flusher(cstate, delay);
+}
+
+static int sha1_mb_update(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx, areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
+
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha1_hash_ctx *sha_ctx;
+ int ret = 0, nbytes;
+
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+ if (nbytes < 0) {
+ ret = nbytes;
+ goto done;
+ }
+
+ if (crypto_ahash_walk_last(&rctx->walk))
+ rctx->flag |= HASH_DONE;
+
+ /* submit */
+ sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
+ sha1_mb_add_list(rctx, cstate);
+ kernel_fpu_begin();
+ sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+ nbytes, HASH_UPDATE);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha1_mb_finup(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx, areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
+
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha1_hash_ctx *sha_ctx;
+ int ret = 0, flag = HASH_UPDATE, nbytes;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+ if (nbytes < 0) {
+ ret = nbytes;
+ goto done;
+ }
+
+ if (crypto_ahash_walk_last(&rctx->walk)) {
+ rctx->flag |= HASH_DONE;
+ flag = HASH_LAST;
+ }
+
+ /* submit */
+ rctx->flag |= HASH_FINAL;
+ sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
+ sha1_mb_add_list(rctx, cstate);
+
+ kernel_fpu_begin();
+ sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+ nbytes, flag);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha1_mb_final(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx, areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
+
+ struct sha1_hash_ctx *sha_ctx;
+ int ret = 0;
+ u8 data;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ rctx->flag |= HASH_DONE | HASH_FINAL;
+
+ sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
+ /* flag HASH_FINAL and 0 data size */
+ sha1_mb_add_list(rctx, cstate);
+ kernel_fpu_begin();
+ sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
+ HASH_LAST);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha1_mb_export(struct ahash_request *areq, void *out)
+{
+ struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha1_mb_import(struct ahash_request *areq, const void *in)
+{
+ struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+ struct mcryptd_ahash *mcryptd_tfm;
+ struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct mcryptd_hash_ctx *mctx;
+
+ mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(mcryptd_tfm))
+ return PTR_ERR(mcryptd_tfm);
+ mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
+ mctx->alg_state = &sha1_mb_alg_state;
+ ctx->mcryptd_tfm = mcryptd_tfm;
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(&mcryptd_tfm->base));
+
+ return 0;
+}
+
+static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static int sha1_mb_areq_init_tfm(struct crypto_tfm *tfm)
+{
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ sizeof(struct sha1_hash_ctx));
+
+ return 0;
+}
+
+static void sha1_mb_areq_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static struct ahash_alg sha1_mb_areq_alg = {
+ .init = sha1_mb_init,
+ .update = sha1_mb_update,
+ .final = sha1_mb_final,
+ .finup = sha1_mb_finup,
+ .export = sha1_mb_export,
+ .import = sha1_mb_import,
+ .halg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .statesize = sizeof(struct sha1_hash_ctx),
+ .base = {
+ .cra_name = "__sha1-mb",
+ .cra_driver_name = "__intel_sha1-mb",
+ .cra_priority = 100,
+ /*
+ * use ASYNC flag as some buffers in multi-buffer
+ * algo may not have completed before hashing thread
+ * sleep
+ */
+ .cra_flags = CRYPTO_ALG_ASYNC |
+ CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT
+ (sha1_mb_areq_alg.halg.base.cra_list),
+ .cra_init = sha1_mb_areq_init_tfm,
+ .cra_exit = sha1_mb_areq_exit_tfm,
+ .cra_ctxsize = sizeof(struct sha1_hash_ctx),
+ }
+ }
+};
+
+static int sha1_mb_async_init(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_init(mcryptd_req);
+}
+
+static int sha1_mb_async_update(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_update(mcryptd_req);
+}
+
+static int sha1_mb_async_finup(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_finup(mcryptd_req);
+}
+
+static int sha1_mb_async_final(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_final(mcryptd_req);
+}
+
+static int sha1_mb_async_digest(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_digest(mcryptd_req);
+}
+
+static int sha1_mb_async_export(struct ahash_request *req, void *out)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_export(mcryptd_req, out);
+}
+
+static int sha1_mb_async_import(struct ahash_request *req, const void *in)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+ struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
+ struct mcryptd_hash_request_ctx *rctx;
+ struct ahash_request *areq;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ rctx = ahash_request_ctx(mcryptd_req);
+ areq = &rctx->areq;
+
+ ahash_request_set_tfm(areq, child);
+ ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
+ rctx->complete, req);
+
+ return crypto_ahash_import(mcryptd_req, in);
+}
+
+static struct ahash_alg sha1_mb_async_alg = {
+ .init = sha1_mb_async_init,
+ .update = sha1_mb_async_update,
+ .final = sha1_mb_async_final,
+ .finup = sha1_mb_async_finup,
+ .digest = sha1_mb_async_digest,
+ .export = sha1_mb_async_export,
+ .import = sha1_mb_async_import,
+ .halg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .statesize = sizeof(struct sha1_hash_ctx),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1_mb",
+ /*
+ * Low priority, since with few concurrent hash requests
+ * this is extremely slow due to the flush delay. Users
+ * whose workloads would benefit from this can request
+ * it explicitly by driver name, or can increase its
+ * priority at runtime using NETLINK_CRYPTO.
+ */
+ .cra_priority = 50,
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(sha1_mb_async_alg.halg.base.cra_list),
+ .cra_init = sha1_mb_async_init_tfm,
+ .cra_exit = sha1_mb_async_exit_tfm,
+ .cra_ctxsize = sizeof(struct sha1_mb_ctx),
+ .cra_alignmask = 0,
+ },
+ },
+};
+
+static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
+{
+ struct mcryptd_hash_request_ctx *rctx;
+ unsigned long cur_time;
+ unsigned long next_flush = 0;
+ struct sha1_hash_ctx *sha_ctx;
+
+
+ cur_time = jiffies;
+
+ while (!list_empty(&cstate->work_list)) {
+ rctx = list_entry(cstate->work_list.next,
+ struct mcryptd_hash_request_ctx, waiter);
+ if (time_before(cur_time, rctx->tag.expire))
+ break;
+ kernel_fpu_begin();
+ sha_ctx = (struct sha1_hash_ctx *)
+ sha1_ctx_mgr_flush(cstate->mgr);
+ kernel_fpu_end();
+ if (!sha_ctx) {
+ pr_err("sha1_mb error: nothing got flushed for non-empty list\n");
+ break;
+ }
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ sha_finish_walk(&rctx, cstate, true);
+ sha_complete_job(rctx, cstate, 0);
+ }
+
+ if (!list_empty(&cstate->work_list)) {
+ rctx = list_entry(cstate->work_list.next,
+ struct mcryptd_hash_request_ctx, waiter);
+ /* get the hash context and then flush time */
+ next_flush = rctx->tag.expire;
+ mcryptd_arm_flusher(cstate, get_delay(next_flush));
+ }
+ return next_flush;
+}
+
+static int __init sha1_mb_mod_init(void)
+{
+
+ int cpu;
+ int err;
+ struct mcryptd_alg_cstate *cpu_state;
+
+ /* check for dependent cpu features */
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_BMI2))
+ return -ENODEV;
+
+ /* initialize multibuffer structures */
+ sha1_mb_alg_state.alg_cstate = alloc_percpu(struct mcryptd_alg_cstate);
+
+ sha1_job_mgr_init = sha1_mb_mgr_init_avx2;
+ sha1_job_mgr_submit = sha1_mb_mgr_submit_avx2;
+ sha1_job_mgr_flush = sha1_mb_mgr_flush_avx2;
+ sha1_job_mgr_get_comp_job = sha1_mb_mgr_get_comp_job_avx2;
+
+ if (!sha1_mb_alg_state.alg_cstate)
+ return -ENOMEM;
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
+ cpu_state->next_flush = 0;
+ cpu_state->next_seq_num = 0;
+ cpu_state->flusher_engaged = false;
+ INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
+ cpu_state->cpu = cpu;
+ cpu_state->alg_state = &sha1_mb_alg_state;
+ cpu_state->mgr = kzalloc(sizeof(struct sha1_ctx_mgr),
+ GFP_KERNEL);
+ if (!cpu_state->mgr)
+ goto err2;
+ sha1_ctx_mgr_init(cpu_state->mgr);
+ INIT_LIST_HEAD(&cpu_state->work_list);
+ spin_lock_init(&cpu_state->work_lock);
+ }
+ sha1_mb_alg_state.flusher = &sha1_mb_flusher;
+
+ err = crypto_register_ahash(&sha1_mb_areq_alg);
+ if (err)
+ goto err2;
+ err = crypto_register_ahash(&sha1_mb_async_alg);
+ if (err)
+ goto err1;
+
+
+ return 0;
+err1:
+ crypto_unregister_ahash(&sha1_mb_areq_alg);
+err2:
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
+ kfree(cpu_state->mgr);
+ }
+ free_percpu(sha1_mb_alg_state.alg_cstate);
+ return -ENODEV;
+}
+
+static void __exit sha1_mb_mod_fini(void)
+{
+ int cpu;
+ struct mcryptd_alg_cstate *cpu_state;
+
+ crypto_unregister_ahash(&sha1_mb_async_alg);
+ crypto_unregister_ahash(&sha1_mb_areq_alg);
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
+ kfree(cpu_state->mgr);
+ }
+ free_percpu(sha1_mb_alg_state.alg_cstate);
+}
+
+module_init(sha1_mb_mod_init);
+module_exit(sha1_mb_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, multi buffer accelerated");
+
+MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
new file mode 100644
index 000000000..9454bd16f
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
@@ -0,0 +1,134 @@
+/*
+ * Header file for multi buffer SHA context
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SHA_MB_CTX_INTERNAL_H
+#define _SHA_MB_CTX_INTERNAL_H
+
+#include "sha1_mb_mgr.h"
+
+#define HASH_UPDATE 0x00
+#define HASH_LAST 0x01
+#define HASH_DONE 0x02
+#define HASH_FINAL 0x04
+
+#define HASH_CTX_STS_IDLE 0x00
+#define HASH_CTX_STS_PROCESSING 0x01
+#define HASH_CTX_STS_LAST 0x02
+#define HASH_CTX_STS_COMPLETE 0x04
+
+enum hash_ctx_error {
+ HASH_CTX_ERROR_NONE = 0,
+ HASH_CTX_ERROR_INVALID_FLAGS = -1,
+ HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
+ HASH_CTX_ERROR_ALREADY_COMPLETED = -3,
+
+#ifdef HASH_CTX_DEBUG
+ HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
+#endif
+};
+
+
+#define hash_ctx_user_data(ctx) ((ctx)->user_data)
+#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
+#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
+#define hash_ctx_status(ctx) ((ctx)->status)
+#define hash_ctx_error(ctx) ((ctx)->error)
+#define hash_ctx_init(ctx) \
+ do { \
+ (ctx)->error = HASH_CTX_ERROR_NONE; \
+ (ctx)->status = HASH_CTX_STS_COMPLETE; \
+ } while (0)
+
+
+/* Hash Constants and Typedefs */
+#define SHA1_DIGEST_LENGTH 5
+#define SHA1_LOG2_BLOCK_SIZE 6
+
+#define SHA1_PADLENGTHFIELD_SIZE 8
+
+#ifdef SHA_MB_DEBUG
+#define assert(expr) \
+do { \
+ if (unlikely(!(expr))) { \
+ printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+ #expr, __FILE__, __func__, __LINE__); \
+ } \
+} while (0)
+#else
+#define assert(expr) do {} while (0)
+#endif
+
+struct sha1_ctx_mgr {
+ struct sha1_mb_mgr mgr;
+};
+
+/* typedef struct sha1_ctx_mgr sha1_ctx_mgr; */
+
+struct sha1_hash_ctx {
+ /* Must be at struct offset 0 */
+ struct job_sha1 job;
+ /* status flag */
+ int status;
+ /* error flag */
+ int error;
+
+ uint64_t total_length;
+ const void *incoming_buffer;
+ uint32_t incoming_buffer_length;
+ uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2];
+ uint32_t partial_block_buffer_length;
+ void *user_data;
+};
+
+#endif
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h
new file mode 100644
index 000000000..08ad1a9ac
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h
@@ -0,0 +1,110 @@
+/*
+ * Header file for multi buffer SHA1 algorithm manager
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+
+#include <linux/types.h>
+
+#define NUM_SHA1_DIGEST_WORDS 5
+
+enum job_sts { STS_UNKNOWN = 0,
+ STS_BEING_PROCESSED = 1,
+ STS_COMPLETED = 2,
+ STS_INTERNAL_ERROR = 3,
+ STS_ERROR = 4
+};
+
+struct job_sha1 {
+ u8 *buffer;
+ u32 len;
+ u32 result_digest[NUM_SHA1_DIGEST_WORDS] __aligned(32);
+ enum job_sts status;
+ void *user_data;
+};
+
+/* SHA1 out-of-order scheduler */
+
+/* typedef uint32_t sha1_digest_array[5][8]; */
+
+struct sha1_args_x8 {
+ uint32_t digest[5][8];
+ uint8_t *data_ptr[8];
+};
+
+struct sha1_lane_data {
+ struct job_sha1 *job_in_lane;
+};
+
+struct sha1_mb_mgr {
+ struct sha1_args_x8 args;
+
+ uint32_t lens[8];
+
+ /* each byte is index (0...7) of unused lanes */
+ uint64_t unused_lanes;
+ /* byte 4 is set to FF as a flag */
+ struct sha1_lane_data ldata[8];
+};
+
+
+#define SHA1_MB_MGR_NUM_LANES_AVX2 8
+
+void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state);
+struct job_sha1 *sha1_mb_mgr_submit_avx2(struct sha1_mb_mgr *state,
+ struct job_sha1 *job);
+struct job_sha1 *sha1_mb_mgr_flush_avx2(struct sha1_mb_mgr *state);
+struct job_sha1 *sha1_mb_mgr_get_comp_job_avx2(struct sha1_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S
new file mode 100644
index 000000000..86688c6e7
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S
@@ -0,0 +1,287 @@
+/*
+ * Header file for multi buffer SHA1 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# Macros for defining data structures
+
+# Usage example
+
+#START_FIELDS # JOB_AES
+### name size align
+#FIELD _plaintext, 8, 8 # pointer to plaintext
+#FIELD _ciphertext, 8, 8 # pointer to ciphertext
+#FIELD _IV, 16, 8 # IV
+#FIELD _keys, 8, 8 # pointer to keys
+#FIELD _len, 4, 4 # length in bytes
+#FIELD _status, 4, 4 # status enumeration
+#FIELD _user_data, 8, 8 # pointer to user data
+#UNION _union, size1, align1, \
+# size2, align2, \
+# size3, align3, \
+# ...
+#END_FIELDS
+#%assign _JOB_AES_size _FIELD_OFFSET
+#%assign _JOB_AES_align _STRUCT_ALIGN
+
+#########################################################################
+
+# Alternate "struc-like" syntax:
+# STRUCT job_aes2
+# RES_Q .plaintext, 1
+# RES_Q .ciphertext, 1
+# RES_DQ .IV, 1
+# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
+# RES_U .union, size1, align1, \
+# size2, align2, \
+# ...
+# ENDSTRUCT
+# # Following only needed if nesting
+# %assign job_aes2_size _FIELD_OFFSET
+# %assign job_aes2_align _STRUCT_ALIGN
+#
+# RES_* macros take a name, a count and an optional alignment.
+# The count in in terms of the base size of the macro, and the
+# default alignment is the base size.
+# The macros are:
+# Macro Base size
+# RES_B 1
+# RES_W 2
+# RES_D 4
+# RES_Q 8
+# RES_DQ 16
+# RES_Y 32
+# RES_Z 64
+#
+# RES_U defines a union. It's arguments are a name and two or more
+# pairs of "size, alignment"
+#
+# The two assigns are only needed if this structure is being nested
+# within another. Even if the assigns are not done, one can still use
+# STRUCT_NAME_size as the size of the structure.
+#
+# Note that for nesting, you still need to assign to STRUCT_NAME_size.
+#
+# The differences between this and using "struc" directly are that each
+# type is implicitly aligned to its natural length (although this can be
+# over-ridden with an explicit third parameter), and that the structure
+# is padded at the end to its overall alignment.
+#
+
+#########################################################################
+
+#ifndef _SHA1_MB_MGR_DATASTRUCT_ASM_
+#define _SHA1_MB_MGR_DATASTRUCT_ASM_
+
+## START_FIELDS
+.macro START_FIELDS
+ _FIELD_OFFSET = 0
+ _STRUCT_ALIGN = 0
+.endm
+
+## FIELD name size align
+.macro FIELD name size align
+ _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
+ \name = _FIELD_OFFSET
+ _FIELD_OFFSET = _FIELD_OFFSET + (\size)
+.if (\align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = \align
+.endif
+.endm
+
+## END_FIELDS
+.macro END_FIELDS
+ _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+.endm
+
+########################################################################
+
+.macro STRUCT p1
+START_FIELDS
+.struc \p1
+.endm
+
+.macro ENDSTRUCT
+ tmp = _FIELD_OFFSET
+ END_FIELDS
+ tmp = (_FIELD_OFFSET - %%tmp)
+.if (tmp > 0)
+ .lcomm tmp
+.endif
+.endstruc
+.endm
+
+## RES_int name size align
+.macro RES_int p1 p2 p3
+ name = \p1
+ size = \p2
+ align = .\p3
+
+ _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
+.align align
+.lcomm name size
+ _FIELD_OFFSET = _FIELD_OFFSET + (size)
+.if (align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = align
+.endif
+.endm
+
+
+
+# macro RES_B name, size [, align]
+.macro RES_B _name, _size, _align=1
+RES_int _name _size _align
+.endm
+
+# macro RES_W name, size [, align]
+.macro RES_W _name, _size, _align=2
+RES_int _name 2*(_size) _align
+.endm
+
+# macro RES_D name, size [, align]
+.macro RES_D _name, _size, _align=4
+RES_int _name 4*(_size) _align
+.endm
+
+# macro RES_Q name, size [, align]
+.macro RES_Q _name, _size, _align=8
+RES_int _name 8*(_size) _align
+.endm
+
+# macro RES_DQ name, size [, align]
+.macro RES_DQ _name, _size, _align=16
+RES_int _name 16*(_size) _align
+.endm
+
+# macro RES_Y name, size [, align]
+.macro RES_Y _name, _size, _align=32
+RES_int _name 32*(_size) _align
+.endm
+
+# macro RES_Z name, size [, align]
+.macro RES_Z _name, _size, _align=64
+RES_int _name 64*(_size) _align
+.endm
+
+
+#endif
+
+########################################################################
+#### Define constants
+########################################################################
+
+########################################################################
+#### Define SHA1 Out Of Order Data Structures
+########################################################################
+
+START_FIELDS # LANE_DATA
+### name size align
+FIELD _job_in_lane, 8, 8 # pointer to job object
+END_FIELDS
+
+_LANE_DATA_size = _FIELD_OFFSET
+_LANE_DATA_align = _STRUCT_ALIGN
+
+########################################################################
+
+START_FIELDS # SHA1_ARGS_X8
+### name size align
+FIELD _digest, 4*5*8, 16 # transposed digest
+FIELD _data_ptr, 8*8, 8 # array of pointers to data
+END_FIELDS
+
+_SHA1_ARGS_X4_size = _FIELD_OFFSET
+_SHA1_ARGS_X4_align = _STRUCT_ALIGN
+_SHA1_ARGS_X8_size = _FIELD_OFFSET
+_SHA1_ARGS_X8_align = _STRUCT_ALIGN
+
+########################################################################
+
+START_FIELDS # MB_MGR
+### name size align
+FIELD _args, _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align
+FIELD _lens, 4*8, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align
+END_FIELDS
+
+_MB_MGR_size = _FIELD_OFFSET
+_MB_MGR_align = _STRUCT_ALIGN
+
+_args_digest = _args + _digest
+_args_data_ptr = _args + _data_ptr
+
+
+########################################################################
+#### Define constants
+########################################################################
+
+#define STS_UNKNOWN 0
+#define STS_BEING_PROCESSED 1
+#define STS_COMPLETED 2
+
+########################################################################
+#### Define JOB_SHA1 structure
+########################################################################
+
+START_FIELDS # JOB_SHA1
+
+### name size align
+FIELD _buffer, 8, 8 # pointer to buffer
+FIELD _len, 4, 4 # length in bytes
+FIELD _result_digest, 5*4, 32 # Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+_JOB_SHA1_size = _FIELD_OFFSET
+_JOB_SHA1_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000..7cfba738f
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
@@ -0,0 +1,304 @@
+/*
+ * Flush routine for SHA1 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha1_mb_mgr_datastruct.S"
+
+
+.extern sha1_x8_avx2
+
+# LINUX register definitions
+#define arg1 %rdi
+#define arg2 %rsi
+
+# Common definitions
+#define state arg1
+#define job arg2
+#define len2 arg2
+
+# idx must be a register not clobbered by sha1_x8_avx2
+#define idx %r8
+#define DWORD_idx %r8d
+
+#define unused_lanes %rbx
+#define lane_data %rbx
+#define tmp2 %rbx
+#define tmp2_w %ebx
+
+#define job_rax %rax
+#define tmp1 %rax
+#define size_offset %rax
+#define tmp %rax
+#define start_offset %rax
+
+#define tmp3 %arg1
+
+#define extra_blocks %arg2
+#define p %arg2
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JNE_SKIP i
+jne skip_\i
+.endm
+
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
+# arg 1 : rcx : state
+ENTRY(sha1_mb_mgr_flush_avx2)
+ FRAME_BEGIN
+ push %rbx
+
+ # If bit (32+3) is set, then all lanes are empty
+ mov _unused_lanes(state), unused_lanes
+ bt $32+3, unused_lanes
+ jc return_null
+
+ # find a lane with a non-null job
+ xor idx, idx
+ offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne one(%rip), idx
+ offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne two(%rip), idx
+ offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne three(%rip), idx
+ offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne four(%rip), idx
+ offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne five(%rip), idx
+ offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne six(%rip), idx
+ offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne seven(%rip), idx
+
+ # copy idx to empty lanes
+copy_lane_data:
+ offset = (_args + _data_ptr)
+ mov offset(state,idx,8), tmp
+
+ I = 0
+.rep 8
+ offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+.altmacro
+ JNE_SKIP %I
+ offset = (_args + _data_ptr + 8*I)
+ mov tmp, offset(state)
+ offset = (_lens + 4*I)
+ movl $0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+ I = (I+1)
+.noaltmacro
+.endr
+
+ # Find min length
+ vmovdqu _lens+0*16(state), %xmm0
+ vmovdqu _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
+
+ vmovd %xmm2, DWORD_idx
+ mov idx, len2
+ and $0xF, idx
+ shr $4, len2
+ jz len_is_0
+
+ vpand clear_low_nibble(%rip), %xmm2, %xmm2
+ vpshufd $0, %xmm2, %xmm2
+
+ vpsubd %xmm2, %xmm0, %xmm0
+ vpsubd %xmm2, %xmm1, %xmm1
+
+ vmovdqu %xmm0, _lens+0*16(state)
+ vmovdqu %xmm1, _lens+1*16(state)
+
+ # "state" and "args" are the same address, arg1
+ # len is arg2
+ call sha1_x8_avx2
+ # state and idx are intact
+
+
+len_is_0:
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ mov _unused_lanes(state), unused_lanes
+ shl $4, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state, idx, 4)
+
+ vmovd _args_digest(state , idx, 4) , %xmm0
+ vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+ movl _args_digest+4*32(state, idx, 4), tmp2_w
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ offset = (_result_digest + 1*16)
+ mov tmp2_w, offset(job_rax)
+
+return:
+ pop %rbx
+ FRAME_END
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+ENDPROC(sha1_mb_mgr_flush_avx2)
+
+
+#################################################################
+
+.align 16
+ENTRY(sha1_mb_mgr_get_comp_job_avx2)
+ push %rbx
+
+ ## if bit 32+3 is set, then all lanes are empty
+ mov _unused_lanes(state), unused_lanes
+ bt $(32+3), unused_lanes
+ jc .return_null
+
+ # Find min length
+ vmovdqu _lens(state), %xmm0
+ vmovdqu _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
+
+ vmovd %xmm2, DWORD_idx
+ test $~0xF, idx
+ jnz .return_null
+
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ mov _unused_lanes(state), unused_lanes
+ shl $4, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state, idx, 4)
+
+ vmovd _args_digest(state, idx, 4), %xmm0
+ vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+ movl _args_digest+4*32(state, idx, 4), tmp2_w
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ movl tmp2_w, _result_digest+1*16(job_rax)
+
+ pop %rbx
+
+ ret
+
+.return_null:
+ xor job_rax, job_rax
+ pop %rbx
+ ret
+ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
+
+.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
+.align 16
+clear_low_nibble:
+.octa 0x000000000000000000000000FFFFFFF0
+
+.section .rodata.cst8, "aM", @progbits, 8
+.align 8
+one:
+.quad 1
+two:
+.quad 2
+three:
+.quad 3
+four:
+.quad 4
+five:
+.quad 5
+six:
+.quad 6
+seven:
+.quad 7
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..d2add0d35
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c
@@ -0,0 +1,64 @@
+/*
+ * Initialization code for multi buffer SHA1 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "sha1_mb_mgr.h"
+
+void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210ULL;
+ for (j = 0; j < 8; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = NULL;
+ }
+}
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000..7a93b1c0d
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
@@ -0,0 +1,209 @@
+/*
+ * Buffer submit code for multi buffer SHA1 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha1_mb_mgr_datastruct.S"
+
+
+.extern sha1_x8_avx
+
+# LINUX register definitions
+arg1 = %rdi
+arg2 = %rsi
+size_offset = %rcx
+tmp2 = %rcx
+extra_blocks = %rdx
+
+# Common definitions
+#define state arg1
+#define job %rsi
+#define len2 arg2
+#define p2 arg2
+
+# idx must be a register not clobberred by sha1_x8_avx2
+idx = %r8
+DWORD_idx = %r8d
+last_len = %r8
+
+p = %r11
+start_offset = %r11
+
+unused_lanes = %rbx
+BYTE_unused_lanes = %bl
+
+job_rax = %rax
+len = %rax
+DWORD_len = %eax
+
+lane = %r12
+tmp3 = %r12
+
+tmp = %r9
+DWORD_tmp = %r9d
+
+lane_data = %r10
+
+# JOB* submit_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job)
+# arg 1 : rcx : state
+# arg 2 : rdx : job
+ENTRY(sha1_mb_mgr_submit_avx2)
+ FRAME_BEGIN
+ push %rbx
+ push %r12
+
+ mov _unused_lanes(state), unused_lanes
+ mov unused_lanes, lane
+ and $0xF, lane
+ shr $4, unused_lanes
+ imul $_LANE_DATA_size, lane, lane_data
+ movl $STS_BEING_PROCESSED, _status(job)
+ lea _ldata(state, lane_data), lane_data
+ mov unused_lanes, _unused_lanes(state)
+ movl _len(job), DWORD_len
+
+ mov job, _job_in_lane(lane_data)
+ shl $4, len
+ or lane, len
+
+ movl DWORD_len, _lens(state , lane, 4)
+
+ # Load digest words from result_digest
+ vmovdqu _result_digest(job), %xmm0
+ mov _result_digest+1*16(job), DWORD_tmp
+ vmovd %xmm0, _args_digest(state, lane, 4)
+ vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4)
+ vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4)
+ vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4)
+ movl DWORD_tmp, _args_digest+4*32(state , lane, 4)
+
+ mov _buffer(job), p
+ mov p, _args_data_ptr(state, lane, 8)
+
+ cmp $0xF, unused_lanes
+ jne return_null
+
+start_loop:
+ # Find min length
+ vmovdqa _lens(state), %xmm0
+ vmovdqa _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
+
+ vmovd %xmm2, DWORD_idx
+ mov idx, len2
+ and $0xF, idx
+ shr $4, len2
+ jz len_is_0
+
+ vpand clear_low_nibble(%rip), %xmm2, %xmm2
+ vpshufd $0, %xmm2, %xmm2
+
+ vpsubd %xmm2, %xmm0, %xmm0
+ vpsubd %xmm2, %xmm1, %xmm1
+
+ vmovdqa %xmm0, _lens + 0*16(state)
+ vmovdqa %xmm1, _lens + 1*16(state)
+
+
+ # "state" and "args" are the same address, arg1
+ # len is arg2
+ call sha1_x8_avx2
+
+ # state and idx are intact
+
+len_is_0:
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ mov _unused_lanes(state), unused_lanes
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ shl $4, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state, idx, 4)
+
+ vmovd _args_digest(state, idx, 4), %xmm0
+ vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
+ movl _args_digest+4*32(state, idx, 4), DWORD_tmp
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ movl DWORD_tmp, _result_digest+1*16(job_rax)
+
+return:
+ pop %r12
+ pop %rbx
+ FRAME_END
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ENDPROC(sha1_mb_mgr_submit_avx2)
+
+.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
+.align 16
+clear_low_nibble:
+ .octa 0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
new file mode 100644
index 000000000..20f77aa63
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
@@ -0,0 +1,492 @@
+/*
+ * Multi-buffer SHA1 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include "sha1_mb_mgr_datastruct.S"
+
+## code to compute oct SHA1 using SSE-256
+## outer calling routine takes care of save and restore of XMM registers
+
+## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15# ymm0-15
+##
+## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+## Linux preserves: rdi rbp r8
+##
+## clobbers ymm0-15
+
+
+# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+# "transpose" data in {r0...r7} using temps {t0...t1}
+# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+#
+# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+#
+
+.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
+ # process top half (r0..r3) {a...d}
+ vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ # use r2 in place of t0
+ # process bottom half (r4..r7) {e...h}
+ vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
+ vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
+ vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
+ vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
+ vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
+ vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
+ vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
+ vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0
+
+.endm
+##
+## Magic functions defined in FIPS 180-1
+##
+# macro MAGIC_F0 F,B,C,D,T ## F = (D ^ (B & (C ^ D)))
+.macro MAGIC_F0 regF regB regC regD regT
+ vpxor \regD, \regC, \regF
+ vpand \regB, \regF, \regF
+ vpxor \regD, \regF, \regF
+.endm
+
+# macro MAGIC_F1 F,B,C,D,T ## F = (B ^ C ^ D)
+.macro MAGIC_F1 regF regB regC regD regT
+ vpxor \regC, \regD, \regF
+ vpxor \regB, \regF, \regF
+.endm
+
+# macro MAGIC_F2 F,B,C,D,T ## F = ((B & C) | (B & D) | (C & D))
+.macro MAGIC_F2 regF regB regC regD regT
+ vpor \regC, \regB, \regF
+ vpand \regC, \regB, \regT
+ vpand \regD, \regF, \regF
+ vpor \regT, \regF, \regF
+.endm
+
+# macro MAGIC_F3 F,B,C,D,T ## F = (B ^ C ^ D)
+.macro MAGIC_F3 regF regB regC regD regT
+ MAGIC_F1 \regF,\regB,\regC,\regD,\regT
+.endm
+
+# PROLD reg, imm, tmp
+.macro PROLD reg imm tmp
+ vpsrld $(32-\imm), \reg, \tmp
+ vpslld $\imm, \reg, \reg
+ vpor \tmp, \reg, \reg
+.endm
+
+.macro PROLD_nd reg imm tmp src
+ vpsrld $(32-\imm), \src, \tmp
+ vpslld $\imm, \src, \reg
+ vpor \tmp, \reg, \reg
+.endm
+
+.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
+ vpaddd \immCNT, \regE, \regE
+ vpaddd \memW*32(%rsp), \regE, \regE
+ PROLD_nd \regT, 5, \regF, \regA
+ vpaddd \regT, \regE, \regE
+ \MAGIC \regF, \regB, \regC, \regD, \regT
+ PROLD \regB, 30, \regT
+ vpaddd \regF, \regE, \regE
+.endm
+
+.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
+ vpaddd \immCNT, \regE, \regE
+ offset = ((\memW - 14) & 15) * 32
+ vmovdqu offset(%rsp), W14
+ vpxor W14, W16, W16
+ offset = ((\memW - 8) & 15) * 32
+ vpxor offset(%rsp), W16, W16
+ offset = ((\memW - 3) & 15) * 32
+ vpxor offset(%rsp), W16, W16
+ vpsrld $(32-1), W16, \regF
+ vpslld $1, W16, W16
+ vpor W16, \regF, \regF
+
+ ROTATE_W
+
+ offset = ((\memW - 0) & 15) * 32
+ vmovdqu \regF, offset(%rsp)
+ vpaddd \regF, \regE, \regE
+ PROLD_nd \regT, 5, \regF, \regA
+ vpaddd \regT, \regE, \regE
+ \MAGIC \regF,\regB,\regC,\regD,\regT ## FUN = MAGIC_Fi(B,C,D)
+ PROLD \regB,30, \regT
+ vpaddd \regF, \regE, \regE
+.endm
+
+########################################################################
+########################################################################
+########################################################################
+
+## FRAMESZ plus pushes must be an odd multiple of 8
+YMM_SAVE = (15-15)*32
+FRAMESZ = 32*16 + YMM_SAVE
+_YMM = FRAMESZ - YMM_SAVE
+
+#define VMOVPS vmovups
+
+IDX = %rax
+inp0 = %r9
+inp1 = %r10
+inp2 = %r11
+inp3 = %r12
+inp4 = %r13
+inp5 = %r14
+inp6 = %r15
+inp7 = %rcx
+arg1 = %rdi
+arg2 = %rsi
+RSP_SAVE = %rdx
+
+# ymm0 A
+# ymm1 B
+# ymm2 C
+# ymm3 D
+# ymm4 E
+# ymm5 F AA
+# ymm6 T0 BB
+# ymm7 T1 CC
+# ymm8 T2 DD
+# ymm9 T3 EE
+# ymm10 T4 TMP
+# ymm11 T5 FUN
+# ymm12 T6 K
+# ymm13 T7 W14
+# ymm14 T8 W15
+# ymm15 T9 W16
+
+
+A = %ymm0
+B = %ymm1
+C = %ymm2
+D = %ymm3
+E = %ymm4
+F = %ymm5
+T0 = %ymm6
+T1 = %ymm7
+T2 = %ymm8
+T3 = %ymm9
+T4 = %ymm10
+T5 = %ymm11
+T6 = %ymm12
+T7 = %ymm13
+T8 = %ymm14
+T9 = %ymm15
+
+AA = %ymm5
+BB = %ymm6
+CC = %ymm7
+DD = %ymm8
+EE = %ymm9
+TMP = %ymm10
+FUN = %ymm11
+K = %ymm12
+W14 = %ymm13
+W15 = %ymm14
+W16 = %ymm15
+
+.macro ROTATE_ARGS
+ TMP_ = E
+ E = D
+ D = C
+ C = B
+ B = A
+ A = TMP_
+.endm
+
+.macro ROTATE_W
+TMP_ = W16
+W16 = W15
+W15 = W14
+W14 = TMP_
+.endm
+
+# 8 streams x 5 32bit words per digest x 4 bytes per word
+#define DIGEST_SIZE (8*5*4)
+
+.align 32
+
+# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size)
+# arg 1 : pointer to array[4] of pointer to input data
+# arg 2 : size (in blocks) ;; assumed to be >= 1
+#
+ENTRY(sha1_x8_avx2)
+
+ # save callee-saved clobbered registers to comply with C function ABI
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ #save rsp
+ mov %rsp, RSP_SAVE
+ sub $FRAMESZ, %rsp
+
+ #align rsp to 32 Bytes
+ and $~0x1F, %rsp
+
+ ## Initialize digests
+ vmovdqu 0*32(arg1), A
+ vmovdqu 1*32(arg1), B
+ vmovdqu 2*32(arg1), C
+ vmovdqu 3*32(arg1), D
+ vmovdqu 4*32(arg1), E
+
+ ## transpose input onto stack
+ mov _data_ptr+0*8(arg1),inp0
+ mov _data_ptr+1*8(arg1),inp1
+ mov _data_ptr+2*8(arg1),inp2
+ mov _data_ptr+3*8(arg1),inp3
+ mov _data_ptr+4*8(arg1),inp4
+ mov _data_ptr+5*8(arg1),inp5
+ mov _data_ptr+6*8(arg1),inp6
+ mov _data_ptr+7*8(arg1),inp7
+
+ xor IDX, IDX
+lloop:
+ vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), F
+ I=0
+.rep 2
+ VMOVPS (inp0, IDX), T0
+ VMOVPS (inp1, IDX), T1
+ VMOVPS (inp2, IDX), T2
+ VMOVPS (inp3, IDX), T3
+ VMOVPS (inp4, IDX), T4
+ VMOVPS (inp5, IDX), T5
+ VMOVPS (inp6, IDX), T6
+ VMOVPS (inp7, IDX), T7
+
+ TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ vpshufb F, T0, T0
+ vmovdqu T0, (I*8)*32(%rsp)
+ vpshufb F, T1, T1
+ vmovdqu T1, (I*8+1)*32(%rsp)
+ vpshufb F, T2, T2
+ vmovdqu T2, (I*8+2)*32(%rsp)
+ vpshufb F, T3, T3
+ vmovdqu T3, (I*8+3)*32(%rsp)
+ vpshufb F, T4, T4
+ vmovdqu T4, (I*8+4)*32(%rsp)
+ vpshufb F, T5, T5
+ vmovdqu T5, (I*8+5)*32(%rsp)
+ vpshufb F, T6, T6
+ vmovdqu T6, (I*8+6)*32(%rsp)
+ vpshufb F, T7, T7
+ vmovdqu T7, (I*8+7)*32(%rsp)
+ add $32, IDX
+ I = (I+1)
+.endr
+ # save old digests
+ vmovdqu A,AA
+ vmovdqu B,BB
+ vmovdqu C,CC
+ vmovdqu D,DD
+ vmovdqu E,EE
+
+##
+## perform 0-79 steps
+##
+ vmovdqu K00_19(%rip), K
+## do rounds 0...15
+ I = 0
+.rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+## do rounds 16...19
+ vmovdqu ((16 - 16) & 15) * 32 (%rsp), W16
+ vmovdqu ((16 - 15) & 15) * 32 (%rsp), W15
+.rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+## do rounds 20...39
+ vmovdqu K20_39(%rip), K
+.rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+## do rounds 40...59
+ vmovdqu K40_59(%rip), K
+.rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+## do rounds 60...79
+ vmovdqu K60_79(%rip), K
+.rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+ vpaddd AA,A,A
+ vpaddd BB,B,B
+ vpaddd CC,C,C
+ vpaddd DD,D,D
+ vpaddd EE,E,E
+
+ sub $1, arg2
+ jne lloop
+
+ # write out digests
+ vmovdqu A, 0*32(arg1)
+ vmovdqu B, 1*32(arg1)
+ vmovdqu C, 2*32(arg1)
+ vmovdqu D, 3*32(arg1)
+ vmovdqu E, 4*32(arg1)
+
+ # update input pointers
+ add IDX, inp0
+ add IDX, inp1
+ add IDX, inp2
+ add IDX, inp3
+ add IDX, inp4
+ add IDX, inp5
+ add IDX, inp6
+ add IDX, inp7
+ mov inp0, _data_ptr (arg1)
+ mov inp1, _data_ptr + 1*8(arg1)
+ mov inp2, _data_ptr + 2*8(arg1)
+ mov inp3, _data_ptr + 3*8(arg1)
+ mov inp4, _data_ptr + 4*8(arg1)
+ mov inp5, _data_ptr + 5*8(arg1)
+ mov inp6, _data_ptr + 6*8(arg1)
+ mov inp7, _data_ptr + 7*8(arg1)
+
+ ################
+ ## Postamble
+
+ mov RSP_SAVE, %rsp
+
+ # restore callee-saved clobbered registers
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+
+ ret
+ENDPROC(sha1_x8_avx2)
+
+
+.section .rodata.cst32.K00_19, "aM", @progbits, 32
+.align 32
+K00_19:
+.octa 0x5A8279995A8279995A8279995A827999
+.octa 0x5A8279995A8279995A8279995A827999
+
+.section .rodata.cst32.K20_39, "aM", @progbits, 32
+.align 32
+K20_39:
+.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+
+.section .rodata.cst32.K40_59, "aM", @progbits, 32
+.align 32
+K40_59:
+.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+
+.section .rodata.cst32.K60_79, "aM", @progbits, 32
+.align 32
+K60_79:
+.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+PSHUFFLE_BYTE_FLIP_MASK:
+.octa 0x0c0d0e0f08090a0b0405060700010203
+.octa 0x0c0d0e0f08090a0b0405060700010203