summaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto/sha256-mb
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/crypto/sha256-mb')
-rw-r--r--arch/x86/crypto/sha256-mb/Makefile14
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb.c1013
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_ctx.h134
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_mgr.h108
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S304
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S307
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c65
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S214
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_x8_avx2.S598
9 files changed, 2757 insertions, 0 deletions
diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile
new file mode 100644
index 000000000..53ad6e7db
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Arch-specific CryptoAPI modules.
+#
+
+OBJECT_FILES_NON_STANDARD := y
+
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+ $(comma)4)$(comma)%ymm2,yes,no)
+ifeq ($(avx2_supported),yes)
+ obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb.o
+ sha256-mb-y := sha256_mb.o sha256_mb_mgr_flush_avx2.o \
+ sha256_mb_mgr_init_avx2.o sha256_mb_mgr_submit_avx2.o sha256_x8_avx2.o
+endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c
new file mode 100644
index 000000000..97c5fc43e
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb.c
@@ -0,0 +1,1013 @@
+/*
+ * Multi buffer SHA256 algorithm Glue Code
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <crypto/mcryptd.h>
+#include <crypto/crypto_wq.h>
+#include <asm/byteorder.h>
+#include <linux/hardirq.h>
+#include <asm/fpu/api.h>
+#include "sha256_mb_ctx.h"
+
+#define FLUSH_INTERVAL 1000 /* in usec */
+
+static struct mcryptd_alg_state sha256_mb_alg_state;
+
+struct sha256_mb_ctx {
+ struct mcryptd_ahash *mcryptd_tfm;
+};
+
+static inline struct mcryptd_hash_request_ctx
+ *cast_hash_to_mcryptd_ctx(struct sha256_hash_ctx *hash_ctx)
+{
+ struct ahash_request *areq;
+
+ areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
+ return container_of(areq, struct mcryptd_hash_request_ctx, areq);
+}
+
+static inline struct ahash_request
+ *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+{
+ return container_of((void *) ctx, struct ahash_request, __ctx);
+}
+
+static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
+ struct ahash_request *areq)
+{
+ rctx->flag = HASH_UPDATE;
+}
+
+static asmlinkage void (*sha256_job_mgr_init)(struct sha256_mb_mgr *state);
+static asmlinkage struct job_sha256* (*sha256_job_mgr_submit)
+ (struct sha256_mb_mgr *state, struct job_sha256 *job);
+static asmlinkage struct job_sha256* (*sha256_job_mgr_flush)
+ (struct sha256_mb_mgr *state);
+static asmlinkage struct job_sha256* (*sha256_job_mgr_get_comp_job)
+ (struct sha256_mb_mgr *state);
+
+inline uint32_t sha256_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2],
+ uint64_t total_len)
+{
+ uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1);
+
+ memset(&padblock[i], 0, SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((SHA256_BLOCK_SIZE - 1) &
+ (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1)))
+ + 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) &padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3);
+
+ /* Number of extra blocks to hash */
+ return i >> SHA256_LOG2_BLOCK_SIZE;
+}
+
+static struct sha256_hash_ctx
+ *sha256_ctx_mgr_resubmit(struct sha256_ctx_mgr *mgr,
+ struct sha256_hash_ctx *ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ /* Clear PROCESSING bit */
+ ctx->status = HASH_CTX_STS_COMPLETE;
+ return ctx;
+ }
+
+ /*
+ * If the extra blocks are empty, begin hashing what remains
+ * in the user's buffer.
+ */
+ if (ctx->partial_block_buffer_length == 0 &&
+ ctx->incoming_buffer_length) {
+
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+ uint32_t copy_len;
+
+ /*
+ * Only entire blocks can be hashed.
+ * Copy remainder to extra blocks buffer.
+ */
+ copy_len = len & (SHA256_BLOCK_SIZE-1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy(ctx->partial_block_buffer,
+ ((const char *) buffer + len),
+ copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ /* len should be a multiple of the block size now */
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ /* Set len to the number of blocks to be hashed */
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (struct sha256_hash_ctx *)
+ sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+ continue;
+ }
+ }
+
+ /*
+ * If the extra blocks are not empty, then we are
+ * either on the last block(s) or we need more
+ * user input before continuing.
+ */
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks =
+ sha256_pad(buf, ctx->total_length);
+
+ ctx->status = (HASH_CTX_STS_PROCESSING |
+ HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (struct sha256_hash_ctx *)
+ sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static struct sha256_hash_ctx
+ *sha256_ctx_mgr_get_comp_ctx(struct sha256_ctx_mgr *mgr)
+{
+ /*
+ * If get_comp_job returns NULL, there are no jobs complete.
+ * If get_comp_job returns a job, verify that it is safe to return to
+ * the user. If it is not ready, resubmit the job to finish processing.
+ * If sha256_ctx_mgr_resubmit returned a job, it is ready to be
+ * returned. Otherwise, all jobs currently being managed by the
+ * hash_ctx_mgr still need processing.
+ */
+ struct sha256_hash_ctx *ctx;
+
+ ctx = (struct sha256_hash_ctx *) sha256_job_mgr_get_comp_job(&mgr->mgr);
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static void sha256_ctx_mgr_init(struct sha256_ctx_mgr *mgr)
+{
+ sha256_job_mgr_init(&mgr->mgr);
+}
+
+static struct sha256_hash_ctx *sha256_ctx_mgr_submit(struct sha256_ctx_mgr *mgr,
+ struct sha256_hash_ctx *ctx,
+ const void *buffer,
+ uint32_t len,
+ int flags)
+{
+ if (flags & ~(HASH_UPDATE | HASH_LAST)) {
+ /* User should not pass anything other than UPDATE or LAST */
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ /* Cannot submit to a currently processing job. */
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ /* Cannot update a finished job. */
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ /* If we made it here, there was no error during this call to submit */
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ /* Store buffer ptr info from user */
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ /*
+ * Store the user's request flags and mark this ctx as currently
+ * being processed.
+ */
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ /* Advance byte counter */
+ ctx->total_length += len;
+
+ /*
+ * If there is anything currently buffered in the extra blocks,
+ * append to it until it contains a whole block.
+ * Or if the user's buffer contains less than a whole block,
+ * append as much as possible to the extra block.
+ */
+ if (ctx->partial_block_buffer_length || len < SHA256_BLOCK_SIZE) {
+ /*
+ * Compute how many bytes to copy from user buffer into
+ * extra block
+ */
+ uint32_t copy_len = SHA256_BLOCK_SIZE -
+ ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ /* Copy and update relevant pointers and counters */
+ memcpy(
+ &ctx->partial_block_buffer[ctx->partial_block_buffer_length],
+ buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)
+ ((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+
+ /* The extra block should never contain more than 1 block */
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ /*
+ * If the extra block buffer contains exactly 1 block,
+ * it can be hashed.
+ */
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (struct sha256_hash_ctx *)
+ sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static struct sha256_hash_ctx *sha256_ctx_mgr_flush(struct sha256_ctx_mgr *mgr)
+{
+ struct sha256_hash_ctx *ctx;
+
+ while (1) {
+ ctx = (struct sha256_hash_ctx *)
+ sha256_job_mgr_flush(&mgr->mgr);
+
+ /* If flush returned 0, there are no more jobs in flight. */
+ if (!ctx)
+ return NULL;
+
+ /*
+ * If flush returned a job, resubmit the job to finish
+ * processing.
+ */
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ /*
+ * If sha256_ctx_mgr_resubmit returned a job, it is ready to
+ * be returned. Otherwise, all jobs currently being managed by
+ * the sha256_ctx_mgr still need processing. Loop.
+ */
+ if (ctx)
+ return ctx;
+ }
+}
+
+static int sha256_mb_init(struct ahash_request *areq)
+{
+ struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ hash_ctx_init(sctx);
+ sctx->job.result_digest[0] = SHA256_H0;
+ sctx->job.result_digest[1] = SHA256_H1;
+ sctx->job.result_digest[2] = SHA256_H2;
+ sctx->job.result_digest[3] = SHA256_H3;
+ sctx->job.result_digest[4] = SHA256_H4;
+ sctx->job.result_digest[5] = SHA256_H5;
+ sctx->job.result_digest[6] = SHA256_H6;
+ sctx->job.result_digest[7] = SHA256_H7;
+ sctx->total_length = 0;
+ sctx->partial_block_buffer_length = 0;
+ sctx->status = HASH_CTX_STS_IDLE;
+
+ return 0;
+}
+
+static int sha256_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
+{
+ int i;
+ struct sha256_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
+ __be32 *dst = (__be32 *) rctx->out;
+
+ for (i = 0; i < 8; ++i)
+ dst[i] = cpu_to_be32(sctx->job.result_digest[i]);
+
+ return 0;
+}
+
+static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
+ struct mcryptd_alg_cstate *cstate, bool flush)
+{
+ int flag = HASH_UPDATE;
+ int nbytes, err = 0;
+ struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+ struct sha256_hash_ctx *sha_ctx;
+
+ /* more work ? */
+ while (!(rctx->flag & HASH_DONE)) {
+ nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+ if (nbytes < 0) {
+ err = nbytes;
+ goto out;
+ }
+ /* check if the walk is done */
+ if (crypto_ahash_walk_last(&rctx->walk)) {
+ rctx->flag |= HASH_DONE;
+ if (rctx->flag & HASH_FINAL)
+ flag |= HASH_LAST;
+
+ }
+ sha_ctx = (struct sha256_hash_ctx *)
+ ahash_request_ctx(&rctx->areq);
+ kernel_fpu_begin();
+ sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx,
+ rctx->walk.data, nbytes, flag);
+ if (!sha_ctx) {
+ if (flush)
+ sha_ctx = sha256_ctx_mgr_flush(cstate->mgr);
+ }
+ kernel_fpu_end();
+ if (sha_ctx)
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ else {
+ rctx = NULL;
+ goto out;
+ }
+ }
+
+ /* copy the results */
+ if (rctx->flag & HASH_FINAL)
+ sha256_mb_set_results(rctx);
+
+out:
+ *ret_rctx = rctx;
+ return err;
+}
+
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+ struct mcryptd_alg_cstate *cstate,
+ int err)
+{
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha256_hash_ctx *sha_ctx;
+ struct mcryptd_hash_request_ctx *req_ctx;
+ int ret;
+
+ /* remove from work list */
+ spin_lock(&cstate->work_lock);
+ list_del(&rctx->waiter);
+ spin_unlock(&cstate->work_lock);
+
+ if (irqs_disabled())
+ rctx->complete(&req->base, err);
+ else {
+ local_bh_disable();
+ rctx->complete(&req->base, err);
+ local_bh_enable();
+ }
+
+ /* check to see if there are other jobs that are done */
+ sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr);
+ while (sha_ctx) {
+ req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&req_ctx, cstate, false);
+ if (req_ctx) {
+ spin_lock(&cstate->work_lock);
+ list_del(&req_ctx->waiter);
+ spin_unlock(&cstate->work_lock);
+
+ req = cast_mcryptd_ctx_to_req(req_ctx);
+ if (irqs_disabled())
+ req_ctx->complete(&req->base, ret);
+ else {
+ local_bh_disable();
+ req_ctx->complete(&req->base, ret);
+ local_bh_enable();
+ }
+ }
+ sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr);
+ }
+
+ return 0;
+}
+
+static void sha256_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+ struct mcryptd_alg_cstate *cstate)
+{
+ unsigned long next_flush;
+ unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+
+ /* initialize tag */
+ rctx->tag.arrival = jiffies; /* tag the arrival time */
+ rctx->tag.seq_num = cstate->next_seq_num++;
+ next_flush = rctx->tag.arrival + delay;
+ rctx->tag.expire = next_flush;
+
+ spin_lock(&cstate->work_lock);
+ list_add_tail(&rctx->waiter, &cstate->work_list);
+ spin_unlock(&cstate->work_lock);
+
+ mcryptd_arm_flusher(cstate, delay);
+}
+
+static int sha256_mb_update(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx, areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha256_hash_ctx *sha_ctx;
+ int ret = 0, nbytes;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+ if (nbytes < 0) {
+ ret = nbytes;
+ goto done;
+ }
+
+ if (crypto_ahash_walk_last(&rctx->walk))
+ rctx->flag |= HASH_DONE;
+
+ /* submit */
+ sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+ sha256_mb_add_list(rctx, cstate);
+ kernel_fpu_begin();
+ sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+ nbytes, HASH_UPDATE);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha256_mb_finup(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx, areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha256_hash_ctx *sha_ctx;
+ int ret = 0, flag = HASH_UPDATE, nbytes;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+ if (nbytes < 0) {
+ ret = nbytes;
+ goto done;
+ }
+
+ if (crypto_ahash_walk_last(&rctx->walk)) {
+ rctx->flag |= HASH_DONE;
+ flag = HASH_LAST;
+ }
+
+ /* submit */
+ rctx->flag |= HASH_FINAL;
+ sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+ sha256_mb_add_list(rctx, cstate);
+
+ kernel_fpu_begin();
+ sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+ nbytes, flag);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha256_mb_final(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx,
+ areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+ struct sha256_hash_ctx *sha_ctx;
+ int ret = 0;
+ u8 data;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ rctx->flag |= HASH_DONE | HASH_FINAL;
+
+ sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+ /* flag HASH_FINAL and 0 data size */
+ sha256_mb_add_list(rctx, cstate);
+ kernel_fpu_begin();
+ sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
+ HASH_LAST);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha256_mb_export(struct ahash_request *areq, void *out)
+{
+ struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha256_mb_import(struct ahash_request *areq, const void *in)
+{
+ struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha256_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+ struct mcryptd_ahash *mcryptd_tfm;
+ struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct mcryptd_hash_ctx *mctx;
+
+ mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha256-mb",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(mcryptd_tfm))
+ return PTR_ERR(mcryptd_tfm);
+ mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
+ mctx->alg_state = &sha256_mb_alg_state;
+ ctx->mcryptd_tfm = mcryptd_tfm;
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(&mcryptd_tfm->base));
+
+ return 0;
+}
+
+static void sha256_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static int sha256_mb_areq_init_tfm(struct crypto_tfm *tfm)
+{
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ sizeof(struct sha256_hash_ctx));
+
+ return 0;
+}
+
+static void sha256_mb_areq_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static struct ahash_alg sha256_mb_areq_alg = {
+ .init = sha256_mb_init,
+ .update = sha256_mb_update,
+ .final = sha256_mb_final,
+ .finup = sha256_mb_finup,
+ .export = sha256_mb_export,
+ .import = sha256_mb_import,
+ .halg = {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .statesize = sizeof(struct sha256_hash_ctx),
+ .base = {
+ .cra_name = "__sha256-mb",
+ .cra_driver_name = "__intel_sha256-mb",
+ .cra_priority = 100,
+ /*
+ * use ASYNC flag as some buffers in multi-buffer
+ * algo may not have completed before hashing thread
+ * sleep
+ */
+ .cra_flags = CRYPTO_ALG_ASYNC |
+ CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT
+ (sha256_mb_areq_alg.halg.base.cra_list),
+ .cra_init = sha256_mb_areq_init_tfm,
+ .cra_exit = sha256_mb_areq_exit_tfm,
+ .cra_ctxsize = sizeof(struct sha256_hash_ctx),
+ }
+ }
+};
+
+static int sha256_mb_async_init(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_init(mcryptd_req);
+}
+
+static int sha256_mb_async_update(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_update(mcryptd_req);
+}
+
+static int sha256_mb_async_finup(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_finup(mcryptd_req);
+}
+
+static int sha256_mb_async_final(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_final(mcryptd_req);
+}
+
+static int sha256_mb_async_digest(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_digest(mcryptd_req);
+}
+
+static int sha256_mb_async_export(struct ahash_request *req, void *out)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_export(mcryptd_req, out);
+}
+
+static int sha256_mb_async_import(struct ahash_request *req, const void *in)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+ struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
+ struct mcryptd_hash_request_ctx *rctx;
+ struct ahash_request *areq;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ rctx = ahash_request_ctx(mcryptd_req);
+ areq = &rctx->areq;
+
+ ahash_request_set_tfm(areq, child);
+ ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
+ rctx->complete, req);
+
+ return crypto_ahash_import(mcryptd_req, in);
+}
+
+static struct ahash_alg sha256_mb_async_alg = {
+ .init = sha256_mb_async_init,
+ .update = sha256_mb_async_update,
+ .final = sha256_mb_async_final,
+ .finup = sha256_mb_async_finup,
+ .export = sha256_mb_async_export,
+ .import = sha256_mb_async_import,
+ .digest = sha256_mb_async_digest,
+ .halg = {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .statesize = sizeof(struct sha256_hash_ctx),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256_mb",
+ /*
+ * Low priority, since with few concurrent hash requests
+ * this is extremely slow due to the flush delay. Users
+ * whose workloads would benefit from this can request
+ * it explicitly by driver name, or can increase its
+ * priority at runtime using NETLINK_CRYPTO.
+ */
+ .cra_priority = 50,
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT
+ (sha256_mb_async_alg.halg.base.cra_list),
+ .cra_init = sha256_mb_async_init_tfm,
+ .cra_exit = sha256_mb_async_exit_tfm,
+ .cra_ctxsize = sizeof(struct sha256_mb_ctx),
+ .cra_alignmask = 0,
+ },
+ },
+};
+
+static unsigned long sha256_mb_flusher(struct mcryptd_alg_cstate *cstate)
+{
+ struct mcryptd_hash_request_ctx *rctx;
+ unsigned long cur_time;
+ unsigned long next_flush = 0;
+ struct sha256_hash_ctx *sha_ctx;
+
+
+ cur_time = jiffies;
+
+ while (!list_empty(&cstate->work_list)) {
+ rctx = list_entry(cstate->work_list.next,
+ struct mcryptd_hash_request_ctx, waiter);
+ if (time_before(cur_time, rctx->tag.expire))
+ break;
+ kernel_fpu_begin();
+ sha_ctx = (struct sha256_hash_ctx *)
+ sha256_ctx_mgr_flush(cstate->mgr);
+ kernel_fpu_end();
+ if (!sha_ctx) {
+ pr_err("sha256_mb error: nothing got"
+ " flushed for non-empty list\n");
+ break;
+ }
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ sha_finish_walk(&rctx, cstate, true);
+ sha_complete_job(rctx, cstate, 0);
+ }
+
+ if (!list_empty(&cstate->work_list)) {
+ rctx = list_entry(cstate->work_list.next,
+ struct mcryptd_hash_request_ctx, waiter);
+ /* get the hash context and then flush time */
+ next_flush = rctx->tag.expire;
+ mcryptd_arm_flusher(cstate, get_delay(next_flush));
+ }
+ return next_flush;
+}
+
+static int __init sha256_mb_mod_init(void)
+{
+
+ int cpu;
+ int err;
+ struct mcryptd_alg_cstate *cpu_state;
+
+ /* check for dependent cpu features */
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_BMI2))
+ return -ENODEV;
+
+ /* initialize multibuffer structures */
+ sha256_mb_alg_state.alg_cstate = alloc_percpu
+ (struct mcryptd_alg_cstate);
+
+ sha256_job_mgr_init = sha256_mb_mgr_init_avx2;
+ sha256_job_mgr_submit = sha256_mb_mgr_submit_avx2;
+ sha256_job_mgr_flush = sha256_mb_mgr_flush_avx2;
+ sha256_job_mgr_get_comp_job = sha256_mb_mgr_get_comp_job_avx2;
+
+ if (!sha256_mb_alg_state.alg_cstate)
+ return -ENOMEM;
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+ cpu_state->next_flush = 0;
+ cpu_state->next_seq_num = 0;
+ cpu_state->flusher_engaged = false;
+ INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
+ cpu_state->cpu = cpu;
+ cpu_state->alg_state = &sha256_mb_alg_state;
+ cpu_state->mgr = kzalloc(sizeof(struct sha256_ctx_mgr),
+ GFP_KERNEL);
+ if (!cpu_state->mgr)
+ goto err2;
+ sha256_ctx_mgr_init(cpu_state->mgr);
+ INIT_LIST_HEAD(&cpu_state->work_list);
+ spin_lock_init(&cpu_state->work_lock);
+ }
+ sha256_mb_alg_state.flusher = &sha256_mb_flusher;
+
+ err = crypto_register_ahash(&sha256_mb_areq_alg);
+ if (err)
+ goto err2;
+ err = crypto_register_ahash(&sha256_mb_async_alg);
+ if (err)
+ goto err1;
+
+
+ return 0;
+err1:
+ crypto_unregister_ahash(&sha256_mb_areq_alg);
+err2:
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+ kfree(cpu_state->mgr);
+ }
+ free_percpu(sha256_mb_alg_state.alg_cstate);
+ return -ENODEV;
+}
+
+static void __exit sha256_mb_mod_fini(void)
+{
+ int cpu;
+ struct mcryptd_alg_cstate *cpu_state;
+
+ crypto_unregister_ahash(&sha256_mb_async_alg);
+ crypto_unregister_ahash(&sha256_mb_areq_alg);
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+ kfree(cpu_state->mgr);
+ }
+ free_percpu(sha256_mb_alg_state.alg_cstate);
+}
+
+module_init(sha256_mb_mod_init);
+module_exit(sha256_mb_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, multi buffer accelerated");
+
+MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
new file mode 100644
index 000000000..7c432543d
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
@@ -0,0 +1,134 @@
+/*
+ * Header file for multi buffer SHA256 context
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SHA_MB_CTX_INTERNAL_H
+#define _SHA_MB_CTX_INTERNAL_H
+
+#include "sha256_mb_mgr.h"
+
+#define HASH_UPDATE 0x00
+#define HASH_LAST 0x01
+#define HASH_DONE 0x02
+#define HASH_FINAL 0x04
+
+#define HASH_CTX_STS_IDLE 0x00
+#define HASH_CTX_STS_PROCESSING 0x01
+#define HASH_CTX_STS_LAST 0x02
+#define HASH_CTX_STS_COMPLETE 0x04
+
+enum hash_ctx_error {
+ HASH_CTX_ERROR_NONE = 0,
+ HASH_CTX_ERROR_INVALID_FLAGS = -1,
+ HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
+ HASH_CTX_ERROR_ALREADY_COMPLETED = -3,
+
+#ifdef HASH_CTX_DEBUG
+ HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
+#endif
+};
+
+
+#define hash_ctx_user_data(ctx) ((ctx)->user_data)
+#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
+#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
+#define hash_ctx_status(ctx) ((ctx)->status)
+#define hash_ctx_error(ctx) ((ctx)->error)
+#define hash_ctx_init(ctx) \
+ do { \
+ (ctx)->error = HASH_CTX_ERROR_NONE; \
+ (ctx)->status = HASH_CTX_STS_COMPLETE; \
+ } while (0)
+
+
+/* Hash Constants and Typedefs */
+#define SHA256_DIGEST_LENGTH 8
+#define SHA256_LOG2_BLOCK_SIZE 6
+
+#define SHA256_PADLENGTHFIELD_SIZE 8
+
+#ifdef SHA_MB_DEBUG
+#define assert(expr) \
+do { \
+ if (unlikely(!(expr))) { \
+ printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+ #expr, __FILE__, __func__, __LINE__); \
+ } \
+} while (0)
+#else
+#define assert(expr) do {} while (0)
+#endif
+
+struct sha256_ctx_mgr {
+ struct sha256_mb_mgr mgr;
+};
+
+/* typedef struct sha256_ctx_mgr sha256_ctx_mgr; */
+
+struct sha256_hash_ctx {
+ /* Must be at struct offset 0 */
+ struct job_sha256 job;
+ /* status flag */
+ int status;
+ /* error flag */
+ int error;
+
+ uint64_t total_length;
+ const void *incoming_buffer;
+ uint32_t incoming_buffer_length;
+ uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2];
+ uint32_t partial_block_buffer_length;
+ void *user_data;
+};
+
+#endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h
new file mode 100644
index 000000000..b01ae408c
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h
@@ -0,0 +1,108 @@
+/*
+ * Header file for multi buffer SHA256 algorithm manager
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+#include <linux/types.h>
+
+#define NUM_SHA256_DIGEST_WORDS 8
+
+enum job_sts { STS_UNKNOWN = 0,
+ STS_BEING_PROCESSED = 1,
+ STS_COMPLETED = 2,
+ STS_INTERNAL_ERROR = 3,
+ STS_ERROR = 4
+};
+
+struct job_sha256 {
+ u8 *buffer;
+ u32 len;
+ u32 result_digest[NUM_SHA256_DIGEST_WORDS] __aligned(32);
+ enum job_sts status;
+ void *user_data;
+};
+
+/* SHA256 out-of-order scheduler */
+
+/* typedef uint32_t sha8_digest_array[8][8]; */
+
+struct sha256_args_x8 {
+ uint32_t digest[8][8];
+ uint8_t *data_ptr[8];
+};
+
+struct sha256_lane_data {
+ struct job_sha256 *job_in_lane;
+};
+
+struct sha256_mb_mgr {
+ struct sha256_args_x8 args;
+
+ uint32_t lens[8];
+
+ /* each byte is index (0...7) of unused lanes */
+ uint64_t unused_lanes;
+ /* byte 4 is set to FF as a flag */
+ struct sha256_lane_data ldata[8];
+};
+
+
+#define SHA256_MB_MGR_NUM_LANES_AVX2 8
+
+void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state);
+struct job_sha256 *sha256_mb_mgr_submit_avx2(struct sha256_mb_mgr *state,
+ struct job_sha256 *job);
+struct job_sha256 *sha256_mb_mgr_flush_avx2(struct sha256_mb_mgr *state);
+struct job_sha256 *sha256_mb_mgr_get_comp_job_avx2(struct sha256_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S
new file mode 100644
index 000000000..5c377bac2
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S
@@ -0,0 +1,304 @@
+/*
+ * Header file for multi buffer SHA256 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# Macros for defining data structures
+
+# Usage example
+
+#START_FIELDS # JOB_AES
+### name size align
+#FIELD _plaintext, 8, 8 # pointer to plaintext
+#FIELD _ciphertext, 8, 8 # pointer to ciphertext
+#FIELD _IV, 16, 8 # IV
+#FIELD _keys, 8, 8 # pointer to keys
+#FIELD _len, 4, 4 # length in bytes
+#FIELD _status, 4, 4 # status enumeration
+#FIELD _user_data, 8, 8 # pointer to user data
+#UNION _union, size1, align1, \
+# size2, align2, \
+# size3, align3, \
+# ...
+#END_FIELDS
+#%assign _JOB_AES_size _FIELD_OFFSET
+#%assign _JOB_AES_align _STRUCT_ALIGN
+
+#########################################################################
+
+# Alternate "struc-like" syntax:
+# STRUCT job_aes2
+# RES_Q .plaintext, 1
+# RES_Q .ciphertext, 1
+# RES_DQ .IV, 1
+# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
+# RES_U .union, size1, align1, \
+# size2, align2, \
+# ...
+# ENDSTRUCT
+# # Following only needed if nesting
+# %assign job_aes2_size _FIELD_OFFSET
+# %assign job_aes2_align _STRUCT_ALIGN
+#
+# RES_* macros take a name, a count and an optional alignment.
+# The count in in terms of the base size of the macro, and the
+# default alignment is the base size.
+# The macros are:
+# Macro Base size
+# RES_B 1
+# RES_W 2
+# RES_D 4
+# RES_Q 8
+# RES_DQ 16
+# RES_Y 32
+# RES_Z 64
+#
+# RES_U defines a union. It's arguments are a name and two or more
+# pairs of "size, alignment"
+#
+# The two assigns are only needed if this structure is being nested
+# within another. Even if the assigns are not done, one can still use
+# STRUCT_NAME_size as the size of the structure.
+#
+# Note that for nesting, you still need to assign to STRUCT_NAME_size.
+#
+# The differences between this and using "struc" directly are that each
+# type is implicitly aligned to its natural length (although this can be
+# over-ridden with an explicit third parameter), and that the structure
+# is padded at the end to its overall alignment.
+#
+
+#########################################################################
+
+#ifndef _DATASTRUCT_ASM_
+#define _DATASTRUCT_ASM_
+
+#define SZ8 8*SHA256_DIGEST_WORD_SIZE
+#define ROUNDS 64*SZ8
+#define PTR_SZ 8
+#define SHA256_DIGEST_WORD_SIZE 4
+#define MAX_SHA256_LANES 8
+#define SHA256_DIGEST_WORDS 8
+#define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
+#define SHA256_DIGEST_SIZE (SHA256_DIGEST_ROW_SIZE * SHA256_DIGEST_WORDS)
+#define SHA256_BLK_SZ 64
+
+# START_FIELDS
+.macro START_FIELDS
+ _FIELD_OFFSET = 0
+ _STRUCT_ALIGN = 0
+.endm
+
+# FIELD name size align
+.macro FIELD name size align
+ _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
+ \name = _FIELD_OFFSET
+ _FIELD_OFFSET = _FIELD_OFFSET + (\size)
+.if (\align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = \align
+.endif
+.endm
+
+# END_FIELDS
+.macro END_FIELDS
+ _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+.endm
+
+########################################################################
+
+.macro STRUCT p1
+START_FIELDS
+.struc \p1
+.endm
+
+.macro ENDSTRUCT
+ tmp = _FIELD_OFFSET
+ END_FIELDS
+ tmp = (_FIELD_OFFSET - %%tmp)
+.if (tmp > 0)
+ .lcomm tmp
+.endif
+.endstruc
+.endm
+
+## RES_int name size align
+.macro RES_int p1 p2 p3
+ name = \p1
+ size = \p2
+ align = .\p3
+
+ _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
+.align align
+.lcomm name size
+ _FIELD_OFFSET = _FIELD_OFFSET + (size)
+.if (align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = align
+.endif
+.endm
+
+# macro RES_B name, size [, align]
+.macro RES_B _name, _size, _align=1
+RES_int _name _size _align
+.endm
+
+# macro RES_W name, size [, align]
+.macro RES_W _name, _size, _align=2
+RES_int _name 2*(_size) _align
+.endm
+
+# macro RES_D name, size [, align]
+.macro RES_D _name, _size, _align=4
+RES_int _name 4*(_size) _align
+.endm
+
+# macro RES_Q name, size [, align]
+.macro RES_Q _name, _size, _align=8
+RES_int _name 8*(_size) _align
+.endm
+
+# macro RES_DQ name, size [, align]
+.macro RES_DQ _name, _size, _align=16
+RES_int _name 16*(_size) _align
+.endm
+
+# macro RES_Y name, size [, align]
+.macro RES_Y _name, _size, _align=32
+RES_int _name 32*(_size) _align
+.endm
+
+# macro RES_Z name, size [, align]
+.macro RES_Z _name, _size, _align=64
+RES_int _name 64*(_size) _align
+.endm
+
+#endif
+
+
+########################################################################
+#### Define SHA256 Out Of Order Data Structures
+########################################################################
+
+START_FIELDS # LANE_DATA
+### name size align
+FIELD _job_in_lane, 8, 8 # pointer to job object
+END_FIELDS
+
+ _LANE_DATA_size = _FIELD_OFFSET
+ _LANE_DATA_align = _STRUCT_ALIGN
+
+########################################################################
+
+START_FIELDS # SHA256_ARGS_X4
+### name size align
+FIELD _digest, 4*8*8, 4 # transposed digest
+FIELD _data_ptr, 8*8, 8 # array of pointers to data
+END_FIELDS
+
+ _SHA256_ARGS_X4_size = _FIELD_OFFSET
+ _SHA256_ARGS_X4_align = _STRUCT_ALIGN
+ _SHA256_ARGS_X8_size = _FIELD_OFFSET
+ _SHA256_ARGS_X8_align = _STRUCT_ALIGN
+
+#######################################################################
+
+START_FIELDS # MB_MGR
+### name size align
+FIELD _args, _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align
+FIELD _lens, 4*8, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align
+END_FIELDS
+
+ _MB_MGR_size = _FIELD_OFFSET
+ _MB_MGR_align = _STRUCT_ALIGN
+
+_args_digest = _args + _digest
+_args_data_ptr = _args + _data_ptr
+
+#######################################################################
+
+START_FIELDS #STACK_FRAME
+### name size align
+FIELD _data, 16*SZ8, 1 # transposed digest
+FIELD _digest, 8*SZ8, 1 # array of pointers to data
+FIELD _ytmp, 4*SZ8, 1
+FIELD _rsp, 8, 1
+END_FIELDS
+
+ _STACK_FRAME_size = _FIELD_OFFSET
+ _STACK_FRAME_align = _STRUCT_ALIGN
+
+#######################################################################
+
+########################################################################
+#### Define constants
+########################################################################
+
+#define STS_UNKNOWN 0
+#define STS_BEING_PROCESSED 1
+#define STS_COMPLETED 2
+
+########################################################################
+#### Define JOB_SHA256 structure
+########################################################################
+
+START_FIELDS # JOB_SHA256
+
+### name size align
+FIELD _buffer, 8, 8 # pointer to buffer
+FIELD _len, 8, 8 # length in bytes
+FIELD _result_digest, 8*4, 32 # Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+ _JOB_SHA256_size = _FIELD_OFFSET
+ _JOB_SHA256_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000..d2364c55b
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
@@ -0,0 +1,307 @@
+/*
+ * Flush routine for SHA256 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+.extern sha256_x8_avx2
+
+#LINUX register definitions
+#define arg1 %rdi
+#define arg2 %rsi
+
+# Common register definitions
+#define state arg1
+#define job arg2
+#define len2 arg2
+
+# idx must be a register not clobberred by sha1_mult
+#define idx %r8
+#define DWORD_idx %r8d
+
+#define unused_lanes %rbx
+#define lane_data %rbx
+#define tmp2 %rbx
+#define tmp2_w %ebx
+
+#define job_rax %rax
+#define tmp1 %rax
+#define size_offset %rax
+#define tmp %rax
+#define start_offset %rax
+
+#define tmp3 %arg1
+
+#define extra_blocks %arg2
+#define p %arg2
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JNE_SKIP i
+jne skip_\i
+.endm
+
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB_SHA256* sha256_mb_mgr_flush_avx2(MB_MGR *state)
+# arg 1 : rcx : state
+ENTRY(sha256_mb_mgr_flush_avx2)
+ FRAME_BEGIN
+ push %rbx
+
+ # If bit (32+3) is set, then all lanes are empty
+ mov _unused_lanes(state), unused_lanes
+ bt $32+3, unused_lanes
+ jc return_null
+
+ # find a lane with a non-null job
+ xor idx, idx
+ offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne one(%rip), idx
+ offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne two(%rip), idx
+ offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne three(%rip), idx
+ offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne four(%rip), idx
+ offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne five(%rip), idx
+ offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne six(%rip), idx
+ offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne seven(%rip), idx
+
+ # copy idx to empty lanes
+copy_lane_data:
+ offset = (_args + _data_ptr)
+ mov offset(state,idx,8), tmp
+
+ I = 0
+.rep 8
+ offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+.altmacro
+ JNE_SKIP %I
+ offset = (_args + _data_ptr + 8*I)
+ mov tmp, offset(state)
+ offset = (_lens + 4*I)
+ movl $0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+ I = (I+1)
+.noaltmacro
+.endr
+
+ # Find min length
+ vmovdqu _lens+0*16(state), %xmm0
+ vmovdqu _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword
+
+ vmovd %xmm2, DWORD_idx
+ mov idx, len2
+ and $0xF, idx
+ shr $4, len2
+ jz len_is_0
+
+ vpand clear_low_nibble(%rip), %xmm2, %xmm2
+ vpshufd $0, %xmm2, %xmm2
+
+ vpsubd %xmm2, %xmm0, %xmm0
+ vpsubd %xmm2, %xmm1, %xmm1
+
+ vmovdqu %xmm0, _lens+0*16(state)
+ vmovdqu %xmm1, _lens+1*16(state)
+
+ # "state" and "args" are the same address, arg1
+ # len is arg2
+ call sha256_x8_avx2
+ # state and idx are intact
+
+len_is_0:
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ mov _unused_lanes(state), unused_lanes
+ shl $4, unused_lanes
+ or idx, unused_lanes
+
+ mov unused_lanes, _unused_lanes(state)
+ movl $0xFFFFFFFF, _lens(state,idx,4)
+
+ vmovd _args_digest(state , idx, 4) , %xmm0
+ vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+ vmovd _args_digest+4*32(state, idx, 4), %xmm1
+ vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
+ vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
+ vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ offset = (_result_digest + 1*16)
+ vmovdqu %xmm1, offset(job_rax)
+
+return:
+ pop %rbx
+ FRAME_END
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+ENDPROC(sha256_mb_mgr_flush_avx2)
+
+##############################################################################
+
+.align 16
+ENTRY(sha256_mb_mgr_get_comp_job_avx2)
+ push %rbx
+
+ ## if bit 32+3 is set, then all lanes are empty
+ mov _unused_lanes(state), unused_lanes
+ bt $(32+3), unused_lanes
+ jc .return_null
+
+ # Find min length
+ vmovdqu _lens(state), %xmm0
+ vmovdqu _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword
+
+ vmovd %xmm2, DWORD_idx
+ test $~0xF, idx
+ jnz .return_null
+
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ mov _unused_lanes(state), unused_lanes
+ shl $4, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state, idx, 4)
+
+ vmovd _args_digest(state, idx, 4), %xmm0
+ vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+ vmovd _args_digest+4*32(state, idx, 4), %xmm1
+ vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
+ vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
+ vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ offset = (_result_digest + 1*16)
+ vmovdqu %xmm1, offset(job_rax)
+
+ pop %rbx
+
+ ret
+
+.return_null:
+ xor job_rax, job_rax
+ pop %rbx
+ ret
+ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
+
+.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
+.align 16
+clear_low_nibble:
+.octa 0x000000000000000000000000FFFFFFF0
+
+.section .rodata.cst8, "aM", @progbits, 8
+.align 8
+one:
+.quad 1
+two:
+.quad 2
+three:
+.quad 3
+four:
+.quad 4
+five:
+.quad 5
+six:
+.quad 6
+seven:
+.quad 7
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..b0c498371
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
@@ -0,0 +1,65 @@
+/*
+ * Initialization code for multi buffer SHA256 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "sha256_mb_mgr.h"
+
+void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state)
+{
+ unsigned int j;
+
+ state->unused_lanes = 0xF76543210ULL;
+ for (j = 0; j < 8; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = NULL;
+ }
+}
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000..b36ae7454
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
@@ -0,0 +1,214 @@
+/*
+ * Buffer submit code for multi buffer SHA256 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+.extern sha256_x8_avx2
+
+# LINUX register definitions
+arg1 = %rdi
+arg2 = %rsi
+size_offset = %rcx
+tmp2 = %rcx
+extra_blocks = %rdx
+
+# Common definitions
+#define state arg1
+#define job %rsi
+#define len2 arg2
+#define p2 arg2
+
+# idx must be a register not clobberred by sha1_x8_avx2
+idx = %r8
+DWORD_idx = %r8d
+last_len = %r8
+
+p = %r11
+start_offset = %r11
+
+unused_lanes = %rbx
+BYTE_unused_lanes = %bl
+
+job_rax = %rax
+len = %rax
+DWORD_len = %eax
+
+lane = %r12
+tmp3 = %r12
+
+tmp = %r9
+DWORD_tmp = %r9d
+
+lane_data = %r10
+
+# JOB* sha256_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA256 *job)
+# arg 1 : rcx : state
+# arg 2 : rdx : job
+ENTRY(sha256_mb_mgr_submit_avx2)
+ FRAME_BEGIN
+ push %rbx
+ push %r12
+
+ mov _unused_lanes(state), unused_lanes
+ mov unused_lanes, lane
+ and $0xF, lane
+ shr $4, unused_lanes
+ imul $_LANE_DATA_size, lane, lane_data
+ movl $STS_BEING_PROCESSED, _status(job)
+ lea _ldata(state, lane_data), lane_data
+ mov unused_lanes, _unused_lanes(state)
+ movl _len(job), DWORD_len
+
+ mov job, _job_in_lane(lane_data)
+ shl $4, len
+ or lane, len
+
+ movl DWORD_len, _lens(state , lane, 4)
+
+ # Load digest words from result_digest
+ vmovdqu _result_digest(job), %xmm0
+ vmovdqu _result_digest+1*16(job), %xmm1
+ vmovd %xmm0, _args_digest(state, lane, 4)
+ vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4)
+ vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4)
+ vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4)
+ vmovd %xmm1, _args_digest+4*32(state , lane, 4)
+
+ vpextrd $1, %xmm1, _args_digest+5*32(state , lane, 4)
+ vpextrd $2, %xmm1, _args_digest+6*32(state , lane, 4)
+ vpextrd $3, %xmm1, _args_digest+7*32(state , lane, 4)
+
+ mov _buffer(job), p
+ mov p, _args_data_ptr(state, lane, 8)
+
+ cmp $0xF, unused_lanes
+ jne return_null
+
+start_loop:
+ # Find min length
+ vmovdqa _lens(state), %xmm0
+ vmovdqa _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword
+
+ vmovd %xmm2, DWORD_idx
+ mov idx, len2
+ and $0xF, idx
+ shr $4, len2
+ jz len_is_0
+
+ vpand clear_low_nibble(%rip), %xmm2, %xmm2
+ vpshufd $0, %xmm2, %xmm2
+
+ vpsubd %xmm2, %xmm0, %xmm0
+ vpsubd %xmm2, %xmm1, %xmm1
+
+ vmovdqa %xmm0, _lens + 0*16(state)
+ vmovdqa %xmm1, _lens + 1*16(state)
+
+ # "state" and "args" are the same address, arg1
+ # len is arg2
+ call sha256_x8_avx2
+
+ # state and idx are intact
+
+len_is_0:
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ mov _unused_lanes(state), unused_lanes
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ shl $4, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state,idx,4)
+
+ vmovd _args_digest(state, idx, 4), %xmm0
+ vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
+ vmovd _args_digest+4*32(state, idx, 4), %xmm1
+
+ vpinsrd $1, _args_digest+5*32(state , idx, 4), %xmm1, %xmm1
+ vpinsrd $2, _args_digest+6*32(state , idx, 4), %xmm1, %xmm1
+ vpinsrd $3, _args_digest+7*32(state , idx, 4), %xmm1, %xmm1
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ vmovdqu %xmm1, _result_digest+1*16(job_rax)
+
+return:
+ pop %r12
+ pop %rbx
+ FRAME_END
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ENDPROC(sha256_mb_mgr_submit_avx2)
+
+.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
+.align 16
+clear_low_nibble:
+ .octa 0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
new file mode 100644
index 000000000..1687c80c5
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
@@ -0,0 +1,598 @@
+/*
+ * Multi-buffer SHA256 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+## code to compute oct SHA256 using SSE-256
+## outer calling routine takes care of save and restore of XMM registers
+## Logic designed/laid out by JDG
+
+## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; %ymm0-15
+## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+## Linux preserves: rdi rbp r8
+##
+## clobbers %ymm0-15
+
+arg1 = %rdi
+arg2 = %rsi
+reg3 = %rcx
+reg4 = %rdx
+
+# Common definitions
+STATE = arg1
+INP_SIZE = arg2
+
+IDX = %rax
+ROUND = %rbx
+TBL = reg3
+
+inp0 = %r9
+inp1 = %r10
+inp2 = %r11
+inp3 = %r12
+inp4 = %r13
+inp5 = %r14
+inp6 = %r15
+inp7 = reg4
+
+a = %ymm0
+b = %ymm1
+c = %ymm2
+d = %ymm3
+e = %ymm4
+f = %ymm5
+g = %ymm6
+h = %ymm7
+
+T1 = %ymm8
+
+a0 = %ymm12
+a1 = %ymm13
+a2 = %ymm14
+TMP = %ymm15
+TMP0 = %ymm6
+TMP1 = %ymm7
+
+TT0 = %ymm8
+TT1 = %ymm9
+TT2 = %ymm10
+TT3 = %ymm11
+TT4 = %ymm12
+TT5 = %ymm13
+TT6 = %ymm14
+TT7 = %ymm15
+
+# Define stack usage
+
+# Assume stack aligned to 32 bytes before call
+# Therefore FRAMESZ mod 32 must be 32-8 = 24
+
+#define FRAMESZ 0x388
+
+#define VMOVPS vmovups
+
+# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+# "transpose" data in {r0...r7} using temps {t0...t1}
+# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+#
+# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+#
+
+.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
+ # process top half (r0..r3) {a...d}
+ vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ # use r2 in place of t0
+ # process bottom half (r4..r7) {e...h}
+ vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
+ vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
+ vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
+ vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
+ vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
+ vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
+ vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
+ vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0
+
+.endm
+
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro _PRORD reg imm tmp
+ vpslld $(32-\imm),\reg,\tmp
+ vpsrld $\imm,\reg, \reg
+ vpor \tmp,\reg, \reg
+.endm
+
+# PRORD_nd reg, imm, tmp, src
+.macro _PRORD_nd reg imm tmp src
+ vpslld $(32-\imm), \src, \tmp
+ vpsrld $\imm, \src, \reg
+ vpor \tmp, \reg, \reg
+.endm
+
+# PRORD dst/src, amt
+.macro PRORD reg imm
+ _PRORD \reg,\imm,TMP
+.endm
+
+# PRORD_nd dst, src, amt
+.macro PRORD_nd reg tmp imm
+ _PRORD_nd \reg, \imm, TMP, \tmp
+.endm
+
+# arguments passed implicitly in preprocessor symbols i, a...h
+.macro ROUND_00_15 _T1 i
+ PRORD_nd a0,e,5 # sig1: a0 = (e >> 5)
+
+ vpxor g, f, a2 # ch: a2 = f^g
+ vpand e,a2, a2 # ch: a2 = (f^g)&e
+ vpxor g, a2, a2 # a2 = ch
+
+ PRORD_nd a1,e,25 # sig1: a1 = (e >> 25)
+
+ vmovdqu \_T1,(SZ8*(\i & 0xf))(%rsp)
+ vpaddd (TBL,ROUND,1), \_T1, \_T1 # T1 = W + K
+ vpxor e,a0, a0 # sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 # sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd a2, h, h # h = h + ch
+ PRORD_nd a2,a,11 # sig0: a2 = (a >> 11)
+ vpaddd \_T1,h, h # h = h + ch + W + K
+ vpxor a1, a0, a0 # a0 = sigma1
+ PRORD_nd a1,a,22 # sig0: a1 = (a >> 22)
+ vpxor c, a, \_T1 # maj: T1 = a^c
+ add $SZ8, ROUND # ROUND++
+ vpand b, \_T1, \_T1 # maj: T1 = (a^c)&b
+ vpaddd a0, h, h
+ vpaddd h, d, d
+ vpxor a, a2, a2 # sig0: a2 = a ^ (a >> 11)
+ PRORD a2,2 # sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a1, a2, a2 # a2 = sig0
+ vpand c, a, a1 # maj: a1 = a&c
+ vpor \_T1, a1, a1 # a1 = maj
+ vpaddd a1, h, h # h = h + ch + W + K + maj
+ vpaddd a2, h, h # h = h + ch + W + K + maj + sigma0
+ ROTATE_ARGS
+.endm
+
+# arguments passed implicitly in preprocessor symbols i, a...h
+.macro ROUND_16_XX _T1 i
+ vmovdqu (SZ8*((\i-15)&0xf))(%rsp), \_T1
+ vmovdqu (SZ8*((\i-2)&0xf))(%rsp), a1
+ vmovdqu \_T1, a0
+ PRORD \_T1,11
+ vmovdqu a1, a2
+ PRORD a1,2
+ vpxor a0, \_T1, \_T1
+ PRORD \_T1, 7
+ vpxor a2, a1, a1
+ PRORD a1, 17
+ vpsrld $3, a0, a0
+ vpxor a0, \_T1, \_T1
+ vpsrld $10, a2, a2
+ vpxor a2, a1, a1
+ vpaddd (SZ8*((\i-16)&0xf))(%rsp), \_T1, \_T1
+ vpaddd (SZ8*((\i-7)&0xf))(%rsp), a1, a1
+ vpaddd a1, \_T1, \_T1
+
+ ROUND_00_15 \_T1,\i
+.endm
+
+# SHA256_ARGS:
+# UINT128 digest[8]; // transposed digests
+# UINT8 *data_ptr[4];
+
+# void sha256_x8_avx2(SHA256_ARGS *args, UINT64 bytes);
+# arg 1 : STATE : pointer to array of pointers to input data
+# arg 2 : INP_SIZE : size of input in blocks
+ # general registers preserved in outer calling routine
+ # outer calling routine saves all the XMM registers
+ # save rsp, allocate 32-byte aligned for local variables
+ENTRY(sha256_x8_avx2)
+
+ # save callee-saved clobbered registers to comply with C function ABI
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, IDX
+ sub $FRAMESZ, %rsp
+ and $~0x1F, %rsp
+ mov IDX, _rsp(%rsp)
+
+ # Load the pre-transposed incoming digest.
+ vmovdqu 0*SHA256_DIGEST_ROW_SIZE(STATE),a
+ vmovdqu 1*SHA256_DIGEST_ROW_SIZE(STATE),b
+ vmovdqu 2*SHA256_DIGEST_ROW_SIZE(STATE),c
+ vmovdqu 3*SHA256_DIGEST_ROW_SIZE(STATE),d
+ vmovdqu 4*SHA256_DIGEST_ROW_SIZE(STATE),e
+ vmovdqu 5*SHA256_DIGEST_ROW_SIZE(STATE),f
+ vmovdqu 6*SHA256_DIGEST_ROW_SIZE(STATE),g
+ vmovdqu 7*SHA256_DIGEST_ROW_SIZE(STATE),h
+
+ lea K256_8(%rip),TBL
+
+ # load the address of each of the 4 message lanes
+ # getting ready to transpose input onto stack
+ mov _args_data_ptr+0*PTR_SZ(STATE),inp0
+ mov _args_data_ptr+1*PTR_SZ(STATE),inp1
+ mov _args_data_ptr+2*PTR_SZ(STATE),inp2
+ mov _args_data_ptr+3*PTR_SZ(STATE),inp3
+ mov _args_data_ptr+4*PTR_SZ(STATE),inp4
+ mov _args_data_ptr+5*PTR_SZ(STATE),inp5
+ mov _args_data_ptr+6*PTR_SZ(STATE),inp6
+ mov _args_data_ptr+7*PTR_SZ(STATE),inp7
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ # save old digest
+ vmovdqu a, _digest(%rsp)
+ vmovdqu b, _digest+1*SZ8(%rsp)
+ vmovdqu c, _digest+2*SZ8(%rsp)
+ vmovdqu d, _digest+3*SZ8(%rsp)
+ vmovdqu e, _digest+4*SZ8(%rsp)
+ vmovdqu f, _digest+5*SZ8(%rsp)
+ vmovdqu g, _digest+6*SZ8(%rsp)
+ vmovdqu h, _digest+7*SZ8(%rsp)
+ i = 0
+.rep 2
+ VMOVPS i*32(inp0, IDX), TT0
+ VMOVPS i*32(inp1, IDX), TT1
+ VMOVPS i*32(inp2, IDX), TT2
+ VMOVPS i*32(inp3, IDX), TT3
+ VMOVPS i*32(inp4, IDX), TT4
+ VMOVPS i*32(inp5, IDX), TT5
+ VMOVPS i*32(inp6, IDX), TT6
+ VMOVPS i*32(inp7, IDX), TT7
+ vmovdqu g, _ytmp(%rsp)
+ vmovdqu h, _ytmp+1*SZ8(%rsp)
+ TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
+ vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP1
+ vmovdqu _ytmp(%rsp), g
+ vpshufb TMP1, TT0, TT0
+ vpshufb TMP1, TT1, TT1
+ vpshufb TMP1, TT2, TT2
+ vpshufb TMP1, TT3, TT3
+ vpshufb TMP1, TT4, TT4
+ vpshufb TMP1, TT5, TT5
+ vpshufb TMP1, TT6, TT6
+ vpshufb TMP1, TT7, TT7
+ vmovdqu _ytmp+1*SZ8(%rsp), h
+ vmovdqu TT4, _ytmp(%rsp)
+ vmovdqu TT5, _ytmp+1*SZ8(%rsp)
+ vmovdqu TT6, _ytmp+2*SZ8(%rsp)
+ vmovdqu TT7, _ytmp+3*SZ8(%rsp)
+ ROUND_00_15 TT0,(i*8+0)
+ vmovdqu _ytmp(%rsp), TT0
+ ROUND_00_15 TT1,(i*8+1)
+ vmovdqu _ytmp+1*SZ8(%rsp), TT1
+ ROUND_00_15 TT2,(i*8+2)
+ vmovdqu _ytmp+2*SZ8(%rsp), TT2
+ ROUND_00_15 TT3,(i*8+3)
+ vmovdqu _ytmp+3*SZ8(%rsp), TT3
+ ROUND_00_15 TT0,(i*8+4)
+ ROUND_00_15 TT1,(i*8+5)
+ ROUND_00_15 TT2,(i*8+6)
+ ROUND_00_15 TT3,(i*8+7)
+ i = (i+1)
+.endr
+ add $64, IDX
+ i = (i*8)
+
+ jmp Lrounds_16_xx
+.align 16
+Lrounds_16_xx:
+.rep 16
+ ROUND_16_XX T1, i
+ i = (i+1)
+.endr
+
+ cmp $ROUNDS,ROUND
+ jb Lrounds_16_xx
+
+ # add old digest
+ vpaddd _digest+0*SZ8(%rsp), a, a
+ vpaddd _digest+1*SZ8(%rsp), b, b
+ vpaddd _digest+2*SZ8(%rsp), c, c
+ vpaddd _digest+3*SZ8(%rsp), d, d
+ vpaddd _digest+4*SZ8(%rsp), e, e
+ vpaddd _digest+5*SZ8(%rsp), f, f
+ vpaddd _digest+6*SZ8(%rsp), g, g
+ vpaddd _digest+7*SZ8(%rsp), h, h
+
+ sub $1, INP_SIZE # unit is blocks
+ jne lloop
+
+ # write back to memory (state object) the transposed digest
+ vmovdqu a, 0*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu b, 1*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu c, 2*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu d, 3*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu e, 4*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu f, 5*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu g, 6*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu h, 7*SHA256_DIGEST_ROW_SIZE(STATE)
+
+ # update input pointers
+ add IDX, inp0
+ mov inp0, _args_data_ptr+0*8(STATE)
+ add IDX, inp1
+ mov inp1, _args_data_ptr+1*8(STATE)
+ add IDX, inp2
+ mov inp2, _args_data_ptr+2*8(STATE)
+ add IDX, inp3
+ mov inp3, _args_data_ptr+3*8(STATE)
+ add IDX, inp4
+ mov inp4, _args_data_ptr+4*8(STATE)
+ add IDX, inp5
+ mov inp5, _args_data_ptr+5*8(STATE)
+ add IDX, inp6
+ mov inp6, _args_data_ptr+6*8(STATE)
+ add IDX, inp7
+ mov inp7, _args_data_ptr+7*8(STATE)
+
+ # Postamble
+ mov _rsp(%rsp), %rsp
+
+ # restore callee-saved clobbered registers
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+
+ ret
+ENDPROC(sha256_x8_avx2)
+
+.section .rodata.K256_8, "a", @progbits
+.align 64
+K256_8:
+ .octa 0x428a2f98428a2f98428a2f98428a2f98
+ .octa 0x428a2f98428a2f98428a2f98428a2f98
+ .octa 0x71374491713744917137449171374491
+ .octa 0x71374491713744917137449171374491
+ .octa 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
+ .octa 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
+ .octa 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
+ .octa 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
+ .octa 0x3956c25b3956c25b3956c25b3956c25b
+ .octa 0x3956c25b3956c25b3956c25b3956c25b
+ .octa 0x59f111f159f111f159f111f159f111f1
+ .octa 0x59f111f159f111f159f111f159f111f1
+ .octa 0x923f82a4923f82a4923f82a4923f82a4
+ .octa 0x923f82a4923f82a4923f82a4923f82a4
+ .octa 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
+ .octa 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
+ .octa 0xd807aa98d807aa98d807aa98d807aa98
+ .octa 0xd807aa98d807aa98d807aa98d807aa98
+ .octa 0x12835b0112835b0112835b0112835b01
+ .octa 0x12835b0112835b0112835b0112835b01
+ .octa 0x243185be243185be243185be243185be
+ .octa 0x243185be243185be243185be243185be
+ .octa 0x550c7dc3550c7dc3550c7dc3550c7dc3
+ .octa 0x550c7dc3550c7dc3550c7dc3550c7dc3
+ .octa 0x72be5d7472be5d7472be5d7472be5d74
+ .octa 0x72be5d7472be5d7472be5d7472be5d74
+ .octa 0x80deb1fe80deb1fe80deb1fe80deb1fe
+ .octa 0x80deb1fe80deb1fe80deb1fe80deb1fe
+ .octa 0x9bdc06a79bdc06a79bdc06a79bdc06a7
+ .octa 0x9bdc06a79bdc06a79bdc06a79bdc06a7
+ .octa 0xc19bf174c19bf174c19bf174c19bf174
+ .octa 0xc19bf174c19bf174c19bf174c19bf174
+ .octa 0xe49b69c1e49b69c1e49b69c1e49b69c1
+ .octa 0xe49b69c1e49b69c1e49b69c1e49b69c1
+ .octa 0xefbe4786efbe4786efbe4786efbe4786
+ .octa 0xefbe4786efbe4786efbe4786efbe4786
+ .octa 0x0fc19dc60fc19dc60fc19dc60fc19dc6
+ .octa 0x0fc19dc60fc19dc60fc19dc60fc19dc6
+ .octa 0x240ca1cc240ca1cc240ca1cc240ca1cc
+ .octa 0x240ca1cc240ca1cc240ca1cc240ca1cc
+ .octa 0x2de92c6f2de92c6f2de92c6f2de92c6f
+ .octa 0x2de92c6f2de92c6f2de92c6f2de92c6f
+ .octa 0x4a7484aa4a7484aa4a7484aa4a7484aa
+ .octa 0x4a7484aa4a7484aa4a7484aa4a7484aa
+ .octa 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
+ .octa 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
+ .octa 0x76f988da76f988da76f988da76f988da
+ .octa 0x76f988da76f988da76f988da76f988da
+ .octa 0x983e5152983e5152983e5152983e5152
+ .octa 0x983e5152983e5152983e5152983e5152
+ .octa 0xa831c66da831c66da831c66da831c66d
+ .octa 0xa831c66da831c66da831c66da831c66d
+ .octa 0xb00327c8b00327c8b00327c8b00327c8
+ .octa 0xb00327c8b00327c8b00327c8b00327c8
+ .octa 0xbf597fc7bf597fc7bf597fc7bf597fc7
+ .octa 0xbf597fc7bf597fc7bf597fc7bf597fc7
+ .octa 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
+ .octa 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
+ .octa 0xd5a79147d5a79147d5a79147d5a79147
+ .octa 0xd5a79147d5a79147d5a79147d5a79147
+ .octa 0x06ca635106ca635106ca635106ca6351
+ .octa 0x06ca635106ca635106ca635106ca6351
+ .octa 0x14292967142929671429296714292967
+ .octa 0x14292967142929671429296714292967
+ .octa 0x27b70a8527b70a8527b70a8527b70a85
+ .octa 0x27b70a8527b70a8527b70a8527b70a85
+ .octa 0x2e1b21382e1b21382e1b21382e1b2138
+ .octa 0x2e1b21382e1b21382e1b21382e1b2138
+ .octa 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
+ .octa 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
+ .octa 0x53380d1353380d1353380d1353380d13
+ .octa 0x53380d1353380d1353380d1353380d13
+ .octa 0x650a7354650a7354650a7354650a7354
+ .octa 0x650a7354650a7354650a7354650a7354
+ .octa 0x766a0abb766a0abb766a0abb766a0abb
+ .octa 0x766a0abb766a0abb766a0abb766a0abb
+ .octa 0x81c2c92e81c2c92e81c2c92e81c2c92e
+ .octa 0x81c2c92e81c2c92e81c2c92e81c2c92e
+ .octa 0x92722c8592722c8592722c8592722c85
+ .octa 0x92722c8592722c8592722c8592722c85
+ .octa 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
+ .octa 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
+ .octa 0xa81a664ba81a664ba81a664ba81a664b
+ .octa 0xa81a664ba81a664ba81a664ba81a664b
+ .octa 0xc24b8b70c24b8b70c24b8b70c24b8b70
+ .octa 0xc24b8b70c24b8b70c24b8b70c24b8b70
+ .octa 0xc76c51a3c76c51a3c76c51a3c76c51a3
+ .octa 0xc76c51a3c76c51a3c76c51a3c76c51a3
+ .octa 0xd192e819d192e819d192e819d192e819
+ .octa 0xd192e819d192e819d192e819d192e819
+ .octa 0xd6990624d6990624d6990624d6990624
+ .octa 0xd6990624d6990624d6990624d6990624
+ .octa 0xf40e3585f40e3585f40e3585f40e3585
+ .octa 0xf40e3585f40e3585f40e3585f40e3585
+ .octa 0x106aa070106aa070106aa070106aa070
+ .octa 0x106aa070106aa070106aa070106aa070
+ .octa 0x19a4c11619a4c11619a4c11619a4c116
+ .octa 0x19a4c11619a4c11619a4c11619a4c116
+ .octa 0x1e376c081e376c081e376c081e376c08
+ .octa 0x1e376c081e376c081e376c081e376c08
+ .octa 0x2748774c2748774c2748774c2748774c
+ .octa 0x2748774c2748774c2748774c2748774c
+ .octa 0x34b0bcb534b0bcb534b0bcb534b0bcb5
+ .octa 0x34b0bcb534b0bcb534b0bcb534b0bcb5
+ .octa 0x391c0cb3391c0cb3391c0cb3391c0cb3
+ .octa 0x391c0cb3391c0cb3391c0cb3391c0cb3
+ .octa 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
+ .octa 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
+ .octa 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
+ .octa 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
+ .octa 0x682e6ff3682e6ff3682e6ff3682e6ff3
+ .octa 0x682e6ff3682e6ff3682e6ff3682e6ff3
+ .octa 0x748f82ee748f82ee748f82ee748f82ee
+ .octa 0x748f82ee748f82ee748f82ee748f82ee
+ .octa 0x78a5636f78a5636f78a5636f78a5636f
+ .octa 0x78a5636f78a5636f78a5636f78a5636f
+ .octa 0x84c8781484c8781484c8781484c87814
+ .octa 0x84c8781484c8781484c8781484c87814
+ .octa 0x8cc702088cc702088cc702088cc70208
+ .octa 0x8cc702088cc702088cc702088cc70208
+ .octa 0x90befffa90befffa90befffa90befffa
+ .octa 0x90befffa90befffa90befffa90befffa
+ .octa 0xa4506ceba4506ceba4506ceba4506ceb
+ .octa 0xa4506ceba4506ceba4506ceba4506ceb
+ .octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
+ .octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
+ .octa 0xc67178f2c67178f2c67178f2c67178f2
+ .octa 0xc67178f2c67178f2c67178f2c67178f2
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+PSHUFFLE_BYTE_FLIP_MASK:
+.octa 0x0c0d0e0f08090a0b0405060700010203
+.octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+.global K256
+K256:
+ .int 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .int 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .int 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .int 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .int 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .int 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .int 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .int 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .int 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .int 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .int 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .int 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .int 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .int 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .int 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .int 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2