summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/crypto')
-rw-r--r--arch/powerpc/crypto/Makefile23
-rw-r--r--arch/powerpc/crypto/aes-spe-core.S351
-rw-r--r--arch/powerpc/crypto/aes-spe-glue.c519
-rw-r--r--arch/powerpc/crypto/aes-spe-keys.S283
-rw-r--r--arch/powerpc/crypto/aes-spe-modes.S630
-rw-r--r--arch/powerpc/crypto/aes-spe-regs.h42
-rw-r--r--arch/powerpc/crypto/aes-tab-4k.S331
-rw-r--r--arch/powerpc/crypto/crc-vpmsum_test.c137
-rw-r--r--arch/powerpc/crypto/crc32-vpmsum_core.S755
-rw-r--r--arch/powerpc/crypto/crc32c-vpmsum_asm.S846
-rw-r--r--arch/powerpc/crypto/crc32c-vpmsum_glue.c172
-rw-r--r--arch/powerpc/crypto/crct10dif-vpmsum_asm.S850
-rw-r--r--arch/powerpc/crypto/crct10dif-vpmsum_glue.c128
-rw-r--r--arch/powerpc/crypto/md5-asm.S244
-rw-r--r--arch/powerpc/crypto/md5-glue.c164
-rw-r--r--arch/powerpc/crypto/sha1-powerpc-asm.S190
-rw-r--r--arch/powerpc/crypto/sha1-spe-asm.S299
-rw-r--r--arch/powerpc/crypto/sha1-spe-glue.c210
-rw-r--r--arch/powerpc/crypto/sha1.c157
-rw-r--r--arch/powerpc/crypto/sha256-spe-asm.S323
-rw-r--r--arch/powerpc/crypto/sha256-spe-glue.c274
21 files changed, 6928 insertions, 0 deletions
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
new file mode 100644
index 000000000..4808d97fe
--- /dev/null
+++ b/arch/powerpc/crypto/Makefile
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# powerpc/crypto/Makefile
+#
+# Arch-specific CryptoAPI modules.
+#
+
+obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o
+obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o
+obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o
+obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
+obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
+obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
+obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
+
+aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
+md5-ppc-y := md5-asm.o md5-glue.o
+sha1-powerpc-y := sha1-powerpc-asm.o sha1.o
+sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
+sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
+crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
+crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
diff --git a/arch/powerpc/crypto/aes-spe-core.S b/arch/powerpc/crypto/aes-spe-core.S
new file mode 100644
index 000000000..bc6ff43a9
--- /dev/null
+++ b/arch/powerpc/crypto/aes-spe-core.S
@@ -0,0 +1,351 @@
+/*
+ * Fast AES implementation for SPE instruction set (PPC)
+ *
+ * This code makes use of the SPE SIMD instruction set as defined in
+ * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
+ * Implementation is based on optimization guide notes from
+ * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <asm/ppc_asm.h>
+#include "aes-spe-regs.h"
+
+#define EAD(in, bpos) \
+ rlwimi rT0,in,28-((bpos+3)%4)*8,20,27;
+
+#define DAD(in, bpos) \
+ rlwimi rT1,in,24-((bpos+3)%4)*8,24,31;
+
+#define LWH(out, off) \
+ evlwwsplat out,off(rT0); /* load word high */
+
+#define LWL(out, off) \
+ lwz out,off(rT0); /* load word low */
+
+#define LBZ(out, tab, off) \
+ lbz out,off(tab); /* load byte */
+
+#define LAH(out, in, bpos, off) \
+ EAD(in, bpos) /* calc addr + load word high */ \
+ LWH(out, off)
+
+#define LAL(out, in, bpos, off) \
+ EAD(in, bpos) /* calc addr + load word low */ \
+ LWL(out, off)
+
+#define LAE(out, in, bpos) \
+ EAD(in, bpos) /* calc addr + load enc byte */ \
+ LBZ(out, rT0, 8)
+
+#define LBE(out) \
+ LBZ(out, rT0, 8) /* load enc byte */
+
+#define LAD(out, in, bpos) \
+ DAD(in, bpos) /* calc addr + load dec byte */ \
+ LBZ(out, rT1, 0)
+
+#define LBD(out) \
+ LBZ(out, rT1, 0)
+
+/*
+ * ppc_encrypt_block: The central encryption function for a single 16 bytes
+ * block. It does no stack handling or register saving to support fast calls
+ * via bl/blr. It expects that caller has pre-xored input data with first
+ * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
+ * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
+ * and rW0-rW3 and caller must execute a final xor on the output registers.
+ * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
+ *
+ */
+_GLOBAL(ppc_encrypt_block)
+ LAH(rW4, rD1, 2, 4)
+ LAH(rW6, rD0, 3, 0)
+ LAH(rW3, rD0, 1, 8)
+ppc_encrypt_block_loop:
+ LAH(rW0, rD3, 0, 12)
+ LAL(rW0, rD0, 0, 12)
+ LAH(rW1, rD1, 0, 12)
+ LAH(rW2, rD2, 1, 8)
+ LAL(rW2, rD3, 1, 8)
+ LAL(rW3, rD1, 1, 8)
+ LAL(rW4, rD2, 2, 4)
+ LAL(rW6, rD1, 3, 0)
+ LAH(rW5, rD3, 2, 4)
+ LAL(rW5, rD0, 2, 4)
+ LAH(rW7, rD2, 3, 0)
+ evldw rD1,16(rKP)
+ EAD(rD3, 3)
+ evxor rW2,rW2,rW4
+ LWL(rW7, 0)
+ evxor rW2,rW2,rW6
+ EAD(rD2, 0)
+ evxor rD1,rD1,rW2
+ LWL(rW1, 12)
+ evxor rD1,rD1,rW0
+ evldw rD3,24(rKP)
+ evmergehi rD0,rD0,rD1
+ EAD(rD1, 2)
+ evxor rW3,rW3,rW5
+ LWH(rW4, 4)
+ evxor rW3,rW3,rW7
+ EAD(rD0, 3)
+ evxor rD3,rD3,rW3
+ LWH(rW6, 0)
+ evxor rD3,rD3,rW1
+ EAD(rD0, 1)
+ evmergehi rD2,rD2,rD3
+ LWH(rW3, 8)
+ LAH(rW0, rD3, 0, 12)
+ LAL(rW0, rD0, 0, 12)
+ LAH(rW1, rD1, 0, 12)
+ LAH(rW2, rD2, 1, 8)
+ LAL(rW2, rD3, 1, 8)
+ LAL(rW3, rD1, 1, 8)
+ LAL(rW4, rD2, 2, 4)
+ LAL(rW6, rD1, 3, 0)
+ LAH(rW5, rD3, 2, 4)
+ LAL(rW5, rD0, 2, 4)
+ LAH(rW7, rD2, 3, 0)
+ evldw rD1,32(rKP)
+ EAD(rD3, 3)
+ evxor rW2,rW2,rW4
+ LWL(rW7, 0)
+ evxor rW2,rW2,rW6
+ EAD(rD2, 0)
+ evxor rD1,rD1,rW2
+ LWL(rW1, 12)
+ evxor rD1,rD1,rW0
+ evldw rD3,40(rKP)
+ evmergehi rD0,rD0,rD1
+ EAD(rD1, 2)
+ evxor rW3,rW3,rW5
+ LWH(rW4, 4)
+ evxor rW3,rW3,rW7
+ EAD(rD0, 3)
+ evxor rD3,rD3,rW3
+ LWH(rW6, 0)
+ evxor rD3,rD3,rW1
+ EAD(rD0, 1)
+ evmergehi rD2,rD2,rD3
+ LWH(rW3, 8)
+ addi rKP,rKP,32
+ bdnz ppc_encrypt_block_loop
+ LAH(rW0, rD3, 0, 12)
+ LAL(rW0, rD0, 0, 12)
+ LAH(rW1, rD1, 0, 12)
+ LAH(rW2, rD2, 1, 8)
+ LAL(rW2, rD3, 1, 8)
+ LAL(rW3, rD1, 1, 8)
+ LAL(rW4, rD2, 2, 4)
+ LAH(rW5, rD3, 2, 4)
+ LAL(rW6, rD1, 3, 0)
+ LAL(rW5, rD0, 2, 4)
+ LAH(rW7, rD2, 3, 0)
+ evldw rD1,16(rKP)
+ EAD(rD3, 3)
+ evxor rW2,rW2,rW4
+ LWL(rW7, 0)
+ evxor rW2,rW2,rW6
+ EAD(rD2, 0)
+ evxor rD1,rD1,rW2
+ LWL(rW1, 12)
+ evxor rD1,rD1,rW0
+ evldw rD3,24(rKP)
+ evmergehi rD0,rD0,rD1
+ EAD(rD1, 0)
+ evxor rW3,rW3,rW5
+ LBE(rW2)
+ evxor rW3,rW3,rW7
+ EAD(rD0, 1)
+ evxor rD3,rD3,rW3
+ LBE(rW6)
+ evxor rD3,rD3,rW1
+ EAD(rD0, 0)
+ evmergehi rD2,rD2,rD3
+ LBE(rW1)
+ LAE(rW0, rD3, 0)
+ LAE(rW1, rD0, 0)
+ LAE(rW4, rD2, 1)
+ LAE(rW5, rD3, 1)
+ LAE(rW3, rD2, 0)
+ LAE(rW7, rD1, 1)
+ rlwimi rW0,rW4,8,16,23
+ rlwimi rW1,rW5,8,16,23
+ LAE(rW4, rD1, 2)
+ LAE(rW5, rD2, 2)
+ rlwimi rW2,rW6,8,16,23
+ rlwimi rW3,rW7,8,16,23
+ LAE(rW6, rD3, 2)
+ LAE(rW7, rD0, 2)
+ rlwimi rW0,rW4,16,8,15
+ rlwimi rW1,rW5,16,8,15
+ LAE(rW4, rD0, 3)
+ LAE(rW5, rD1, 3)
+ rlwimi rW2,rW6,16,8,15
+ lwz rD0,32(rKP)
+ rlwimi rW3,rW7,16,8,15
+ lwz rD1,36(rKP)
+ LAE(rW6, rD2, 3)
+ LAE(rW7, rD3, 3)
+ rlwimi rW0,rW4,24,0,7
+ lwz rD2,40(rKP)
+ rlwimi rW1,rW5,24,0,7
+ lwz rD3,44(rKP)
+ rlwimi rW2,rW6,24,0,7
+ rlwimi rW3,rW7,24,0,7
+ blr
+
+/*
+ * ppc_decrypt_block: The central decryption function for a single 16 bytes
+ * block. It does no stack handling or register saving to support fast calls
+ * via bl/blr. It expects that caller has pre-xored input data with first
+ * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
+ * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
+ * and rW0-rW3 and caller must execute a final xor on the output registers.
+ * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
+ *
+ */
+_GLOBAL(ppc_decrypt_block)
+ LAH(rW0, rD1, 0, 12)
+ LAH(rW6, rD0, 3, 0)
+ LAH(rW3, rD0, 1, 8)
+ppc_decrypt_block_loop:
+ LAH(rW1, rD3, 0, 12)
+ LAL(rW0, rD2, 0, 12)
+ LAH(rW2, rD2, 1, 8)
+ LAL(rW2, rD3, 1, 8)
+ LAH(rW4, rD3, 2, 4)
+ LAL(rW4, rD0, 2, 4)
+ LAL(rW6, rD1, 3, 0)
+ LAH(rW5, rD1, 2, 4)
+ LAH(rW7, rD2, 3, 0)
+ LAL(rW7, rD3, 3, 0)
+ LAL(rW3, rD1, 1, 8)
+ evldw rD1,16(rKP)
+ EAD(rD0, 0)
+ evxor rW4,rW4,rW6
+ LWL(rW1, 12)
+ evxor rW0,rW0,rW4
+ EAD(rD2, 2)
+ evxor rW0,rW0,rW2
+ LWL(rW5, 4)
+ evxor rD1,rD1,rW0
+ evldw rD3,24(rKP)
+ evmergehi rD0,rD0,rD1
+ EAD(rD1, 0)
+ evxor rW3,rW3,rW7
+ LWH(rW0, 12)
+ evxor rW3,rW3,rW1
+ EAD(rD0, 3)
+ evxor rD3,rD3,rW3
+ LWH(rW6, 0)
+ evxor rD3,rD3,rW5
+ EAD(rD0, 1)
+ evmergehi rD2,rD2,rD3
+ LWH(rW3, 8)
+ LAH(rW1, rD3, 0, 12)
+ LAL(rW0, rD2, 0, 12)
+ LAH(rW2, rD2, 1, 8)
+ LAL(rW2, rD3, 1, 8)
+ LAH(rW4, rD3, 2, 4)
+ LAL(rW4, rD0, 2, 4)
+ LAL(rW6, rD1, 3, 0)
+ LAH(rW5, rD1, 2, 4)
+ LAH(rW7, rD2, 3, 0)
+ LAL(rW7, rD3, 3, 0)
+ LAL(rW3, rD1, 1, 8)
+ evldw rD1,32(rKP)
+ EAD(rD0, 0)
+ evxor rW4,rW4,rW6
+ LWL(rW1, 12)
+ evxor rW0,rW0,rW4
+ EAD(rD2, 2)
+ evxor rW0,rW0,rW2
+ LWL(rW5, 4)
+ evxor rD1,rD1,rW0
+ evldw rD3,40(rKP)
+ evmergehi rD0,rD0,rD1
+ EAD(rD1, 0)
+ evxor rW3,rW3,rW7
+ LWH(rW0, 12)
+ evxor rW3,rW3,rW1
+ EAD(rD0, 3)
+ evxor rD3,rD3,rW3
+ LWH(rW6, 0)
+ evxor rD3,rD3,rW5
+ EAD(rD0, 1)
+ evmergehi rD2,rD2,rD3
+ LWH(rW3, 8)
+ addi rKP,rKP,32
+ bdnz ppc_decrypt_block_loop
+ LAH(rW1, rD3, 0, 12)
+ LAL(rW0, rD2, 0, 12)
+ LAH(rW2, rD2, 1, 8)
+ LAL(rW2, rD3, 1, 8)
+ LAH(rW4, rD3, 2, 4)
+ LAL(rW4, rD0, 2, 4)
+ LAL(rW6, rD1, 3, 0)
+ LAH(rW5, rD1, 2, 4)
+ LAH(rW7, rD2, 3, 0)
+ LAL(rW7, rD3, 3, 0)
+ LAL(rW3, rD1, 1, 8)
+ evldw rD1,16(rKP)
+ EAD(rD0, 0)
+ evxor rW4,rW4,rW6
+ LWL(rW1, 12)
+ evxor rW0,rW0,rW4
+ EAD(rD2, 2)
+ evxor rW0,rW0,rW2
+ LWL(rW5, 4)
+ evxor rD1,rD1,rW0
+ evldw rD3,24(rKP)
+ evmergehi rD0,rD0,rD1
+ DAD(rD1, 0)
+ evxor rW3,rW3,rW7
+ LBD(rW0)
+ evxor rW3,rW3,rW1
+ DAD(rD0, 1)
+ evxor rD3,rD3,rW3
+ LBD(rW6)
+ evxor rD3,rD3,rW5
+ DAD(rD0, 0)
+ evmergehi rD2,rD2,rD3
+ LBD(rW3)
+ LAD(rW2, rD3, 0)
+ LAD(rW1, rD2, 0)
+ LAD(rW4, rD2, 1)
+ LAD(rW5, rD3, 1)
+ LAD(rW7, rD1, 1)
+ rlwimi rW0,rW4,8,16,23
+ rlwimi rW1,rW5,8,16,23
+ LAD(rW4, rD3, 2)
+ LAD(rW5, rD0, 2)
+ rlwimi rW2,rW6,8,16,23
+ rlwimi rW3,rW7,8,16,23
+ LAD(rW6, rD1, 2)
+ LAD(rW7, rD2, 2)
+ rlwimi rW0,rW4,16,8,15
+ rlwimi rW1,rW5,16,8,15
+ LAD(rW4, rD0, 3)
+ LAD(rW5, rD1, 3)
+ rlwimi rW2,rW6,16,8,15
+ lwz rD0,32(rKP)
+ rlwimi rW3,rW7,16,8,15
+ lwz rD1,36(rKP)
+ LAD(rW6, rD2, 3)
+ LAD(rW7, rD3, 3)
+ rlwimi rW0,rW4,24,0,7
+ lwz rD2,40(rKP)
+ rlwimi rW1,rW5,24,0,7
+ lwz rD3,44(rKP)
+ rlwimi rW2,rW6,24,0,7
+ rlwimi rW3,rW7,24,0,7
+ blr
diff --git a/arch/powerpc/crypto/aes-spe-glue.c b/arch/powerpc/crypto/aes-spe-glue.c
new file mode 100644
index 000000000..748fc00c5
--- /dev/null
+++ b/arch/powerpc/crypto/aes-spe-glue.c
@@ -0,0 +1,519 @@
+/*
+ * Glue code for AES implementation for SPE instructions (PPC)
+ *
+ * Based on generic implementation. The assembler module takes care
+ * about the SPE registers so it can run from interrupt context.
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/aes.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/crypto.h>
+#include <asm/byteorder.h>
+#include <asm/switch_to.h>
+#include <crypto/algapi.h>
+#include <crypto/xts.h>
+
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). e500 cores can issue two
+ * instructions per clock cycle using one 32/64 bit unit (SU1) and one 32
+ * bit unit (SU2). One of these can be a memory access that is executed via
+ * a single load and store unit (LSU). XTS-AES-256 takes ~780 operations per
+ * 16 byte block block or 25 cycles per byte. Thus 768 bytes of input data
+ * will need an estimated maximum of 20,000 cycles. Headroom for cache misses
+ * included. Even with the low end model clocked at 667 MHz this equals to a
+ * critical time window of less than 30us. The value has been chosen to
+ * process a 512 byte disk block in one or a large 1400 bytes IPsec network
+ * packet in two runs.
+ *
+ */
+#define MAX_BYTES 768
+
+struct ppc_aes_ctx {
+ u32 key_enc[AES_MAX_KEYLENGTH_U32];
+ u32 key_dec[AES_MAX_KEYLENGTH_U32];
+ u32 rounds;
+};
+
+struct ppc_xts_ctx {
+ u32 key_enc[AES_MAX_KEYLENGTH_U32];
+ u32 key_dec[AES_MAX_KEYLENGTH_U32];
+ u32 key_twk[AES_MAX_KEYLENGTH_U32];
+ u32 rounds;
+};
+
+extern void ppc_encrypt_aes(u8 *out, const u8 *in, u32 *key_enc, u32 rounds);
+extern void ppc_decrypt_aes(u8 *out, const u8 *in, u32 *key_dec, u32 rounds);
+extern void ppc_encrypt_ecb(u8 *out, const u8 *in, u32 *key_enc, u32 rounds,
+ u32 bytes);
+extern void ppc_decrypt_ecb(u8 *out, const u8 *in, u32 *key_dec, u32 rounds,
+ u32 bytes);
+extern void ppc_encrypt_cbc(u8 *out, const u8 *in, u32 *key_enc, u32 rounds,
+ u32 bytes, u8 *iv);
+extern void ppc_decrypt_cbc(u8 *out, const u8 *in, u32 *key_dec, u32 rounds,
+ u32 bytes, u8 *iv);
+extern void ppc_crypt_ctr (u8 *out, const u8 *in, u32 *key_enc, u32 rounds,
+ u32 bytes, u8 *iv);
+extern void ppc_encrypt_xts(u8 *out, const u8 *in, u32 *key_enc, u32 rounds,
+ u32 bytes, u8 *iv, u32 *key_twk);
+extern void ppc_decrypt_xts(u8 *out, const u8 *in, u32 *key_dec, u32 rounds,
+ u32 bytes, u8 *iv, u32 *key_twk);
+
+extern void ppc_expand_key_128(u32 *key_enc, const u8 *key);
+extern void ppc_expand_key_192(u32 *key_enc, const u8 *key);
+extern void ppc_expand_key_256(u32 *key_enc, const u8 *key);
+
+extern void ppc_generate_decrypt_key(u32 *key_dec,u32 *key_enc,
+ unsigned int key_len);
+
+static void spe_begin(void)
+{
+ /* disable preemption and save users SPE registers if required */
+ preempt_disable();
+ enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+ disable_kernel_spe();
+ /* reenable preemption */
+ preempt_enable();
+}
+
+static int ppc_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct ppc_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (key_len != AES_KEYSIZE_128 &&
+ key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ switch (key_len) {
+ case AES_KEYSIZE_128:
+ ctx->rounds = 4;
+ ppc_expand_key_128(ctx->key_enc, in_key);
+ break;
+ case AES_KEYSIZE_192:
+ ctx->rounds = 5;
+ ppc_expand_key_192(ctx->key_enc, in_key);
+ break;
+ case AES_KEYSIZE_256:
+ ctx->rounds = 6;
+ ppc_expand_key_256(ctx->key_enc, in_key);
+ break;
+ }
+
+ ppc_generate_decrypt_key(ctx->key_dec, ctx->key_enc, key_len);
+
+ return 0;
+}
+
+static int ppc_xts_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct ppc_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+ int err;
+
+ err = xts_check_key(tfm, in_key, key_len);
+ if (err)
+ return err;
+
+ key_len >>= 1;
+
+ if (key_len != AES_KEYSIZE_128 &&
+ key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ switch (key_len) {
+ case AES_KEYSIZE_128:
+ ctx->rounds = 4;
+ ppc_expand_key_128(ctx->key_enc, in_key);
+ ppc_expand_key_128(ctx->key_twk, in_key + AES_KEYSIZE_128);
+ break;
+ case AES_KEYSIZE_192:
+ ctx->rounds = 5;
+ ppc_expand_key_192(ctx->key_enc, in_key);
+ ppc_expand_key_192(ctx->key_twk, in_key + AES_KEYSIZE_192);
+ break;
+ case AES_KEYSIZE_256:
+ ctx->rounds = 6;
+ ppc_expand_key_256(ctx->key_enc, in_key);
+ ppc_expand_key_256(ctx->key_twk, in_key + AES_KEYSIZE_256);
+ break;
+ }
+
+ ppc_generate_decrypt_key(ctx->key_dec, ctx->key_enc, key_len);
+
+ return 0;
+}
+
+static void ppc_aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ struct ppc_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ spe_begin();
+ ppc_encrypt_aes(out, in, ctx->key_enc, ctx->rounds);
+ spe_end();
+}
+
+static void ppc_aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ struct ppc_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ spe_begin();
+ ppc_decrypt_aes(out, in, ctx->key_dec, ctx->rounds);
+ spe_end();
+}
+
+static int ppc_ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ unsigned int ubytes;
+ int err;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ ubytes = nbytes > MAX_BYTES ?
+ nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1);
+ nbytes -= ubytes;
+
+ spe_begin();
+ ppc_encrypt_ecb(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_enc, ctx->rounds, nbytes);
+ spe_end();
+
+ err = blkcipher_walk_done(desc, &walk, ubytes);
+ }
+
+ return err;
+}
+
+static int ppc_ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ unsigned int ubytes;
+ int err;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ ubytes = nbytes > MAX_BYTES ?
+ nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1);
+ nbytes -= ubytes;
+
+ spe_begin();
+ ppc_decrypt_ecb(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_dec, ctx->rounds, nbytes);
+ spe_end();
+
+ err = blkcipher_walk_done(desc, &walk, ubytes);
+ }
+
+ return err;
+}
+
+static int ppc_cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ unsigned int ubytes;
+ int err;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ ubytes = nbytes > MAX_BYTES ?
+ nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1);
+ nbytes -= ubytes;
+
+ spe_begin();
+ ppc_encrypt_cbc(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_enc, ctx->rounds, nbytes, walk.iv);
+ spe_end();
+
+ err = blkcipher_walk_done(desc, &walk, ubytes);
+ }
+
+ return err;
+}
+
+static int ppc_cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ unsigned int ubytes;
+ int err;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ ubytes = nbytes > MAX_BYTES ?
+ nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1);
+ nbytes -= ubytes;
+
+ spe_begin();
+ ppc_decrypt_cbc(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_dec, ctx->rounds, nbytes, walk.iv);
+ spe_end();
+
+ err = blkcipher_walk_done(desc, &walk, ubytes);
+ }
+
+ return err;
+}
+
+static int ppc_ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ unsigned int pbytes, ubytes;
+ int err;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
+
+ while ((pbytes = walk.nbytes)) {
+ pbytes = pbytes > MAX_BYTES ? MAX_BYTES : pbytes;
+ pbytes = pbytes == nbytes ?
+ nbytes : pbytes & ~(AES_BLOCK_SIZE - 1);
+ ubytes = walk.nbytes - pbytes;
+
+ spe_begin();
+ ppc_crypt_ctr(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_enc, ctx->rounds, pbytes , walk.iv);
+ spe_end();
+
+ nbytes -= pbytes;
+ err = blkcipher_walk_done(desc, &walk, ubytes);
+ }
+
+ return err;
+}
+
+static int ppc_xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct ppc_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ unsigned int ubytes;
+ int err;
+ u32 *twk;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+ twk = ctx->key_twk;
+
+ while ((nbytes = walk.nbytes)) {
+ ubytes = nbytes > MAX_BYTES ?
+ nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1);
+ nbytes -= ubytes;
+
+ spe_begin();
+ ppc_encrypt_xts(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_enc, ctx->rounds, nbytes, walk.iv, twk);
+ spe_end();
+
+ twk = NULL;
+ err = blkcipher_walk_done(desc, &walk, ubytes);
+ }
+
+ return err;
+}
+
+static int ppc_xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct ppc_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ unsigned int ubytes;
+ int err;
+ u32 *twk;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+ twk = ctx->key_twk;
+
+ while ((nbytes = walk.nbytes)) {
+ ubytes = nbytes > MAX_BYTES ?
+ nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1);
+ nbytes -= ubytes;
+
+ spe_begin();
+ ppc_decrypt_xts(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_dec, ctx->rounds, nbytes, walk.iv, twk);
+ spe_end();
+
+ twk = NULL;
+ err = blkcipher_walk_done(desc, &walk, ubytes);
+ }
+
+ return err;
+}
+
+/*
+ * Algorithm definitions. Disabling alignment (cra_alignmask=0) was chosen
+ * because the e500 platform can handle unaligned reads/writes very efficently.
+ * This improves IPsec thoughput by another few percent. Additionally we assume
+ * that AES context is always aligned to at least 8 bytes because it is created
+ * with kmalloc() in the crypto infrastructure
+ *
+ */
+static struct crypto_alg aes_algs[] = { {
+ .cra_name = "aes",
+ .cra_driver_name = "aes-ppc-spe",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct ppc_aes_ctx),
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cia_setkey = ppc_aes_setkey,
+ .cia_encrypt = ppc_aes_encrypt,
+ .cia_decrypt = ppc_aes_decrypt
+ }
+ }
+}, {
+ .cra_name = "ecb(aes)",
+ .cra_driver_name = "ecb-ppc-spe",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct ppc_aes_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ppc_aes_setkey,
+ .encrypt = ppc_ecb_encrypt,
+ .decrypt = ppc_ecb_decrypt,
+ }
+ }
+}, {
+ .cra_name = "cbc(aes)",
+ .cra_driver_name = "cbc-ppc-spe",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct ppc_aes_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ppc_aes_setkey,
+ .encrypt = ppc_cbc_encrypt,
+ .decrypt = ppc_cbc_decrypt,
+ }
+ }
+}, {
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-ppc-spe",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct ppc_aes_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ppc_aes_setkey,
+ .encrypt = ppc_ctr_crypt,
+ .decrypt = ppc_ctr_crypt,
+ }
+ }
+}, {
+ .cra_name = "xts(aes)",
+ .cra_driver_name = "xts-ppc-spe",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct ppc_xts_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE * 2,
+ .max_keysize = AES_MAX_KEY_SIZE * 2,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ppc_xts_setkey,
+ .encrypt = ppc_xts_encrypt,
+ .decrypt = ppc_xts_decrypt,
+ }
+ }
+} };
+
+static int __init ppc_aes_mod_init(void)
+{
+ return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+static void __exit ppc_aes_mod_fini(void)
+{
+ crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+module_init(ppc_aes_mod_init);
+module_exit(ppc_aes_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS, SPE optimized");
+
+MODULE_ALIAS_CRYPTO("aes");
+MODULE_ALIAS_CRYPTO("ecb(aes)");
+MODULE_ALIAS_CRYPTO("cbc(aes)");
+MODULE_ALIAS_CRYPTO("ctr(aes)");
+MODULE_ALIAS_CRYPTO("xts(aes)");
+MODULE_ALIAS_CRYPTO("aes-ppc-spe");
diff --git a/arch/powerpc/crypto/aes-spe-keys.S b/arch/powerpc/crypto/aes-spe-keys.S
new file mode 100644
index 000000000..be8090f3d
--- /dev/null
+++ b/arch/powerpc/crypto/aes-spe-keys.S
@@ -0,0 +1,283 @@
+/*
+ * Key handling functions for PPC AES implementation
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <asm/ppc_asm.h>
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_KEY(d, s, off) \
+ lwz d,off(s);
+#else
+#define LOAD_KEY(d, s, off) \
+ li r0,off; \
+ lwbrx d,s,r0;
+#endif
+
+#define INITIALIZE_KEY \
+ stwu r1,-32(r1); /* create stack frame */ \
+ stw r14,8(r1); /* save registers */ \
+ stw r15,12(r1); \
+ stw r16,16(r1);
+
+#define FINALIZE_KEY \
+ lwz r14,8(r1); /* restore registers */ \
+ lwz r15,12(r1); \
+ lwz r16,16(r1); \
+ xor r5,r5,r5; /* clear sensitive data */ \
+ xor r6,r6,r6; \
+ xor r7,r7,r7; \
+ xor r8,r8,r8; \
+ xor r9,r9,r9; \
+ xor r10,r10,r10; \
+ xor r11,r11,r11; \
+ xor r12,r12,r12; \
+ addi r1,r1,32; /* cleanup stack */
+
+#define LS_BOX(r, t1, t2) \
+ lis t2,PPC_AES_4K_ENCTAB@h; \
+ ori t2,t2,PPC_AES_4K_ENCTAB@l; \
+ rlwimi t2,r,4,20,27; \
+ lbz t1,8(t2); \
+ rlwimi r,t1,0,24,31; \
+ rlwimi t2,r,28,20,27; \
+ lbz t1,8(t2); \
+ rlwimi r,t1,8,16,23; \
+ rlwimi t2,r,20,20,27; \
+ lbz t1,8(t2); \
+ rlwimi r,t1,16,8,15; \
+ rlwimi t2,r,12,20,27; \
+ lbz t1,8(t2); \
+ rlwimi r,t1,24,0,7;
+
+#define GF8_MUL(out, in, t1, t2) \
+ lis t1,0x8080; /* multiplication in GF8 */ \
+ ori t1,t1,0x8080; \
+ and t1,t1,in; \
+ srwi t1,t1,7; \
+ mulli t1,t1,0x1b; \
+ lis t2,0x7f7f; \
+ ori t2,t2,0x7f7f; \
+ and t2,t2,in; \
+ slwi t2,t2,1; \
+ xor out,t1,t2;
+
+/*
+ * ppc_expand_key_128(u32 *key_enc, const u8 *key)
+ *
+ * Expand 128 bit key into 176 bytes encryption key. It consists of
+ * key itself plus 10 rounds with 16 bytes each
+ *
+ */
+_GLOBAL(ppc_expand_key_128)
+ INITIALIZE_KEY
+ LOAD_KEY(r5,r4,0)
+ LOAD_KEY(r6,r4,4)
+ LOAD_KEY(r7,r4,8)
+ LOAD_KEY(r8,r4,12)
+ stw r5,0(r3) /* key[0..3] = input data */
+ stw r6,4(r3)
+ stw r7,8(r3)
+ stw r8,12(r3)
+ li r16,10 /* 10 expansion rounds */
+ lis r0,0x0100 /* RCO(1) */
+ppc_expand_128_loop:
+ addi r3,r3,16
+ mr r14,r8 /* apply LS_BOX to 4th temp */
+ rotlwi r14,r14,8
+ LS_BOX(r14, r15, r4)
+ xor r14,r14,r0
+ xor r5,r5,r14 /* xor next 4 keys */
+ xor r6,r6,r5
+ xor r7,r7,r6
+ xor r8,r8,r7
+ stw r5,0(r3) /* store next 4 keys */
+ stw r6,4(r3)
+ stw r7,8(r3)
+ stw r8,12(r3)
+ GF8_MUL(r0, r0, r4, r14) /* multiply RCO by 2 in GF */
+ subi r16,r16,1
+ cmpwi r16,0
+ bt eq,ppc_expand_128_end
+ b ppc_expand_128_loop
+ppc_expand_128_end:
+ FINALIZE_KEY
+ blr
+
+/*
+ * ppc_expand_key_192(u32 *key_enc, const u8 *key)
+ *
+ * Expand 192 bit key into 208 bytes encryption key. It consists of key
+ * itself plus 12 rounds with 16 bytes each
+ *
+ */
+_GLOBAL(ppc_expand_key_192)
+ INITIALIZE_KEY
+ LOAD_KEY(r5,r4,0)
+ LOAD_KEY(r6,r4,4)
+ LOAD_KEY(r7,r4,8)
+ LOAD_KEY(r8,r4,12)
+ LOAD_KEY(r9,r4,16)
+ LOAD_KEY(r10,r4,20)
+ stw r5,0(r3)
+ stw r6,4(r3)
+ stw r7,8(r3)
+ stw r8,12(r3)
+ stw r9,16(r3)
+ stw r10,20(r3)
+ li r16,8 /* 8 expansion rounds */
+ lis r0,0x0100 /* RCO(1) */
+ppc_expand_192_loop:
+ addi r3,r3,24
+ mr r14,r10 /* apply LS_BOX to 6th temp */
+ rotlwi r14,r14,8
+ LS_BOX(r14, r15, r4)
+ xor r14,r14,r0
+ xor r5,r5,r14 /* xor next 6 keys */
+ xor r6,r6,r5
+ xor r7,r7,r6
+ xor r8,r8,r7
+ xor r9,r9,r8
+ xor r10,r10,r9
+ stw r5,0(r3)
+ stw r6,4(r3)
+ stw r7,8(r3)
+ stw r8,12(r3)
+ subi r16,r16,1
+ cmpwi r16,0 /* last round early kick out */
+ bt eq,ppc_expand_192_end
+ stw r9,16(r3)
+ stw r10,20(r3)
+ GF8_MUL(r0, r0, r4, r14) /* multiply RCO GF8 */
+ b ppc_expand_192_loop
+ppc_expand_192_end:
+ FINALIZE_KEY
+ blr
+
+/*
+ * ppc_expand_key_256(u32 *key_enc, const u8 *key)
+ *
+ * Expand 256 bit key into 240 bytes encryption key. It consists of key
+ * itself plus 14 rounds with 16 bytes each
+ *
+ */
+_GLOBAL(ppc_expand_key_256)
+ INITIALIZE_KEY
+ LOAD_KEY(r5,r4,0)
+ LOAD_KEY(r6,r4,4)
+ LOAD_KEY(r7,r4,8)
+ LOAD_KEY(r8,r4,12)
+ LOAD_KEY(r9,r4,16)
+ LOAD_KEY(r10,r4,20)
+ LOAD_KEY(r11,r4,24)
+ LOAD_KEY(r12,r4,28)
+ stw r5,0(r3)
+ stw r6,4(r3)
+ stw r7,8(r3)
+ stw r8,12(r3)
+ stw r9,16(r3)
+ stw r10,20(r3)
+ stw r11,24(r3)
+ stw r12,28(r3)
+ li r16,7 /* 7 expansion rounds */
+ lis r0,0x0100 /* RCO(1) */
+ppc_expand_256_loop:
+ addi r3,r3,32
+ mr r14,r12 /* apply LS_BOX to 8th temp */
+ rotlwi r14,r14,8
+ LS_BOX(r14, r15, r4)
+ xor r14,r14,r0
+ xor r5,r5,r14 /* xor 4 keys */
+ xor r6,r6,r5
+ xor r7,r7,r6
+ xor r8,r8,r7
+ mr r14,r8
+ LS_BOX(r14, r15, r4) /* apply LS_BOX to 4th temp */
+ xor r9,r9,r14 /* xor 4 keys */
+ xor r10,r10,r9
+ xor r11,r11,r10
+ xor r12,r12,r11
+ stw r5,0(r3)
+ stw r6,4(r3)
+ stw r7,8(r3)
+ stw r8,12(r3)
+ subi r16,r16,1
+ cmpwi r16,0 /* last round early kick out */
+ bt eq,ppc_expand_256_end
+ stw r9,16(r3)
+ stw r10,20(r3)
+ stw r11,24(r3)
+ stw r12,28(r3)
+ GF8_MUL(r0, r0, r4, r14)
+ b ppc_expand_256_loop
+ppc_expand_256_end:
+ FINALIZE_KEY
+ blr
+
+/*
+ * ppc_generate_decrypt_key: derive decryption key from encryption key
+ * number of bytes to handle are calculated from length of key (16/24/32)
+ *
+ */
+_GLOBAL(ppc_generate_decrypt_key)
+ addi r6,r5,24
+ slwi r6,r6,2
+ lwzx r7,r4,r6 /* first/last 4 words are same */
+ stw r7,0(r3)
+ lwz r7,0(r4)
+ stwx r7,r3,r6
+ addi r6,r6,4
+ lwzx r7,r4,r6
+ stw r7,4(r3)
+ lwz r7,4(r4)
+ stwx r7,r3,r6
+ addi r6,r6,4
+ lwzx r7,r4,r6
+ stw r7,8(r3)
+ lwz r7,8(r4)
+ stwx r7,r3,r6
+ addi r6,r6,4
+ lwzx r7,r4,r6
+ stw r7,12(r3)
+ lwz r7,12(r4)
+ stwx r7,r3,r6
+ addi r3,r3,16
+ add r4,r4,r6
+ subi r4,r4,28
+ addi r5,r5,20
+ srwi r5,r5,2
+ppc_generate_decrypt_block:
+ li r6,4
+ mtctr r6
+ppc_generate_decrypt_word:
+ lwz r6,0(r4)
+ GF8_MUL(r7, r6, r0, r7)
+ GF8_MUL(r8, r7, r0, r8)
+ GF8_MUL(r9, r8, r0, r9)
+ xor r10,r9,r6
+ xor r11,r7,r8
+ xor r11,r11,r9
+ xor r12,r7,r10
+ rotrwi r12,r12,24
+ xor r11,r11,r12
+ xor r12,r8,r10
+ rotrwi r12,r12,16
+ xor r11,r11,r12
+ rotrwi r12,r10,8
+ xor r11,r11,r12
+ stw r11,0(r3)
+ addi r3,r3,4
+ addi r4,r4,4
+ bdnz ppc_generate_decrypt_word
+ subi r4,r4,32
+ subi r5,r5,1
+ cmpwi r5,0
+ bt gt,ppc_generate_decrypt_block
+ blr
diff --git a/arch/powerpc/crypto/aes-spe-modes.S b/arch/powerpc/crypto/aes-spe-modes.S
new file mode 100644
index 000000000..ad48032ca
--- /dev/null
+++ b/arch/powerpc/crypto/aes-spe-modes.S
@@ -0,0 +1,630 @@
+/*
+ * AES modes (ECB/CBC/CTR/XTS) for PPC AES implementation
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <asm/ppc_asm.h>
+#include "aes-spe-regs.h"
+
+#ifdef __BIG_ENDIAN__ /* Macros for big endian builds */
+
+#define LOAD_DATA(reg, off) \
+ lwz reg,off(rSP); /* load with offset */
+#define SAVE_DATA(reg, off) \
+ stw reg,off(rDP); /* save with offset */
+#define NEXT_BLOCK \
+ addi rSP,rSP,16; /* increment pointers per bloc */ \
+ addi rDP,rDP,16;
+#define LOAD_IV(reg, off) \
+ lwz reg,off(rIP); /* IV loading with offset */
+#define SAVE_IV(reg, off) \
+ stw reg,off(rIP); /* IV saving with offset */
+#define START_IV /* nothing to reset */
+#define CBC_DEC 16 /* CBC decrement per block */
+#define CTR_DEC 1 /* CTR decrement one byte */
+
+#else /* Macros for little endian */
+
+#define LOAD_DATA(reg, off) \
+ lwbrx reg,0,rSP; /* load reversed */ \
+ addi rSP,rSP,4; /* and increment pointer */
+#define SAVE_DATA(reg, off) \
+ stwbrx reg,0,rDP; /* save reversed */ \
+ addi rDP,rDP,4; /* and increment pointer */
+#define NEXT_BLOCK /* nothing todo */
+#define LOAD_IV(reg, off) \
+ lwbrx reg,0,rIP; /* load reversed */ \
+ addi rIP,rIP,4; /* and increment pointer */
+#define SAVE_IV(reg, off) \
+ stwbrx reg,0,rIP; /* load reversed */ \
+ addi rIP,rIP,4; /* and increment pointer */
+#define START_IV \
+ subi rIP,rIP,16; /* must reset pointer */
+#define CBC_DEC 32 /* 2 blocks because of incs */
+#define CTR_DEC 17 /* 1 block because of incs */
+
+#endif
+
+#define SAVE_0_REGS
+#define LOAD_0_REGS
+
+#define SAVE_4_REGS \
+ stw rI0,96(r1); /* save 32 bit registers */ \
+ stw rI1,100(r1); \
+ stw rI2,104(r1); \
+ stw rI3,108(r1);
+
+#define LOAD_4_REGS \
+ lwz rI0,96(r1); /* restore 32 bit registers */ \
+ lwz rI1,100(r1); \
+ lwz rI2,104(r1); \
+ lwz rI3,108(r1);
+
+#define SAVE_8_REGS \
+ SAVE_4_REGS \
+ stw rG0,112(r1); /* save 32 bit registers */ \
+ stw rG1,116(r1); \
+ stw rG2,120(r1); \
+ stw rG3,124(r1);
+
+#define LOAD_8_REGS \
+ LOAD_4_REGS \
+ lwz rG0,112(r1); /* restore 32 bit registers */ \
+ lwz rG1,116(r1); \
+ lwz rG2,120(r1); \
+ lwz rG3,124(r1);
+
+#define INITIALIZE_CRYPT(tab,nr32bitregs) \
+ mflr r0; \
+ stwu r1,-160(r1); /* create stack frame */ \
+ lis rT0,tab@h; /* en-/decryption table pointer */ \
+ stw r0,8(r1); /* save link register */ \
+ ori rT0,rT0,tab@l; \
+ evstdw r14,16(r1); \
+ mr rKS,rKP; \
+ evstdw r15,24(r1); /* We must save non volatile */ \
+ evstdw r16,32(r1); /* registers. Take the chance */ \
+ evstdw r17,40(r1); /* and save the SPE part too */ \
+ evstdw r18,48(r1); \
+ evstdw r19,56(r1); \
+ evstdw r20,64(r1); \
+ evstdw r21,72(r1); \
+ evstdw r22,80(r1); \
+ evstdw r23,88(r1); \
+ SAVE_##nr32bitregs##_REGS
+
+#define FINALIZE_CRYPT(nr32bitregs) \
+ lwz r0,8(r1); \
+ evldw r14,16(r1); /* restore SPE registers */ \
+ evldw r15,24(r1); \
+ evldw r16,32(r1); \
+ evldw r17,40(r1); \
+ evldw r18,48(r1); \
+ evldw r19,56(r1); \
+ evldw r20,64(r1); \
+ evldw r21,72(r1); \
+ evldw r22,80(r1); \
+ evldw r23,88(r1); \
+ LOAD_##nr32bitregs##_REGS \
+ mtlr r0; /* restore link register */ \
+ xor r0,r0,r0; \
+ stw r0,16(r1); /* delete sensitive data */ \
+ stw r0,24(r1); /* that we might have pushed */ \
+ stw r0,32(r1); /* from other context that runs */ \
+ stw r0,40(r1); /* the same code */ \
+ stw r0,48(r1); \
+ stw r0,56(r1); \
+ stw r0,64(r1); \
+ stw r0,72(r1); \
+ stw r0,80(r1); \
+ stw r0,88(r1); \
+ addi r1,r1,160; /* cleanup stack frame */
+
+#define ENDIAN_SWAP(t0, t1, s0, s1) \
+ rotrwi t0,s0,8; /* swap endianness for 2 GPRs */ \
+ rotrwi t1,s1,8; \
+ rlwimi t0,s0,8,8,15; \
+ rlwimi t1,s1,8,8,15; \
+ rlwimi t0,s0,8,24,31; \
+ rlwimi t1,s1,8,24,31;
+
+#define GF128_MUL(d0, d1, d2, d3, t0) \
+ li t0,0x87; /* multiplication in GF128 */ \
+ cmpwi d3,-1; \
+ iselgt t0,0,t0; \
+ rlwimi d3,d2,0,0,0; /* propagate "carry" bits */ \
+ rotlwi d3,d3,1; \
+ rlwimi d2,d1,0,0,0; \
+ rotlwi d2,d2,1; \
+ rlwimi d1,d0,0,0,0; \
+ slwi d0,d0,1; /* shift left 128 bit */ \
+ rotlwi d1,d1,1; \
+ xor d0,d0,t0;
+
+#define START_KEY(d0, d1, d2, d3) \
+ lwz rW0,0(rKP); \
+ mtctr rRR; \
+ lwz rW1,4(rKP); \
+ lwz rW2,8(rKP); \
+ lwz rW3,12(rKP); \
+ xor rD0,d0,rW0; \
+ xor rD1,d1,rW1; \
+ xor rD2,d2,rW2; \
+ xor rD3,d3,rW3;
+
+/*
+ * ppc_encrypt_aes(u8 *out, const u8 *in, u32 *key_enc,
+ * u32 rounds)
+ *
+ * called from glue layer to encrypt a single 16 byte block
+ * round values are AES128 = 4, AES192 = 5, AES256 = 6
+ *
+ */
+_GLOBAL(ppc_encrypt_aes)
+ INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 0)
+ LOAD_DATA(rD0, 0)
+ LOAD_DATA(rD1, 4)
+ LOAD_DATA(rD2, 8)
+ LOAD_DATA(rD3, 12)
+ START_KEY(rD0, rD1, rD2, rD3)
+ bl ppc_encrypt_block
+ xor rD0,rD0,rW0
+ SAVE_DATA(rD0, 0)
+ xor rD1,rD1,rW1
+ SAVE_DATA(rD1, 4)
+ xor rD2,rD2,rW2
+ SAVE_DATA(rD2, 8)
+ xor rD3,rD3,rW3
+ SAVE_DATA(rD3, 12)
+ FINALIZE_CRYPT(0)
+ blr
+
+/*
+ * ppc_decrypt_aes(u8 *out, const u8 *in, u32 *key_dec,
+ * u32 rounds)
+ *
+ * called from glue layer to decrypt a single 16 byte block
+ * round values are AES128 = 4, AES192 = 5, AES256 = 6
+ *
+ */
+_GLOBAL(ppc_decrypt_aes)
+ INITIALIZE_CRYPT(PPC_AES_4K_DECTAB,0)
+ LOAD_DATA(rD0, 0)
+ addi rT1,rT0,4096
+ LOAD_DATA(rD1, 4)
+ LOAD_DATA(rD2, 8)
+ LOAD_DATA(rD3, 12)
+ START_KEY(rD0, rD1, rD2, rD3)
+ bl ppc_decrypt_block
+ xor rD0,rD0,rW0
+ SAVE_DATA(rD0, 0)
+ xor rD1,rD1,rW1
+ SAVE_DATA(rD1, 4)
+ xor rD2,rD2,rW2
+ SAVE_DATA(rD2, 8)
+ xor rD3,rD3,rW3
+ SAVE_DATA(rD3, 12)
+ FINALIZE_CRYPT(0)
+ blr
+
+/*
+ * ppc_encrypt_ecb(u8 *out, const u8 *in, u32 *key_enc,
+ * u32 rounds, u32 bytes);
+ *
+ * called from glue layer to encrypt multiple blocks via ECB
+ * Bytes must be larger or equal 16 and only whole blocks are
+ * processed. round values are AES128 = 4, AES192 = 5 and
+ * AES256 = 6
+ *
+ */
+_GLOBAL(ppc_encrypt_ecb)
+ INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 0)
+ppc_encrypt_ecb_loop:
+ LOAD_DATA(rD0, 0)
+ mr rKP,rKS
+ LOAD_DATA(rD1, 4)
+ subi rLN,rLN,16
+ LOAD_DATA(rD2, 8)
+ cmpwi rLN,15
+ LOAD_DATA(rD3, 12)
+ START_KEY(rD0, rD1, rD2, rD3)
+ bl ppc_encrypt_block
+ xor rD0,rD0,rW0
+ SAVE_DATA(rD0, 0)
+ xor rD1,rD1,rW1
+ SAVE_DATA(rD1, 4)
+ xor rD2,rD2,rW2
+ SAVE_DATA(rD2, 8)
+ xor rD3,rD3,rW3
+ SAVE_DATA(rD3, 12)
+ NEXT_BLOCK
+ bt gt,ppc_encrypt_ecb_loop
+ FINALIZE_CRYPT(0)
+ blr
+
+/*
+ * ppc_decrypt_ecb(u8 *out, const u8 *in, u32 *key_dec,
+ * u32 rounds, u32 bytes);
+ *
+ * called from glue layer to decrypt multiple blocks via ECB
+ * Bytes must be larger or equal 16 and only whole blocks are
+ * processed. round values are AES128 = 4, AES192 = 5 and
+ * AES256 = 6
+ *
+ */
+_GLOBAL(ppc_decrypt_ecb)
+ INITIALIZE_CRYPT(PPC_AES_4K_DECTAB, 0)
+ addi rT1,rT0,4096
+ppc_decrypt_ecb_loop:
+ LOAD_DATA(rD0, 0)
+ mr rKP,rKS
+ LOAD_DATA(rD1, 4)
+ subi rLN,rLN,16
+ LOAD_DATA(rD2, 8)
+ cmpwi rLN,15
+ LOAD_DATA(rD3, 12)
+ START_KEY(rD0, rD1, rD2, rD3)
+ bl ppc_decrypt_block
+ xor rD0,rD0,rW0
+ SAVE_DATA(rD0, 0)
+ xor rD1,rD1,rW1
+ SAVE_DATA(rD1, 4)
+ xor rD2,rD2,rW2
+ SAVE_DATA(rD2, 8)
+ xor rD3,rD3,rW3
+ SAVE_DATA(rD3, 12)
+ NEXT_BLOCK
+ bt gt,ppc_decrypt_ecb_loop
+ FINALIZE_CRYPT(0)
+ blr
+
+/*
+ * ppc_encrypt_cbc(u8 *out, const u8 *in, u32 *key_enc,
+ * 32 rounds, u32 bytes, u8 *iv);
+ *
+ * called from glue layer to encrypt multiple blocks via CBC
+ * Bytes must be larger or equal 16 and only whole blocks are
+ * processed. round values are AES128 = 4, AES192 = 5 and
+ * AES256 = 6
+ *
+ */
+_GLOBAL(ppc_encrypt_cbc)
+ INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 4)
+ LOAD_IV(rI0, 0)
+ LOAD_IV(rI1, 4)
+ LOAD_IV(rI2, 8)
+ LOAD_IV(rI3, 12)
+ppc_encrypt_cbc_loop:
+ LOAD_DATA(rD0, 0)
+ mr rKP,rKS
+ LOAD_DATA(rD1, 4)
+ subi rLN,rLN,16
+ LOAD_DATA(rD2, 8)
+ cmpwi rLN,15
+ LOAD_DATA(rD3, 12)
+ xor rD0,rD0,rI0
+ xor rD1,rD1,rI1
+ xor rD2,rD2,rI2
+ xor rD3,rD3,rI3
+ START_KEY(rD0, rD1, rD2, rD3)
+ bl ppc_encrypt_block
+ xor rI0,rD0,rW0
+ SAVE_DATA(rI0, 0)
+ xor rI1,rD1,rW1
+ SAVE_DATA(rI1, 4)
+ xor rI2,rD2,rW2
+ SAVE_DATA(rI2, 8)
+ xor rI3,rD3,rW3
+ SAVE_DATA(rI3, 12)
+ NEXT_BLOCK
+ bt gt,ppc_encrypt_cbc_loop
+ START_IV
+ SAVE_IV(rI0, 0)
+ SAVE_IV(rI1, 4)
+ SAVE_IV(rI2, 8)
+ SAVE_IV(rI3, 12)
+ FINALIZE_CRYPT(4)
+ blr
+
+/*
+ * ppc_decrypt_cbc(u8 *out, const u8 *in, u32 *key_dec,
+ * u32 rounds, u32 bytes, u8 *iv);
+ *
+ * called from glue layer to decrypt multiple blocks via CBC
+ * round values are AES128 = 4, AES192 = 5, AES256 = 6
+ *
+ */
+_GLOBAL(ppc_decrypt_cbc)
+ INITIALIZE_CRYPT(PPC_AES_4K_DECTAB, 4)
+ li rT1,15
+ LOAD_IV(rI0, 0)
+ andc rLN,rLN,rT1
+ LOAD_IV(rI1, 4)
+ subi rLN,rLN,16
+ LOAD_IV(rI2, 8)
+ add rSP,rSP,rLN /* reverse processing */
+ LOAD_IV(rI3, 12)
+ add rDP,rDP,rLN
+ LOAD_DATA(rD0, 0)
+ addi rT1,rT0,4096
+ LOAD_DATA(rD1, 4)
+ LOAD_DATA(rD2, 8)
+ LOAD_DATA(rD3, 12)
+ START_IV
+ SAVE_IV(rD0, 0)
+ SAVE_IV(rD1, 4)
+ SAVE_IV(rD2, 8)
+ cmpwi rLN,16
+ SAVE_IV(rD3, 12)
+ bt lt,ppc_decrypt_cbc_end
+ppc_decrypt_cbc_loop:
+ mr rKP,rKS
+ START_KEY(rD0, rD1, rD2, rD3)
+ bl ppc_decrypt_block
+ subi rLN,rLN,16
+ subi rSP,rSP,CBC_DEC
+ xor rW0,rD0,rW0
+ LOAD_DATA(rD0, 0)
+ xor rW1,rD1,rW1
+ LOAD_DATA(rD1, 4)
+ xor rW2,rD2,rW2
+ LOAD_DATA(rD2, 8)
+ xor rW3,rD3,rW3
+ LOAD_DATA(rD3, 12)
+ xor rW0,rW0,rD0
+ SAVE_DATA(rW0, 0)
+ xor rW1,rW1,rD1
+ SAVE_DATA(rW1, 4)
+ xor rW2,rW2,rD2
+ SAVE_DATA(rW2, 8)
+ xor rW3,rW3,rD3
+ SAVE_DATA(rW3, 12)
+ cmpwi rLN,15
+ subi rDP,rDP,CBC_DEC
+ bt gt,ppc_decrypt_cbc_loop
+ppc_decrypt_cbc_end:
+ mr rKP,rKS
+ START_KEY(rD0, rD1, rD2, rD3)
+ bl ppc_decrypt_block
+ xor rW0,rW0,rD0
+ xor rW1,rW1,rD1
+ xor rW2,rW2,rD2
+ xor rW3,rW3,rD3
+ xor rW0,rW0,rI0 /* decrypt with initial IV */
+ SAVE_DATA(rW0, 0)
+ xor rW1,rW1,rI1
+ SAVE_DATA(rW1, 4)
+ xor rW2,rW2,rI2
+ SAVE_DATA(rW2, 8)
+ xor rW3,rW3,rI3
+ SAVE_DATA(rW3, 12)
+ FINALIZE_CRYPT(4)
+ blr
+
+/*
+ * ppc_crypt_ctr(u8 *out, const u8 *in, u32 *key_enc,
+ * u32 rounds, u32 bytes, u8 *iv);
+ *
+ * called from glue layer to encrypt/decrypt multiple blocks
+ * via CTR. Number of bytes does not need to be a multiple of
+ * 16. Round values are AES128 = 4, AES192 = 5, AES256 = 6
+ *
+ */
+_GLOBAL(ppc_crypt_ctr)
+ INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 4)
+ LOAD_IV(rI0, 0)
+ LOAD_IV(rI1, 4)
+ LOAD_IV(rI2, 8)
+ cmpwi rLN,16
+ LOAD_IV(rI3, 12)
+ START_IV
+ bt lt,ppc_crypt_ctr_partial
+ppc_crypt_ctr_loop:
+ mr rKP,rKS
+ START_KEY(rI0, rI1, rI2, rI3)
+ bl ppc_encrypt_block
+ xor rW0,rD0,rW0
+ xor rW1,rD1,rW1
+ xor rW2,rD2,rW2
+ xor rW3,rD3,rW3
+ LOAD_DATA(rD0, 0)
+ subi rLN,rLN,16
+ LOAD_DATA(rD1, 4)
+ LOAD_DATA(rD2, 8)
+ LOAD_DATA(rD3, 12)
+ xor rD0,rD0,rW0
+ SAVE_DATA(rD0, 0)
+ xor rD1,rD1,rW1
+ SAVE_DATA(rD1, 4)
+ xor rD2,rD2,rW2
+ SAVE_DATA(rD2, 8)
+ xor rD3,rD3,rW3
+ SAVE_DATA(rD3, 12)
+ addic rI3,rI3,1 /* increase counter */
+ addze rI2,rI2
+ addze rI1,rI1
+ addze rI0,rI0
+ NEXT_BLOCK
+ cmpwi rLN,15
+ bt gt,ppc_crypt_ctr_loop
+ppc_crypt_ctr_partial:
+ cmpwi rLN,0
+ bt eq,ppc_crypt_ctr_end
+ mr rKP,rKS
+ START_KEY(rI0, rI1, rI2, rI3)
+ bl ppc_encrypt_block
+ xor rW0,rD0,rW0
+ SAVE_IV(rW0, 0)
+ xor rW1,rD1,rW1
+ SAVE_IV(rW1, 4)
+ xor rW2,rD2,rW2
+ SAVE_IV(rW2, 8)
+ xor rW3,rD3,rW3
+ SAVE_IV(rW3, 12)
+ mtctr rLN
+ subi rIP,rIP,CTR_DEC
+ subi rSP,rSP,1
+ subi rDP,rDP,1
+ppc_crypt_ctr_xorbyte:
+ lbzu rW4,1(rIP) /* bytewise xor for partial block */
+ lbzu rW5,1(rSP)
+ xor rW4,rW4,rW5
+ stbu rW4,1(rDP)
+ bdnz ppc_crypt_ctr_xorbyte
+ subf rIP,rLN,rIP
+ addi rIP,rIP,1
+ addic rI3,rI3,1
+ addze rI2,rI2
+ addze rI1,rI1
+ addze rI0,rI0
+ppc_crypt_ctr_end:
+ SAVE_IV(rI0, 0)
+ SAVE_IV(rI1, 4)
+ SAVE_IV(rI2, 8)
+ SAVE_IV(rI3, 12)
+ FINALIZE_CRYPT(4)
+ blr
+
+/*
+ * ppc_encrypt_xts(u8 *out, const u8 *in, u32 *key_enc,
+ * u32 rounds, u32 bytes, u8 *iv, u32 *key_twk);
+ *
+ * called from glue layer to encrypt multiple blocks via XTS
+ * If key_twk is given, the initial IV encryption will be
+ * processed too. Round values are AES128 = 4, AES192 = 5,
+ * AES256 = 6
+ *
+ */
+_GLOBAL(ppc_encrypt_xts)
+ INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 8)
+ LOAD_IV(rI0, 0)
+ LOAD_IV(rI1, 4)
+ LOAD_IV(rI2, 8)
+ cmpwi rKT,0
+ LOAD_IV(rI3, 12)
+ bt eq,ppc_encrypt_xts_notweak
+ mr rKP,rKT
+ START_KEY(rI0, rI1, rI2, rI3)
+ bl ppc_encrypt_block
+ xor rI0,rD0,rW0
+ xor rI1,rD1,rW1
+ xor rI2,rD2,rW2
+ xor rI3,rD3,rW3
+ppc_encrypt_xts_notweak:
+ ENDIAN_SWAP(rG0, rG1, rI0, rI1)
+ ENDIAN_SWAP(rG2, rG3, rI2, rI3)
+ppc_encrypt_xts_loop:
+ LOAD_DATA(rD0, 0)
+ mr rKP,rKS
+ LOAD_DATA(rD1, 4)
+ subi rLN,rLN,16
+ LOAD_DATA(rD2, 8)
+ LOAD_DATA(rD3, 12)
+ xor rD0,rD0,rI0
+ xor rD1,rD1,rI1
+ xor rD2,rD2,rI2
+ xor rD3,rD3,rI3
+ START_KEY(rD0, rD1, rD2, rD3)
+ bl ppc_encrypt_block
+ xor rD0,rD0,rW0
+ xor rD1,rD1,rW1
+ xor rD2,rD2,rW2
+ xor rD3,rD3,rW3
+ xor rD0,rD0,rI0
+ SAVE_DATA(rD0, 0)
+ xor rD1,rD1,rI1
+ SAVE_DATA(rD1, 4)
+ xor rD2,rD2,rI2
+ SAVE_DATA(rD2, 8)
+ xor rD3,rD3,rI3
+ SAVE_DATA(rD3, 12)
+ GF128_MUL(rG0, rG1, rG2, rG3, rW0)
+ ENDIAN_SWAP(rI0, rI1, rG0, rG1)
+ ENDIAN_SWAP(rI2, rI3, rG2, rG3)
+ cmpwi rLN,0
+ NEXT_BLOCK
+ bt gt,ppc_encrypt_xts_loop
+ START_IV
+ SAVE_IV(rI0, 0)
+ SAVE_IV(rI1, 4)
+ SAVE_IV(rI2, 8)
+ SAVE_IV(rI3, 12)
+ FINALIZE_CRYPT(8)
+ blr
+
+/*
+ * ppc_decrypt_xts(u8 *out, const u8 *in, u32 *key_dec,
+ * u32 rounds, u32 blocks, u8 *iv, u32 *key_twk);
+ *
+ * called from glue layer to decrypt multiple blocks via XTS
+ * If key_twk is given, the initial IV encryption will be
+ * processed too. Round values are AES128 = 4, AES192 = 5,
+ * AES256 = 6
+ *
+ */
+_GLOBAL(ppc_decrypt_xts)
+ INITIALIZE_CRYPT(PPC_AES_4K_DECTAB, 8)
+ LOAD_IV(rI0, 0)
+ addi rT1,rT0,4096
+ LOAD_IV(rI1, 4)
+ LOAD_IV(rI2, 8)
+ cmpwi rKT,0
+ LOAD_IV(rI3, 12)
+ bt eq,ppc_decrypt_xts_notweak
+ subi rT0,rT0,4096
+ mr rKP,rKT
+ START_KEY(rI0, rI1, rI2, rI3)
+ bl ppc_encrypt_block
+ xor rI0,rD0,rW0
+ xor rI1,rD1,rW1
+ xor rI2,rD2,rW2
+ xor rI3,rD3,rW3
+ addi rT0,rT0,4096
+ppc_decrypt_xts_notweak:
+ ENDIAN_SWAP(rG0, rG1, rI0, rI1)
+ ENDIAN_SWAP(rG2, rG3, rI2, rI3)
+ppc_decrypt_xts_loop:
+ LOAD_DATA(rD0, 0)
+ mr rKP,rKS
+ LOAD_DATA(rD1, 4)
+ subi rLN,rLN,16
+ LOAD_DATA(rD2, 8)
+ LOAD_DATA(rD3, 12)
+ xor rD0,rD0,rI0
+ xor rD1,rD1,rI1
+ xor rD2,rD2,rI2
+ xor rD3,rD3,rI3
+ START_KEY(rD0, rD1, rD2, rD3)
+ bl ppc_decrypt_block
+ xor rD0,rD0,rW0
+ xor rD1,rD1,rW1
+ xor rD2,rD2,rW2
+ xor rD3,rD3,rW3
+ xor rD0,rD0,rI0
+ SAVE_DATA(rD0, 0)
+ xor rD1,rD1,rI1
+ SAVE_DATA(rD1, 4)
+ xor rD2,rD2,rI2
+ SAVE_DATA(rD2, 8)
+ xor rD3,rD3,rI3
+ SAVE_DATA(rD3, 12)
+ GF128_MUL(rG0, rG1, rG2, rG3, rW0)
+ ENDIAN_SWAP(rI0, rI1, rG0, rG1)
+ ENDIAN_SWAP(rI2, rI3, rG2, rG3)
+ cmpwi rLN,0
+ NEXT_BLOCK
+ bt gt,ppc_decrypt_xts_loop
+ START_IV
+ SAVE_IV(rI0, 0)
+ SAVE_IV(rI1, 4)
+ SAVE_IV(rI2, 8)
+ SAVE_IV(rI3, 12)
+ FINALIZE_CRYPT(8)
+ blr
diff --git a/arch/powerpc/crypto/aes-spe-regs.h b/arch/powerpc/crypto/aes-spe-regs.h
new file mode 100644
index 000000000..2cc3a2caa
--- /dev/null
+++ b/arch/powerpc/crypto/aes-spe-regs.h
@@ -0,0 +1,42 @@
+/*
+ * Common registers for PPC AES implementation
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#define rKS r0 /* copy of en-/decryption key pointer */
+#define rDP r3 /* destination pointer */
+#define rSP r4 /* source pointer */
+#define rKP r5 /* pointer to en-/decryption key pointer */
+#define rRR r6 /* en-/decryption rounds */
+#define rLN r7 /* length of data to be processed */
+#define rIP r8 /* potiner to IV (CBC/CTR/XTS modes) */
+#define rKT r9 /* pointer to tweak key (XTS mode) */
+#define rT0 r11 /* pointers to en-/decryption tables */
+#define rT1 r10
+#define rD0 r9 /* data */
+#define rD1 r14
+#define rD2 r12
+#define rD3 r15
+#define rW0 r16 /* working registers */
+#define rW1 r17
+#define rW2 r18
+#define rW3 r19
+#define rW4 r20
+#define rW5 r21
+#define rW6 r22
+#define rW7 r23
+#define rI0 r24 /* IV */
+#define rI1 r25
+#define rI2 r26
+#define rI3 r27
+#define rG0 r28 /* endian reversed tweak (XTS mode) */
+#define rG1 r29
+#define rG2 r30
+#define rG3 r31
diff --git a/arch/powerpc/crypto/aes-tab-4k.S b/arch/powerpc/crypto/aes-tab-4k.S
new file mode 100644
index 000000000..701e60240
--- /dev/null
+++ b/arch/powerpc/crypto/aes-tab-4k.S
@@ -0,0 +1,331 @@
+/*
+ * 4K AES tables for PPC AES implementation
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+/*
+ * These big endian AES encryption/decryption tables have been taken from
+ * crypto/aes_generic.c and are designed to be simply accessed by a combination
+ * of rlwimi/lwz instructions with a minimum of table registers (usually only
+ * one required). Thus they are aligned to 4K. The locality of rotated values
+ * is derived from the reduced offsets that are available in the SPE load
+ * instructions. E.g. evldw, evlwwsplat, ...
+ *
+ * For the safety-conscious it has to be noted that they might be vulnerable
+ * to cache timing attacks because of their size. Nevertheless in contrast to
+ * the generic tables they have been reduced from 16KB to 8KB + 256 bytes.
+ * This is a quite good tradeoff for low power devices (e.g. routers) without
+ * dedicated encryption hardware where we usually have no multiuser
+ * environment.
+ *
+ */
+
+#define R(a, b, c, d) \
+ 0x##a##b##c##d, 0x##d##a##b##c, 0x##c##d##a##b, 0x##b##c##d##a
+
+.data
+.align 12
+.globl PPC_AES_4K_ENCTAB
+PPC_AES_4K_ENCTAB:
+/* encryption table, same as crypto_ft_tab in crypto/aes-generic.c */
+ .long R(c6, 63, 63, a5), R(f8, 7c, 7c, 84)
+ .long R(ee, 77, 77, 99), R(f6, 7b, 7b, 8d)
+ .long R(ff, f2, f2, 0d), R(d6, 6b, 6b, bd)
+ .long R(de, 6f, 6f, b1), R(91, c5, c5, 54)
+ .long R(60, 30, 30, 50), R(02, 01, 01, 03)
+ .long R(ce, 67, 67, a9), R(56, 2b, 2b, 7d)
+ .long R(e7, fe, fe, 19), R(b5, d7, d7, 62)
+ .long R(4d, ab, ab, e6), R(ec, 76, 76, 9a)
+ .long R(8f, ca, ca, 45), R(1f, 82, 82, 9d)
+ .long R(89, c9, c9, 40), R(fa, 7d, 7d, 87)
+ .long R(ef, fa, fa, 15), R(b2, 59, 59, eb)
+ .long R(8e, 47, 47, c9), R(fb, f0, f0, 0b)
+ .long R(41, ad, ad, ec), R(b3, d4, d4, 67)
+ .long R(5f, a2, a2, fd), R(45, af, af, ea)
+ .long R(23, 9c, 9c, bf), R(53, a4, a4, f7)
+ .long R(e4, 72, 72, 96), R(9b, c0, c0, 5b)
+ .long R(75, b7, b7, c2), R(e1, fd, fd, 1c)
+ .long R(3d, 93, 93, ae), R(4c, 26, 26, 6a)
+ .long R(6c, 36, 36, 5a), R(7e, 3f, 3f, 41)
+ .long R(f5, f7, f7, 02), R(83, cc, cc, 4f)
+ .long R(68, 34, 34, 5c), R(51, a5, a5, f4)
+ .long R(d1, e5, e5, 34), R(f9, f1, f1, 08)
+ .long R(e2, 71, 71, 93), R(ab, d8, d8, 73)
+ .long R(62, 31, 31, 53), R(2a, 15, 15, 3f)
+ .long R(08, 04, 04, 0c), R(95, c7, c7, 52)
+ .long R(46, 23, 23, 65), R(9d, c3, c3, 5e)
+ .long R(30, 18, 18, 28), R(37, 96, 96, a1)
+ .long R(0a, 05, 05, 0f), R(2f, 9a, 9a, b5)
+ .long R(0e, 07, 07, 09), R(24, 12, 12, 36)
+ .long R(1b, 80, 80, 9b), R(df, e2, e2, 3d)
+ .long R(cd, eb, eb, 26), R(4e, 27, 27, 69)
+ .long R(7f, b2, b2, cd), R(ea, 75, 75, 9f)
+ .long R(12, 09, 09, 1b), R(1d, 83, 83, 9e)
+ .long R(58, 2c, 2c, 74), R(34, 1a, 1a, 2e)
+ .long R(36, 1b, 1b, 2d), R(dc, 6e, 6e, b2)
+ .long R(b4, 5a, 5a, ee), R(5b, a0, a0, fb)
+ .long R(a4, 52, 52, f6), R(76, 3b, 3b, 4d)
+ .long R(b7, d6, d6, 61), R(7d, b3, b3, ce)
+ .long R(52, 29, 29, 7b), R(dd, e3, e3, 3e)
+ .long R(5e, 2f, 2f, 71), R(13, 84, 84, 97)
+ .long R(a6, 53, 53, f5), R(b9, d1, d1, 68)
+ .long R(00, 00, 00, 00), R(c1, ed, ed, 2c)
+ .long R(40, 20, 20, 60), R(e3, fc, fc, 1f)
+ .long R(79, b1, b1, c8), R(b6, 5b, 5b, ed)
+ .long R(d4, 6a, 6a, be), R(8d, cb, cb, 46)
+ .long R(67, be, be, d9), R(72, 39, 39, 4b)
+ .long R(94, 4a, 4a, de), R(98, 4c, 4c, d4)
+ .long R(b0, 58, 58, e8), R(85, cf, cf, 4a)
+ .long R(bb, d0, d0, 6b), R(c5, ef, ef, 2a)
+ .long R(4f, aa, aa, e5), R(ed, fb, fb, 16)
+ .long R(86, 43, 43, c5), R(9a, 4d, 4d, d7)
+ .long R(66, 33, 33, 55), R(11, 85, 85, 94)
+ .long R(8a, 45, 45, cf), R(e9, f9, f9, 10)
+ .long R(04, 02, 02, 06), R(fe, 7f, 7f, 81)
+ .long R(a0, 50, 50, f0), R(78, 3c, 3c, 44)
+ .long R(25, 9f, 9f, ba), R(4b, a8, a8, e3)
+ .long R(a2, 51, 51, f3), R(5d, a3, a3, fe)
+ .long R(80, 40, 40, c0), R(05, 8f, 8f, 8a)
+ .long R(3f, 92, 92, ad), R(21, 9d, 9d, bc)
+ .long R(70, 38, 38, 48), R(f1, f5, f5, 04)
+ .long R(63, bc, bc, df), R(77, b6, b6, c1)
+ .long R(af, da, da, 75), R(42, 21, 21, 63)
+ .long R(20, 10, 10, 30), R(e5, ff, ff, 1a)
+ .long R(fd, f3, f3, 0e), R(bf, d2, d2, 6d)
+ .long R(81, cd, cd, 4c), R(18, 0c, 0c, 14)
+ .long R(26, 13, 13, 35), R(c3, ec, ec, 2f)
+ .long R(be, 5f, 5f, e1), R(35, 97, 97, a2)
+ .long R(88, 44, 44, cc), R(2e, 17, 17, 39)
+ .long R(93, c4, c4, 57), R(55, a7, a7, f2)
+ .long R(fc, 7e, 7e, 82), R(7a, 3d, 3d, 47)
+ .long R(c8, 64, 64, ac), R(ba, 5d, 5d, e7)
+ .long R(32, 19, 19, 2b), R(e6, 73, 73, 95)
+ .long R(c0, 60, 60, a0), R(19, 81, 81, 98)
+ .long R(9e, 4f, 4f, d1), R(a3, dc, dc, 7f)
+ .long R(44, 22, 22, 66), R(54, 2a, 2a, 7e)
+ .long R(3b, 90, 90, ab), R(0b, 88, 88, 83)
+ .long R(8c, 46, 46, ca), R(c7, ee, ee, 29)
+ .long R(6b, b8, b8, d3), R(28, 14, 14, 3c)
+ .long R(a7, de, de, 79), R(bc, 5e, 5e, e2)
+ .long R(16, 0b, 0b, 1d), R(ad, db, db, 76)
+ .long R(db, e0, e0, 3b), R(64, 32, 32, 56)
+ .long R(74, 3a, 3a, 4e), R(14, 0a, 0a, 1e)
+ .long R(92, 49, 49, db), R(0c, 06, 06, 0a)
+ .long R(48, 24, 24, 6c), R(b8, 5c, 5c, e4)
+ .long R(9f, c2, c2, 5d), R(bd, d3, d3, 6e)
+ .long R(43, ac, ac, ef), R(c4, 62, 62, a6)
+ .long R(39, 91, 91, a8), R(31, 95, 95, a4)
+ .long R(d3, e4, e4, 37), R(f2, 79, 79, 8b)
+ .long R(d5, e7, e7, 32), R(8b, c8, c8, 43)
+ .long R(6e, 37, 37, 59), R(da, 6d, 6d, b7)
+ .long R(01, 8d, 8d, 8c), R(b1, d5, d5, 64)
+ .long R(9c, 4e, 4e, d2), R(49, a9, a9, e0)
+ .long R(d8, 6c, 6c, b4), R(ac, 56, 56, fa)
+ .long R(f3, f4, f4, 07), R(cf, ea, ea, 25)
+ .long R(ca, 65, 65, af), R(f4, 7a, 7a, 8e)
+ .long R(47, ae, ae, e9), R(10, 08, 08, 18)
+ .long R(6f, ba, ba, d5), R(f0, 78, 78, 88)
+ .long R(4a, 25, 25, 6f), R(5c, 2e, 2e, 72)
+ .long R(38, 1c, 1c, 24), R(57, a6, a6, f1)
+ .long R(73, b4, b4, c7), R(97, c6, c6, 51)
+ .long R(cb, e8, e8, 23), R(a1, dd, dd, 7c)
+ .long R(e8, 74, 74, 9c), R(3e, 1f, 1f, 21)
+ .long R(96, 4b, 4b, dd), R(61, bd, bd, dc)
+ .long R(0d, 8b, 8b, 86), R(0f, 8a, 8a, 85)
+ .long R(e0, 70, 70, 90), R(7c, 3e, 3e, 42)
+ .long R(71, b5, b5, c4), R(cc, 66, 66, aa)
+ .long R(90, 48, 48, d8), R(06, 03, 03, 05)
+ .long R(f7, f6, f6, 01), R(1c, 0e, 0e, 12)
+ .long R(c2, 61, 61, a3), R(6a, 35, 35, 5f)
+ .long R(ae, 57, 57, f9), R(69, b9, b9, d0)
+ .long R(17, 86, 86, 91), R(99, c1, c1, 58)
+ .long R(3a, 1d, 1d, 27), R(27, 9e, 9e, b9)
+ .long R(d9, e1, e1, 38), R(eb, f8, f8, 13)
+ .long R(2b, 98, 98, b3), R(22, 11, 11, 33)
+ .long R(d2, 69, 69, bb), R(a9, d9, d9, 70)
+ .long R(07, 8e, 8e, 89), R(33, 94, 94, a7)
+ .long R(2d, 9b, 9b, b6), R(3c, 1e, 1e, 22)
+ .long R(15, 87, 87, 92), R(c9, e9, e9, 20)
+ .long R(87, ce, ce, 49), R(aa, 55, 55, ff)
+ .long R(50, 28, 28, 78), R(a5, df, df, 7a)
+ .long R(03, 8c, 8c, 8f), R(59, a1, a1, f8)
+ .long R(09, 89, 89, 80), R(1a, 0d, 0d, 17)
+ .long R(65, bf, bf, da), R(d7, e6, e6, 31)
+ .long R(84, 42, 42, c6), R(d0, 68, 68, b8)
+ .long R(82, 41, 41, c3), R(29, 99, 99, b0)
+ .long R(5a, 2d, 2d, 77), R(1e, 0f, 0f, 11)
+ .long R(7b, b0, b0, cb), R(a8, 54, 54, fc)
+ .long R(6d, bb, bb, d6), R(2c, 16, 16, 3a)
+.globl PPC_AES_4K_DECTAB
+PPC_AES_4K_DECTAB:
+/* decryption table, same as crypto_it_tab in crypto/aes-generic.c */
+ .long R(51, f4, a7, 50), R(7e, 41, 65, 53)
+ .long R(1a, 17, a4, c3), R(3a, 27, 5e, 96)
+ .long R(3b, ab, 6b, cb), R(1f, 9d, 45, f1)
+ .long R(ac, fa, 58, ab), R(4b, e3, 03, 93)
+ .long R(20, 30, fa, 55), R(ad, 76, 6d, f6)
+ .long R(88, cc, 76, 91), R(f5, 02, 4c, 25)
+ .long R(4f, e5, d7, fc), R(c5, 2a, cb, d7)
+ .long R(26, 35, 44, 80), R(b5, 62, a3, 8f)
+ .long R(de, b1, 5a, 49), R(25, ba, 1b, 67)
+ .long R(45, ea, 0e, 98), R(5d, fe, c0, e1)
+ .long R(c3, 2f, 75, 02), R(81, 4c, f0, 12)
+ .long R(8d, 46, 97, a3), R(6b, d3, f9, c6)
+ .long R(03, 8f, 5f, e7), R(15, 92, 9c, 95)
+ .long R(bf, 6d, 7a, eb), R(95, 52, 59, da)
+ .long R(d4, be, 83, 2d), R(58, 74, 21, d3)
+ .long R(49, e0, 69, 29), R(8e, c9, c8, 44)
+ .long R(75, c2, 89, 6a), R(f4, 8e, 79, 78)
+ .long R(99, 58, 3e, 6b), R(27, b9, 71, dd)
+ .long R(be, e1, 4f, b6), R(f0, 88, ad, 17)
+ .long R(c9, 20, ac, 66), R(7d, ce, 3a, b4)
+ .long R(63, df, 4a, 18), R(e5, 1a, 31, 82)
+ .long R(97, 51, 33, 60), R(62, 53, 7f, 45)
+ .long R(b1, 64, 77, e0), R(bb, 6b, ae, 84)
+ .long R(fe, 81, a0, 1c), R(f9, 08, 2b, 94)
+ .long R(70, 48, 68, 58), R(8f, 45, fd, 19)
+ .long R(94, de, 6c, 87), R(52, 7b, f8, b7)
+ .long R(ab, 73, d3, 23), R(72, 4b, 02, e2)
+ .long R(e3, 1f, 8f, 57), R(66, 55, ab, 2a)
+ .long R(b2, eb, 28, 07), R(2f, b5, c2, 03)
+ .long R(86, c5, 7b, 9a), R(d3, 37, 08, a5)
+ .long R(30, 28, 87, f2), R(23, bf, a5, b2)
+ .long R(02, 03, 6a, ba), R(ed, 16, 82, 5c)
+ .long R(8a, cf, 1c, 2b), R(a7, 79, b4, 92)
+ .long R(f3, 07, f2, f0), R(4e, 69, e2, a1)
+ .long R(65, da, f4, cd), R(06, 05, be, d5)
+ .long R(d1, 34, 62, 1f), R(c4, a6, fe, 8a)
+ .long R(34, 2e, 53, 9d), R(a2, f3, 55, a0)
+ .long R(05, 8a, e1, 32), R(a4, f6, eb, 75)
+ .long R(0b, 83, ec, 39), R(40, 60, ef, aa)
+ .long R(5e, 71, 9f, 06), R(bd, 6e, 10, 51)
+ .long R(3e, 21, 8a, f9), R(96, dd, 06, 3d)
+ .long R(dd, 3e, 05, ae), R(4d, e6, bd, 46)
+ .long R(91, 54, 8d, b5), R(71, c4, 5d, 05)
+ .long R(04, 06, d4, 6f), R(60, 50, 15, ff)
+ .long R(19, 98, fb, 24), R(d6, bd, e9, 97)
+ .long R(89, 40, 43, cc), R(67, d9, 9e, 77)
+ .long R(b0, e8, 42, bd), R(07, 89, 8b, 88)
+ .long R(e7, 19, 5b, 38), R(79, c8, ee, db)
+ .long R(a1, 7c, 0a, 47), R(7c, 42, 0f, e9)
+ .long R(f8, 84, 1e, c9), R(00, 00, 00, 00)
+ .long R(09, 80, 86, 83), R(32, 2b, ed, 48)
+ .long R(1e, 11, 70, ac), R(6c, 5a, 72, 4e)
+ .long R(fd, 0e, ff, fb), R(0f, 85, 38, 56)
+ .long R(3d, ae, d5, 1e), R(36, 2d, 39, 27)
+ .long R(0a, 0f, d9, 64), R(68, 5c, a6, 21)
+ .long R(9b, 5b, 54, d1), R(24, 36, 2e, 3a)
+ .long R(0c, 0a, 67, b1), R(93, 57, e7, 0f)
+ .long R(b4, ee, 96, d2), R(1b, 9b, 91, 9e)
+ .long R(80, c0, c5, 4f), R(61, dc, 20, a2)
+ .long R(5a, 77, 4b, 69), R(1c, 12, 1a, 16)
+ .long R(e2, 93, ba, 0a), R(c0, a0, 2a, e5)
+ .long R(3c, 22, e0, 43), R(12, 1b, 17, 1d)
+ .long R(0e, 09, 0d, 0b), R(f2, 8b, c7, ad)
+ .long R(2d, b6, a8, b9), R(14, 1e, a9, c8)
+ .long R(57, f1, 19, 85), R(af, 75, 07, 4c)
+ .long R(ee, 99, dd, bb), R(a3, 7f, 60, fd)
+ .long R(f7, 01, 26, 9f), R(5c, 72, f5, bc)
+ .long R(44, 66, 3b, c5), R(5b, fb, 7e, 34)
+ .long R(8b, 43, 29, 76), R(cb, 23, c6, dc)
+ .long R(b6, ed, fc, 68), R(b8, e4, f1, 63)
+ .long R(d7, 31, dc, ca), R(42, 63, 85, 10)
+ .long R(13, 97, 22, 40), R(84, c6, 11, 20)
+ .long R(85, 4a, 24, 7d), R(d2, bb, 3d, f8)
+ .long R(ae, f9, 32, 11), R(c7, 29, a1, 6d)
+ .long R(1d, 9e, 2f, 4b), R(dc, b2, 30, f3)
+ .long R(0d, 86, 52, ec), R(77, c1, e3, d0)
+ .long R(2b, b3, 16, 6c), R(a9, 70, b9, 99)
+ .long R(11, 94, 48, fa), R(47, e9, 64, 22)
+ .long R(a8, fc, 8c, c4), R(a0, f0, 3f, 1a)
+ .long R(56, 7d, 2c, d8), R(22, 33, 90, ef)
+ .long R(87, 49, 4e, c7), R(d9, 38, d1, c1)
+ .long R(8c, ca, a2, fe), R(98, d4, 0b, 36)
+ .long R(a6, f5, 81, cf), R(a5, 7a, de, 28)
+ .long R(da, b7, 8e, 26), R(3f, ad, bf, a4)
+ .long R(2c, 3a, 9d, e4), R(50, 78, 92, 0d)
+ .long R(6a, 5f, cc, 9b), R(54, 7e, 46, 62)
+ .long R(f6, 8d, 13, c2), R(90, d8, b8, e8)
+ .long R(2e, 39, f7, 5e), R(82, c3, af, f5)
+ .long R(9f, 5d, 80, be), R(69, d0, 93, 7c)
+ .long R(6f, d5, 2d, a9), R(cf, 25, 12, b3)
+ .long R(c8, ac, 99, 3b), R(10, 18, 7d, a7)
+ .long R(e8, 9c, 63, 6e), R(db, 3b, bb, 7b)
+ .long R(cd, 26, 78, 09), R(6e, 59, 18, f4)
+ .long R(ec, 9a, b7, 01), R(83, 4f, 9a, a8)
+ .long R(e6, 95, 6e, 65), R(aa, ff, e6, 7e)
+ .long R(21, bc, cf, 08), R(ef, 15, e8, e6)
+ .long R(ba, e7, 9b, d9), R(4a, 6f, 36, ce)
+ .long R(ea, 9f, 09, d4), R(29, b0, 7c, d6)
+ .long R(31, a4, b2, af), R(2a, 3f, 23, 31)
+ .long R(c6, a5, 94, 30), R(35, a2, 66, c0)
+ .long R(74, 4e, bc, 37), R(fc, 82, ca, a6)
+ .long R(e0, 90, d0, b0), R(33, a7, d8, 15)
+ .long R(f1, 04, 98, 4a), R(41, ec, da, f7)
+ .long R(7f, cd, 50, 0e), R(17, 91, f6, 2f)
+ .long R(76, 4d, d6, 8d), R(43, ef, b0, 4d)
+ .long R(cc, aa, 4d, 54), R(e4, 96, 04, df)
+ .long R(9e, d1, b5, e3), R(4c, 6a, 88, 1b)
+ .long R(c1, 2c, 1f, b8), R(46, 65, 51, 7f)
+ .long R(9d, 5e, ea, 04), R(01, 8c, 35, 5d)
+ .long R(fa, 87, 74, 73), R(fb, 0b, 41, 2e)
+ .long R(b3, 67, 1d, 5a), R(92, db, d2, 52)
+ .long R(e9, 10, 56, 33), R(6d, d6, 47, 13)
+ .long R(9a, d7, 61, 8c), R(37, a1, 0c, 7a)
+ .long R(59, f8, 14, 8e), R(eb, 13, 3c, 89)
+ .long R(ce, a9, 27, ee), R(b7, 61, c9, 35)
+ .long R(e1, 1c, e5, ed), R(7a, 47, b1, 3c)
+ .long R(9c, d2, df, 59), R(55, f2, 73, 3f)
+ .long R(18, 14, ce, 79), R(73, c7, 37, bf)
+ .long R(53, f7, cd, ea), R(5f, fd, aa, 5b)
+ .long R(df, 3d, 6f, 14), R(78, 44, db, 86)
+ .long R(ca, af, f3, 81), R(b9, 68, c4, 3e)
+ .long R(38, 24, 34, 2c), R(c2, a3, 40, 5f)
+ .long R(16, 1d, c3, 72), R(bc, e2, 25, 0c)
+ .long R(28, 3c, 49, 8b), R(ff, 0d, 95, 41)
+ .long R(39, a8, 01, 71), R(08, 0c, b3, de)
+ .long R(d8, b4, e4, 9c), R(64, 56, c1, 90)
+ .long R(7b, cb, 84, 61), R(d5, 32, b6, 70)
+ .long R(48, 6c, 5c, 74), R(d0, b8, 57, 42)
+.globl PPC_AES_4K_DECTAB2
+PPC_AES_4K_DECTAB2:
+/* decryption table, same as crypto_il_tab in crypto/aes-generic.c */
+ .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
diff --git a/arch/powerpc/crypto/crc-vpmsum_test.c b/arch/powerpc/crypto/crc-vpmsum_test.c
new file mode 100644
index 000000000..0153a9c6f
--- /dev/null
+++ b/arch/powerpc/crypto/crc-vpmsum_test.c
@@ -0,0 +1,137 @@
+/*
+ * CRC vpmsum tester
+ * Copyright 2017 Daniel Axtens, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/switch_to.h>
+
+static unsigned long iterations = 10000;
+
+#define MAX_CRC_LENGTH 65535
+
+
+static int __init crc_test_init(void)
+{
+ u16 crc16 = 0, verify16 = 0;
+ u32 crc32 = 0, verify32 = 0;
+ __le32 verify32le = 0;
+ unsigned char *data;
+ unsigned long i;
+ int ret;
+
+ struct crypto_shash *crct10dif_tfm;
+ struct crypto_shash *crc32c_tfm;
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+ return -ENODEV;
+
+ data = kmalloc(MAX_CRC_LENGTH, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0);
+
+ if (IS_ERR(crct10dif_tfm)) {
+ pr_err("Error allocating crc-t10dif\n");
+ goto free_buf;
+ }
+
+ crc32c_tfm = crypto_alloc_shash("crc32c", 0, 0);
+
+ if (IS_ERR(crc32c_tfm)) {
+ pr_err("Error allocating crc32c\n");
+ goto free_16;
+ }
+
+ do {
+ SHASH_DESC_ON_STACK(crct10dif_shash, crct10dif_tfm);
+ SHASH_DESC_ON_STACK(crc32c_shash, crc32c_tfm);
+
+ crct10dif_shash->tfm = crct10dif_tfm;
+ ret = crypto_shash_init(crct10dif_shash);
+
+ if (ret) {
+ pr_err("Error initing crc-t10dif\n");
+ goto free_32;
+ }
+
+
+ crc32c_shash->tfm = crc32c_tfm;
+ ret = crypto_shash_init(crc32c_shash);
+
+ if (ret) {
+ pr_err("Error initing crc32c\n");
+ goto free_32;
+ }
+
+ pr_info("crc-vpmsum_test begins, %lu iterations\n", iterations);
+ for (i=0; i<iterations; i++) {
+ size_t len, offset;
+
+ get_random_bytes(data, MAX_CRC_LENGTH);
+ get_random_bytes(&len, sizeof(len));
+ get_random_bytes(&offset, sizeof(offset));
+
+ len %= MAX_CRC_LENGTH;
+ offset &= 15;
+ if (len <= offset)
+ continue;
+ len -= offset;
+
+ crypto_shash_update(crct10dif_shash, data+offset, len);
+ crypto_shash_final(crct10dif_shash, (u8 *)(&crc16));
+ verify16 = crc_t10dif_generic(verify16, data+offset, len);
+
+
+ if (crc16 != verify16) {
+ pr_err("FAILURE in CRC16: got 0x%04x expected 0x%04x (len %lu)\n",
+ crc16, verify16, len);
+ break;
+ }
+
+ crypto_shash_update(crc32c_shash, data+offset, len);
+ crypto_shash_final(crc32c_shash, (u8 *)(&crc32));
+ verify32 = le32_to_cpu(verify32le);
+ verify32le = ~cpu_to_le32(__crc32c_le(~verify32, data+offset, len));
+ if (crc32 != (u32)verify32le) {
+ pr_err("FAILURE in CRC32: got 0x%08x expected 0x%08x (len %lu)\n",
+ crc32, verify32, len);
+ break;
+ }
+ }
+ pr_info("crc-vpmsum_test done, completed %lu iterations\n", i);
+ } while (0);
+
+free_32:
+ crypto_free_shash(crc32c_tfm);
+
+free_16:
+ crypto_free_shash(crct10dif_tfm);
+
+free_buf:
+ kfree(data);
+
+ return 0;
+}
+
+static void __exit crc_test_exit(void) {}
+
+module_init(crc_test_init);
+module_exit(crc_test_exit);
+module_param(iterations, long, 0400);
+
+MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
+MODULE_DESCRIPTION("Vector polynomial multiply-sum CRC tester");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S
new file mode 100644
index 000000000..aadb59c96
--- /dev/null
+++ b/arch/powerpc/crypto/crc32-vpmsum_core.S
@@ -0,0 +1,755 @@
+/*
+ * Core of the accelerated CRC algorithm.
+ * In your file, define the constants and CRC_FUNCTION_NAME
+ * Then include this file.
+ *
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+*/
+
+#include <asm/ppc_asm.h>
+#include <asm/ppc-opcode.h>
+
+#define MAX_SIZE 32768
+
+ .text
+
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
+#define BYTESWAP_DATA
+#else
+#undef BYTESWAP_DATA
+#endif
+
+#define off16 r25
+#define off32 r26
+#define off48 r27
+#define off64 r28
+#define off80 r29
+#define off96 r30
+#define off112 r31
+
+#define const1 v24
+#define const2 v25
+
+#define byteswap v26
+#define mask_32bit v27
+#define mask_64bit v28
+#define zeroes v29
+
+#ifdef BYTESWAP_DATA
+#define VPERM(A, B, C, D) vperm A, B, C, D
+#else
+#define VPERM(A, B, C, D)
+#endif
+
+/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
+FUNC_START(CRC_FUNCTION_NAME)
+ std r31,-8(r1)
+ std r30,-16(r1)
+ std r29,-24(r1)
+ std r28,-32(r1)
+ std r27,-40(r1)
+ std r26,-48(r1)
+ std r25,-56(r1)
+
+ li off16,16
+ li off32,32
+ li off48,48
+ li off64,64
+ li off80,80
+ li off96,96
+ li off112,112
+ li r0,0
+
+ /* Enough room for saving 10 non volatile VMX registers */
+ subi r6,r1,56+10*16
+ subi r7,r1,56+2*16
+
+ stvx v20,0,r6
+ stvx v21,off16,r6
+ stvx v22,off32,r6
+ stvx v23,off48,r6
+ stvx v24,off64,r6
+ stvx v25,off80,r6
+ stvx v26,off96,r6
+ stvx v27,off112,r6
+ stvx v28,0,r7
+ stvx v29,off16,r7
+
+ mr r10,r3
+
+ vxor zeroes,zeroes,zeroes
+ vspltisw v0,-1
+
+ vsldoi mask_32bit,zeroes,v0,4
+ vsldoi mask_64bit,zeroes,v0,8
+
+ /* Get the initial value into v8 */
+ vxor v8,v8,v8
+ MTVRD(v8, R3)
+#ifdef REFLECT
+ vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
+#else
+ vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
+#endif
+
+#ifdef BYTESWAP_DATA
+ addis r3,r2,.byteswap_constant@toc@ha
+ addi r3,r3,.byteswap_constant@toc@l
+
+ lvx byteswap,0,r3
+ addi r3,r3,16
+#endif
+
+ cmpdi r5,256
+ blt .Lshort
+
+ rldicr r6,r5,0,56
+
+ /* Checksum in blocks of MAX_SIZE */
+1: lis r7,MAX_SIZE@h
+ ori r7,r7,MAX_SIZE@l
+ mr r9,r7
+ cmpd r6,r7
+ bgt 2f
+ mr r7,r6
+2: subf r6,r7,r6
+
+ /* our main loop does 128 bytes at a time */
+ srdi r7,r7,7
+
+ /*
+ * Work out the offset into the constants table to start at. Each
+ * constant is 16 bytes, and it is used against 128 bytes of input
+ * data - 128 / 16 = 8
+ */
+ sldi r8,r7,4
+ srdi r9,r9,3
+ subf r8,r8,r9
+
+ /* We reduce our final 128 bytes in a separate step */
+ addi r7,r7,-1
+ mtctr r7
+
+ addis r3,r2,.constants@toc@ha
+ addi r3,r3,.constants@toc@l
+
+ /* Find the start of our constants */
+ add r3,r3,r8
+
+ /* zero v0-v7 which will contain our checksums */
+ vxor v0,v0,v0
+ vxor v1,v1,v1
+ vxor v2,v2,v2
+ vxor v3,v3,v3
+ vxor v4,v4,v4
+ vxor v5,v5,v5
+ vxor v6,v6,v6
+ vxor v7,v7,v7
+
+ lvx const1,0,r3
+
+ /*
+ * If we are looping back to consume more data we use the values
+ * already in v16-v23.
+ */
+ cmpdi r0,1
+ beq 2f
+
+ /* First warm up pass */
+ lvx v16,0,r4
+ lvx v17,off16,r4
+ VPERM(v16,v16,v16,byteswap)
+ VPERM(v17,v17,v17,byteswap)
+ lvx v18,off32,r4
+ lvx v19,off48,r4
+ VPERM(v18,v18,v18,byteswap)
+ VPERM(v19,v19,v19,byteswap)
+ lvx v20,off64,r4
+ lvx v21,off80,r4
+ VPERM(v20,v20,v20,byteswap)
+ VPERM(v21,v21,v21,byteswap)
+ lvx v22,off96,r4
+ lvx v23,off112,r4
+ VPERM(v22,v22,v22,byteswap)
+ VPERM(v23,v23,v23,byteswap)
+ addi r4,r4,8*16
+
+ /* xor in initial value */
+ vxor v16,v16,v8
+
+2: bdz .Lfirst_warm_up_done
+
+ addi r3,r3,16
+ lvx const2,0,r3
+
+ /* Second warm up pass */
+ VPMSUMD(v8,v16,const1)
+ lvx v16,0,r4
+ VPERM(v16,v16,v16,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v9,v17,const1)
+ lvx v17,off16,r4
+ VPERM(v17,v17,v17,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v10,v18,const1)
+ lvx v18,off32,r4
+ VPERM(v18,v18,v18,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v11,v19,const1)
+ lvx v19,off48,r4
+ VPERM(v19,v19,v19,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v12,v20,const1)
+ lvx v20,off64,r4
+ VPERM(v20,v20,v20,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v13,v21,const1)
+ lvx v21,off80,r4
+ VPERM(v21,v21,v21,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v14,v22,const1)
+ lvx v22,off96,r4
+ VPERM(v22,v22,v22,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v15,v23,const1)
+ lvx v23,off112,r4
+ VPERM(v23,v23,v23,byteswap)
+
+ addi r4,r4,8*16
+
+ bdz .Lfirst_cool_down
+
+ /*
+ * main loop. We modulo schedule it such that it takes three iterations
+ * to complete - first iteration load, second iteration vpmsum, third
+ * iteration xor.
+ */
+ .balign 16
+4: lvx const1,0,r3
+ addi r3,r3,16
+ ori r2,r2,0
+
+ vxor v0,v0,v8
+ VPMSUMD(v8,v16,const2)
+ lvx v16,0,r4
+ VPERM(v16,v16,v16,byteswap)
+ ori r2,r2,0
+
+ vxor v1,v1,v9
+ VPMSUMD(v9,v17,const2)
+ lvx v17,off16,r4
+ VPERM(v17,v17,v17,byteswap)
+ ori r2,r2,0
+
+ vxor v2,v2,v10
+ VPMSUMD(v10,v18,const2)
+ lvx v18,off32,r4
+ VPERM(v18,v18,v18,byteswap)
+ ori r2,r2,0
+
+ vxor v3,v3,v11
+ VPMSUMD(v11,v19,const2)
+ lvx v19,off48,r4
+ VPERM(v19,v19,v19,byteswap)
+ lvx const2,0,r3
+ ori r2,r2,0
+
+ vxor v4,v4,v12
+ VPMSUMD(v12,v20,const1)
+ lvx v20,off64,r4
+ VPERM(v20,v20,v20,byteswap)
+ ori r2,r2,0
+
+ vxor v5,v5,v13
+ VPMSUMD(v13,v21,const1)
+ lvx v21,off80,r4
+ VPERM(v21,v21,v21,byteswap)
+ ori r2,r2,0
+
+ vxor v6,v6,v14
+ VPMSUMD(v14,v22,const1)
+ lvx v22,off96,r4
+ VPERM(v22,v22,v22,byteswap)
+ ori r2,r2,0
+
+ vxor v7,v7,v15
+ VPMSUMD(v15,v23,const1)
+ lvx v23,off112,r4
+ VPERM(v23,v23,v23,byteswap)
+
+ addi r4,r4,8*16
+
+ bdnz 4b
+
+.Lfirst_cool_down:
+ /* First cool down pass */
+ lvx const1,0,r3
+ addi r3,r3,16
+
+ vxor v0,v0,v8
+ VPMSUMD(v8,v16,const1)
+ ori r2,r2,0
+
+ vxor v1,v1,v9
+ VPMSUMD(v9,v17,const1)
+ ori r2,r2,0
+
+ vxor v2,v2,v10
+ VPMSUMD(v10,v18,const1)
+ ori r2,r2,0
+
+ vxor v3,v3,v11
+ VPMSUMD(v11,v19,const1)
+ ori r2,r2,0
+
+ vxor v4,v4,v12
+ VPMSUMD(v12,v20,const1)
+ ori r2,r2,0
+
+ vxor v5,v5,v13
+ VPMSUMD(v13,v21,const1)
+ ori r2,r2,0
+
+ vxor v6,v6,v14
+ VPMSUMD(v14,v22,const1)
+ ori r2,r2,0
+
+ vxor v7,v7,v15
+ VPMSUMD(v15,v23,const1)
+ ori r2,r2,0
+
+.Lsecond_cool_down:
+ /* Second cool down pass */
+ vxor v0,v0,v8
+ vxor v1,v1,v9
+ vxor v2,v2,v10
+ vxor v3,v3,v11
+ vxor v4,v4,v12
+ vxor v5,v5,v13
+ vxor v6,v6,v14
+ vxor v7,v7,v15
+
+#ifdef REFLECT
+ /*
+ * vpmsumd produces a 96 bit result in the least significant bits
+ * of the register. Since we are bit reflected we have to shift it
+ * left 32 bits so it occupies the least significant bits in the
+ * bit reflected domain.
+ */
+ vsldoi v0,v0,zeroes,4
+ vsldoi v1,v1,zeroes,4
+ vsldoi v2,v2,zeroes,4
+ vsldoi v3,v3,zeroes,4
+ vsldoi v4,v4,zeroes,4
+ vsldoi v5,v5,zeroes,4
+ vsldoi v6,v6,zeroes,4
+ vsldoi v7,v7,zeroes,4
+#endif
+
+ /* xor with last 1024 bits */
+ lvx v8,0,r4
+ lvx v9,off16,r4
+ VPERM(v8,v8,v8,byteswap)
+ VPERM(v9,v9,v9,byteswap)
+ lvx v10,off32,r4
+ lvx v11,off48,r4
+ VPERM(v10,v10,v10,byteswap)
+ VPERM(v11,v11,v11,byteswap)
+ lvx v12,off64,r4
+ lvx v13,off80,r4
+ VPERM(v12,v12,v12,byteswap)
+ VPERM(v13,v13,v13,byteswap)
+ lvx v14,off96,r4
+ lvx v15,off112,r4
+ VPERM(v14,v14,v14,byteswap)
+ VPERM(v15,v15,v15,byteswap)
+
+ addi r4,r4,8*16
+
+ vxor v16,v0,v8
+ vxor v17,v1,v9
+ vxor v18,v2,v10
+ vxor v19,v3,v11
+ vxor v20,v4,v12
+ vxor v21,v5,v13
+ vxor v22,v6,v14
+ vxor v23,v7,v15
+
+ li r0,1
+ cmpdi r6,0
+ addi r6,r6,128
+ bne 1b
+
+ /* Work out how many bytes we have left */
+ andi. r5,r5,127
+
+ /* Calculate where in the constant table we need to start */
+ subfic r6,r5,128
+ add r3,r3,r6
+
+ /* How many 16 byte chunks are in the tail */
+ srdi r7,r5,4
+ mtctr r7
+
+ /*
+ * Reduce the previously calculated 1024 bits to 64 bits, shifting
+ * 32 bits to include the trailing 32 bits of zeros
+ */
+ lvx v0,0,r3
+ lvx v1,off16,r3
+ lvx v2,off32,r3
+ lvx v3,off48,r3
+ lvx v4,off64,r3
+ lvx v5,off80,r3
+ lvx v6,off96,r3
+ lvx v7,off112,r3
+ addi r3,r3,8*16
+
+ VPMSUMW(v0,v16,v0)
+ VPMSUMW(v1,v17,v1)
+ VPMSUMW(v2,v18,v2)
+ VPMSUMW(v3,v19,v3)
+ VPMSUMW(v4,v20,v4)
+ VPMSUMW(v5,v21,v5)
+ VPMSUMW(v6,v22,v6)
+ VPMSUMW(v7,v23,v7)
+
+ /* Now reduce the tail (0 - 112 bytes) */
+ cmpdi r7,0
+ beq 1f
+
+ lvx v16,0,r4
+ lvx v17,0,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off16,r4
+ lvx v17,off16,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off32,r4
+ lvx v17,off32,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off48,r4
+ lvx v17,off48,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off64,r4
+ lvx v17,off64,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off80,r4
+ lvx v17,off80,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off96,r4
+ lvx v17,off96,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+
+ /* Now xor all the parallel chunks together */
+1: vxor v0,v0,v1
+ vxor v2,v2,v3
+ vxor v4,v4,v5
+ vxor v6,v6,v7
+
+ vxor v0,v0,v2
+ vxor v4,v4,v6
+
+ vxor v0,v0,v4
+
+.Lbarrett_reduction:
+ /* Barrett constants */
+ addis r3,r2,.barrett_constants@toc@ha
+ addi r3,r3,.barrett_constants@toc@l
+
+ lvx const1,0,r3
+ lvx const2,off16,r3
+
+ vsldoi v1,v0,v0,8
+ vxor v0,v0,v1 /* xor two 64 bit results together */
+
+#ifdef REFLECT
+ /* shift left one bit */
+ vspltisb v1,1
+ vsl v0,v0,v1
+#endif
+
+ vand v0,v0,mask_64bit
+#ifndef REFLECT
+ /*
+ * Now for the Barrett reduction algorithm. The idea is to calculate q,
+ * the multiple of our polynomial that we need to subtract. By
+ * doing the computation 2x bits higher (ie 64 bits) and shifting the
+ * result back down 2x bits, we round down to the nearest multiple.
+ */
+ VPMSUMD(v1,v0,const1) /* ma */
+ vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
+ VPMSUMD(v1,v1,const2) /* qn */
+ vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Get the result into r3. We need to shift it left 8 bytes:
+ * V0 [ 0 1 2 X ]
+ * V0 [ 0 X 2 3 ]
+ */
+ vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
+#else
+ /*
+ * The reflected version of Barrett reduction. Instead of bit
+ * reflecting our data (which is expensive to do), we bit reflect our
+ * constants and our algorithm, which means the intermediate data in
+ * our vector registers goes from 0-63 instead of 63-0. We can reflect
+ * the algorithm because we don't carry in mod 2 arithmetic.
+ */
+ vand v1,v0,mask_32bit /* bottom 32 bits of a */
+ VPMSUMD(v1,v1,const1) /* ma */
+ vand v1,v1,mask_32bit /* bottom 32bits of ma */
+ VPMSUMD(v1,v1,const2) /* qn */
+ vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Since we are bit reflected, the result (ie the low 32 bits) is in
+ * the high 32 bits. We just need to shift it left 4 bytes
+ * V0 [ 0 1 X 3 ]
+ * V0 [ 0 X 2 3 ]
+ */
+ vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */
+#endif
+
+ /* Get it into r3 */
+ MFVRD(R3, v0)
+
+.Lout:
+ subi r6,r1,56+10*16
+ subi r7,r1,56+2*16
+
+ lvx v20,0,r6
+ lvx v21,off16,r6
+ lvx v22,off32,r6
+ lvx v23,off48,r6
+ lvx v24,off64,r6
+ lvx v25,off80,r6
+ lvx v26,off96,r6
+ lvx v27,off112,r6
+ lvx v28,0,r7
+ lvx v29,off16,r7
+
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+ ld r26,-48(r1)
+ ld r25,-56(r1)
+
+ blr
+
+.Lfirst_warm_up_done:
+ lvx const1,0,r3
+ addi r3,r3,16
+
+ VPMSUMD(v8,v16,const1)
+ VPMSUMD(v9,v17,const1)
+ VPMSUMD(v10,v18,const1)
+ VPMSUMD(v11,v19,const1)
+ VPMSUMD(v12,v20,const1)
+ VPMSUMD(v13,v21,const1)
+ VPMSUMD(v14,v22,const1)
+ VPMSUMD(v15,v23,const1)
+
+ b .Lsecond_cool_down
+
+.Lshort:
+ cmpdi r5,0
+ beq .Lzero
+
+ addis r3,r2,.short_constants@toc@ha
+ addi r3,r3,.short_constants@toc@l
+
+ /* Calculate where in the constant table we need to start */
+ subfic r6,r5,256
+ add r3,r3,r6
+
+ /* How many 16 byte chunks? */
+ srdi r7,r5,4
+ mtctr r7
+
+ vxor v19,v19,v19
+ vxor v20,v20,v20
+
+ lvx v0,0,r4
+ lvx v16,0,r3
+ VPERM(v0,v0,v16,byteswap)
+ vxor v0,v0,v8 /* xor in initial value */
+ VPMSUMW(v0,v0,v16)
+ bdz .Lv0
+
+ lvx v1,off16,r4
+ lvx v17,off16,r3
+ VPERM(v1,v1,v17,byteswap)
+ VPMSUMW(v1,v1,v17)
+ bdz .Lv1
+
+ lvx v2,off32,r4
+ lvx v16,off32,r3
+ VPERM(v2,v2,v16,byteswap)
+ VPMSUMW(v2,v2,v16)
+ bdz .Lv2
+
+ lvx v3,off48,r4
+ lvx v17,off48,r3
+ VPERM(v3,v3,v17,byteswap)
+ VPMSUMW(v3,v3,v17)
+ bdz .Lv3
+
+ lvx v4,off64,r4
+ lvx v16,off64,r3
+ VPERM(v4,v4,v16,byteswap)
+ VPMSUMW(v4,v4,v16)
+ bdz .Lv4
+
+ lvx v5,off80,r4
+ lvx v17,off80,r3
+ VPERM(v5,v5,v17,byteswap)
+ VPMSUMW(v5,v5,v17)
+ bdz .Lv5
+
+ lvx v6,off96,r4
+ lvx v16,off96,r3
+ VPERM(v6,v6,v16,byteswap)
+ VPMSUMW(v6,v6,v16)
+ bdz .Lv6
+
+ lvx v7,off112,r4
+ lvx v17,off112,r3
+ VPERM(v7,v7,v17,byteswap)
+ VPMSUMW(v7,v7,v17)
+ bdz .Lv7
+
+ addi r3,r3,128
+ addi r4,r4,128
+
+ lvx v8,0,r4
+ lvx v16,0,r3
+ VPERM(v8,v8,v16,byteswap)
+ VPMSUMW(v8,v8,v16)
+ bdz .Lv8
+
+ lvx v9,off16,r4
+ lvx v17,off16,r3
+ VPERM(v9,v9,v17,byteswap)
+ VPMSUMW(v9,v9,v17)
+ bdz .Lv9
+
+ lvx v10,off32,r4
+ lvx v16,off32,r3
+ VPERM(v10,v10,v16,byteswap)
+ VPMSUMW(v10,v10,v16)
+ bdz .Lv10
+
+ lvx v11,off48,r4
+ lvx v17,off48,r3
+ VPERM(v11,v11,v17,byteswap)
+ VPMSUMW(v11,v11,v17)
+ bdz .Lv11
+
+ lvx v12,off64,r4
+ lvx v16,off64,r3
+ VPERM(v12,v12,v16,byteswap)
+ VPMSUMW(v12,v12,v16)
+ bdz .Lv12
+
+ lvx v13,off80,r4
+ lvx v17,off80,r3
+ VPERM(v13,v13,v17,byteswap)
+ VPMSUMW(v13,v13,v17)
+ bdz .Lv13
+
+ lvx v14,off96,r4
+ lvx v16,off96,r3
+ VPERM(v14,v14,v16,byteswap)
+ VPMSUMW(v14,v14,v16)
+ bdz .Lv14
+
+ lvx v15,off112,r4
+ lvx v17,off112,r3
+ VPERM(v15,v15,v17,byteswap)
+ VPMSUMW(v15,v15,v17)
+
+.Lv15: vxor v19,v19,v15
+.Lv14: vxor v20,v20,v14
+.Lv13: vxor v19,v19,v13
+.Lv12: vxor v20,v20,v12
+.Lv11: vxor v19,v19,v11
+.Lv10: vxor v20,v20,v10
+.Lv9: vxor v19,v19,v9
+.Lv8: vxor v20,v20,v8
+.Lv7: vxor v19,v19,v7
+.Lv6: vxor v20,v20,v6
+.Lv5: vxor v19,v19,v5
+.Lv4: vxor v20,v20,v4
+.Lv3: vxor v19,v19,v3
+.Lv2: vxor v20,v20,v2
+.Lv1: vxor v19,v19,v1
+.Lv0: vxor v20,v20,v0
+
+ vxor v0,v19,v20
+
+ b .Lbarrett_reduction
+
+.Lzero:
+ mr r3,r10
+ b .Lout
+
+FUNC_END(CRC_FUNCTION_NAME)
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_asm.S b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
new file mode 100644
index 000000000..d2bea4805
--- /dev/null
+++ b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
@@ -0,0 +1,846 @@
+/*
+ * Calculate a crc32c with vpmsum acceleration
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+ .section .rodata
+.balign 16
+
+.byteswap_constant:
+ /* byte reverse permute constant */
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
+.constants:
+
+ /* Reduce 262144 kbits to 1024 bits */
+ /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+ .octa 0x00000000b6ca9e20000000009c37c408
+
+ /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+ .octa 0x00000000350249a800000001b51df26c
+
+ /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+ .octa 0x00000001862dac54000000000724b9d0
+
+ /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+ .octa 0x00000001d87fb48c00000001c00532fe
+
+ /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+ .octa 0x00000001f39b699e00000000f05a9362
+
+ /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+ .octa 0x0000000101da11b400000001e1007970
+
+ /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+ .octa 0x00000001cab571e000000000a57366ee
+
+ /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+ .octa 0x00000000c7020cfe0000000192011284
+
+ /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+ .octa 0x00000000cdaed1ae0000000162716d9a
+
+ /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+ .octa 0x00000001e804effc00000000cd97ecde
+
+ /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+ .octa 0x0000000077c3ea3a0000000058812bc0
+
+ /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+ .octa 0x0000000068df31b40000000088b8c12e
+
+ /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+ .octa 0x00000000b059b6c200000001230b234c
+
+ /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+ .octa 0x0000000145fb8ed800000001120b416e
+
+ /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+ .octa 0x00000000cbc0916800000001974aecb0
+
+ /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+ .octa 0x000000005ceeedc2000000008ee3f226
+
+ /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+ .octa 0x0000000047d74e8600000001089aba9a
+
+ /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+ .octa 0x00000001407e9e220000000065113872
+
+ /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+ .octa 0x00000001da967bda000000005c07ec10
+
+ /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+ .octa 0x000000006c8983680000000187590924
+
+ /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+ .octa 0x00000000f2d14c9800000000e35da7c6
+
+ /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+ .octa 0x00000001993c6ad4000000000415855a
+
+ /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+ .octa 0x000000014683d1ac0000000073617758
+
+ /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+ .octa 0x00000001a7c93e6c0000000176021d28
+
+ /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+ .octa 0x000000010211e90a00000001c358fd0a
+
+ /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+ .octa 0x000000001119403e00000001ff7a2c18
+
+ /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+ .octa 0x000000001c3261aa00000000f2d9f7e4
+
+ /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+ .octa 0x000000014e37a634000000016cf1f9c8
+
+ /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+ .octa 0x0000000073786c0c000000010af9279a
+
+ /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+ .octa 0x000000011dc037f80000000004f101e8
+
+ /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+ .octa 0x0000000031433dfc0000000070bcf184
+
+ /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+ .octa 0x000000009cde8348000000000a8de642
+
+ /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+ .octa 0x0000000038d3c2a60000000062ea130c
+
+ /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+ .octa 0x000000011b25f26000000001eb31cbb2
+
+ /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+ .octa 0x000000001629e6f00000000170783448
+
+ /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+ .octa 0x0000000160838b4c00000001a684b4c6
+
+ /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+ .octa 0x000000007a44011c00000000253ca5b4
+
+ /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+ .octa 0x00000000226f417a0000000057b4b1e2
+
+ /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+ .octa 0x0000000045eb2eb400000000b6bd084c
+
+ /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+ .octa 0x000000014459d70c0000000123c2d592
+
+ /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+ .octa 0x00000001d406ed8200000000159dafce
+
+ /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+ .octa 0x0000000160c8e1a80000000127e1a64e
+
+ /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+ .octa 0x0000000027ba80980000000056860754
+
+ /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+ .octa 0x000000006d92d01800000001e661aae8
+
+ /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+ .octa 0x000000012ed7e3f200000000f82c6166
+
+ /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+ .octa 0x000000002dc8778800000000c4f9c7ae
+
+ /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+ .octa 0x0000000018240bb80000000074203d20
+
+ /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+ .octa 0x000000001ad381580000000198173052
+
+ /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+ .octa 0x00000001396b78f200000001ce8aba54
+
+ /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+ .octa 0x000000011a68133400000001850d5d94
+
+ /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+ .octa 0x000000012104732e00000001d609239c
+
+ /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+ .octa 0x00000000a140d90c000000001595f048
+
+ /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+ .octa 0x00000001b7215eda0000000042ccee08
+
+ /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+ .octa 0x00000001aaf1df3c000000010a389d74
+
+ /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+ .octa 0x0000000029d15b8a000000012a840da6
+
+ /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+ .octa 0x00000000f1a96922000000001d181c0c
+
+ /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+ .octa 0x00000001ac80d03c0000000068b7d1f6
+
+ /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+ .octa 0x000000000f11d56a000000005b0f14fc
+
+ /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+ .octa 0x00000001f1c022a20000000179e9e730
+
+ /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+ .octa 0x0000000173d00ae200000001ce1368d6
+
+ /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+ .octa 0x00000001d4ffe4ac0000000112c3a84c
+
+ /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+ .octa 0x000000016edc5ae400000000de940fee
+
+ /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+ .octa 0x00000001f1a0214000000000fe896b7e
+
+ /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+ .octa 0x00000000ca0b28a000000001f797431c
+
+ /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+ .octa 0x00000001928e30a20000000053e989ba
+
+ /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+ .octa 0x0000000097b1b002000000003920cd16
+
+ /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+ .octa 0x00000000b15bf90600000001e6f579b8
+
+ /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+ .octa 0x00000000411c5d52000000007493cb0a
+
+ /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+ .octa 0x00000001c36f330000000001bdd376d8
+
+ /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+ .octa 0x00000001119227e0000000016badfee6
+
+ /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+ .octa 0x00000000114d47020000000071de5c58
+
+ /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+ .octa 0x00000000458b5b9800000000453f317c
+
+ /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+ .octa 0x000000012e31fb8e0000000121675cce
+
+ /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+ .octa 0x000000005cf619d800000001f409ee92
+
+ /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+ .octa 0x0000000063f4d8b200000000f36b9c88
+
+ /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+ .octa 0x000000004138dc8a0000000036b398f4
+
+ /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+ .octa 0x00000001d29ee8e000000001748f9adc
+
+ /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+ .octa 0x000000006a08ace800000001be94ec00
+
+ /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+ .octa 0x0000000127d4201000000000b74370d6
+
+ /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+ .octa 0x0000000019d76b6200000001174d0b98
+
+ /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+ .octa 0x00000001b1471f6e00000000befc06a4
+
+ /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+ .octa 0x00000001f64c19cc00000001ae125288
+
+ /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+ .octa 0x00000000003c0ea00000000095c19b34
+
+ /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+ .octa 0x000000014d73abf600000001a78496f2
+
+ /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+ .octa 0x00000001620eb84400000001ac5390a0
+
+ /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+ .octa 0x0000000147655048000000002a80ed6e
+
+ /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+ .octa 0x0000000067b5077e00000001fa9b0128
+
+ /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+ .octa 0x0000000010ffe20600000001ea94929e
+
+ /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+ .octa 0x000000000fee8f1e0000000125f4305c
+
+ /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+ .octa 0x00000001da26fbae00000001471e2002
+
+ /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+ .octa 0x00000001b3a8bd880000000132d2253a
+
+ /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+ .octa 0x00000000e8f3898e00000000f26b3592
+
+ /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+ .octa 0x00000000b0d0d28c00000000bc8b67b0
+
+ /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+ .octa 0x0000000030f2a798000000013a826ef2
+
+ /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+ .octa 0x000000000fba10020000000081482c84
+
+ /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+ .octa 0x00000000bdb9bd7200000000e77307c2
+
+ /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+ .octa 0x0000000075d3bf5a00000000d4a07ec8
+
+ /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+ .octa 0x00000000ef1f98a00000000017102100
+
+ /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+ .octa 0x00000000689c760200000000db406486
+
+ /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+ .octa 0x000000016d5fa5fe0000000192db7f88
+
+ /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+ .octa 0x00000001d0d2b9ca000000018bf67b1e
+
+ /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+ .octa 0x0000000041e7b470000000007c09163e
+
+ /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+ .octa 0x00000001cbb6495e000000000adac060
+
+ /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+ .octa 0x000000010052a0b000000000bd8316ae
+
+ /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+ .octa 0x00000001d8effb5c000000019f09ab54
+
+ /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+ .octa 0x00000001d969853c0000000125155542
+
+ /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+ .octa 0x00000000523ccce2000000018fdb5882
+
+ /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+ .octa 0x000000001e2436bc00000000e794b3f4
+
+ /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+ .octa 0x00000000ddd1c3a2000000016f9bb022
+
+ /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+ .octa 0x0000000019fcfe3800000000290c9978
+
+ /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+ .octa 0x00000001ce95db640000000083c0f350
+
+ /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+ .octa 0x00000000af5828060000000173ea6628
+
+ /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+ .octa 0x00000001006388f600000001c8b4e00a
+
+ /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+ .octa 0x0000000179eca00a00000000de95d6aa
+
+ /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+ .octa 0x0000000122410a6a000000010b7f7248
+
+ /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+ .octa 0x000000004288e87c00000001326e3a06
+
+ /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+ .octa 0x000000016c5490da00000000bb62c2e6
+
+ /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+ .octa 0x00000000d1c71f6e0000000156a4b2c2
+
+ /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+ .octa 0x00000001b4ce08a6000000011dfe763a
+
+ /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+ .octa 0x00000001466ba60c000000007bcca8e2
+
+ /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+ .octa 0x00000001f6c488a40000000186118faa
+
+ /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+ .octa 0x000000013bfb06820000000111a65a88
+
+ /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+ .octa 0x00000000690e9e54000000003565e1c4
+
+ /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+ .octa 0x00000000281346b6000000012ed02a82
+
+ /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+ .octa 0x000000015646402400000000c486ecfc
+
+ /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+ .octa 0x000000016063a8dc0000000001b951b2
+
+ /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+ .octa 0x0000000116a663620000000048143916
+
+ /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+ .octa 0x000000017e8aa4d200000001dc2ae124
+
+ /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+ .octa 0x00000001728eb10c00000001416c58d6
+
+ /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+ .octa 0x00000001b08fd7fa00000000a479744a
+
+ /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+ .octa 0x00000001092a16e80000000096ca3a26
+
+ /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+ .octa 0x00000000a505637c00000000ff223d4e
+
+ /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+ .octa 0x00000000d94869b2000000010e84da42
+
+ /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+ .octa 0x00000001c8b203ae00000001b61ba3d0
+
+ /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+ .octa 0x000000005704aea000000000680f2de8
+
+ /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+ .octa 0x000000012e295fa2000000008772a9a8
+
+ /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+ .octa 0x000000011d0908bc0000000155f295bc
+
+ /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+ .octa 0x0000000193ed97ea00000000595f9282
+
+ /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+ .octa 0x000000013a0f1c520000000164b1c25a
+
+ /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+ .octa 0x000000010c2c40c000000000fbd67c50
+
+ /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+ .octa 0x00000000ff6fac3e0000000096076268
+
+ /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+ .octa 0x000000017b3609c000000001d288e4cc
+
+ /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+ .octa 0x0000000088c8c92200000001eaac1bdc
+
+ /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+ .octa 0x00000001751baae600000001f1ea39e2
+
+ /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+ .octa 0x000000010795297200000001eb6506fc
+
+ /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+ .octa 0x0000000162b00abe000000010f806ffe
+
+ /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+ .octa 0x000000000d7b404c000000010408481e
+
+ /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+ .octa 0x00000000763b13d40000000188260534
+
+ /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+ .octa 0x00000000f6dc22d80000000058fc73e0
+
+ /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+ .octa 0x000000007daae06000000000391c59b8
+
+ /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+ .octa 0x000000013359ab7c000000018b638400
+
+ /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+ .octa 0x000000008add438a000000011738f5c4
+
+ /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+ .octa 0x00000001edbefdea000000008cf7c6da
+
+ /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+ .octa 0x000000004104e0f800000001ef97fb16
+
+ /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+ .octa 0x00000000b48a82220000000102130e20
+
+ /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+ .octa 0x00000001bcb4684400000000db968898
+
+ /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+ .octa 0x000000013293ce0a00000000b5047b5e
+
+ /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+ .octa 0x00000001710d0844000000010b90fdb2
+
+ /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+ .octa 0x0000000117907f6e000000004834a32e
+
+ /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+ .octa 0x0000000087ddf93e0000000059c8f2b0
+
+ /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+ .octa 0x000000005970e9b00000000122cec508
+
+ /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+ .octa 0x0000000185b2b7d0000000000a330cda
+
+ /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+ .octa 0x00000001dcee0efc000000014a47148c
+
+ /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+ .octa 0x0000000030da27220000000042c61cb8
+
+ /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+ .octa 0x000000012f925a180000000012fe6960
+
+ /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+ .octa 0x00000000dd2e357c00000000dbda2c20
+
+ /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+ .octa 0x00000000071c80de000000011122410c
+
+ /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+ .octa 0x000000011513140a00000000977b2070
+
+ /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+ .octa 0x00000001df876e8e000000014050438e
+
+ /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+ .octa 0x000000015f81d6ce0000000147c840e8
+
+ /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+ .octa 0x000000019dd94dbe00000001cc7c88ce
+
+ /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+ .octa 0x00000001373d206e00000001476b35a4
+
+ /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+ .octa 0x00000000668ccade000000013d52d508
+
+ /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+ .octa 0x00000001b192d268000000008e4be32e
+
+ /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+ .octa 0x00000000e30f3a7800000000024120fe
+
+ /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+ .octa 0x000000010ef1f7bc00000000ddecddb4
+
+ /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+ .octa 0x00000001f5ac738000000000d4d403bc
+
+ /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+ .octa 0x000000011822ea7000000001734b89aa
+
+ /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+ .octa 0x00000000c3a33848000000010e7a58d6
+
+ /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+ .octa 0x00000001bd151c2400000001f9f04e9c
+
+ /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+ .octa 0x0000000056002d7600000000b692225e
+
+ /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+ .octa 0x000000014657c4f4000000019b8d3f3e
+
+ /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+ .octa 0x0000000113742d7c00000001a874f11e
+
+ /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+ .octa 0x000000019c5920ba000000010d5a4254
+
+ /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+ .octa 0x000000005216d2d600000000bbb2f5d6
+
+ /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+ .octa 0x0000000136f5ad8a0000000179cc0e36
+
+ /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+ .octa 0x000000018b07beb600000001dca1da4a
+
+ /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+ .octa 0x00000000db1e93b000000000feb1a192
+
+ /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+ .octa 0x000000000b96fa3a00000000d1eeedd6
+
+ /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+ .octa 0x00000001d9968af0000000008fad9bb4
+
+ /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+ .octa 0x000000000e4a77a200000001884938e4
+
+ /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+ .octa 0x00000000508c2ac800000001bc2e9bc0
+
+ /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+ .octa 0x0000000021572a8000000001f9658a68
+
+ /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+ .octa 0x00000001b859daf2000000001b9224fc
+
+ /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+ .octa 0x000000016f7884740000000055b2fb84
+
+ /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+ .octa 0x00000001b438810e000000018b090348
+
+ /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+ .octa 0x0000000095ddc6f2000000011ccbd5ea
+
+ /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+ .octa 0x00000001d977c20c0000000007ae47f8
+
+ /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+ .octa 0x00000000ebedb99a0000000172acbec0
+
+ /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+ .octa 0x00000001df9e9e9200000001c6e3ff20
+
+ /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+ .octa 0x00000001a4a3f95200000000e1b38744
+
+ /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+ .octa 0x00000000e2f5122000000000791585b2
+
+ /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+ .octa 0x000000004aa01f3e00000000ac53b894
+
+ /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+ .octa 0x00000000b3e90a5800000001ed5f2cf4
+
+ /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+ .octa 0x000000000c9ca2aa00000001df48b2e0
+
+ /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+ .octa 0x000000015168231600000000049c1c62
+
+ /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+ .octa 0x0000000036fce78c000000017c460c12
+
+ /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+ .octa 0x000000009037dc10000000015be4da7e
+
+ /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+ .octa 0x00000000d3298582000000010f38f668
+
+ /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+ .octa 0x00000001b42e8ad60000000039f40a00
+
+ /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+ .octa 0x00000000142a983800000000bd4c10c4
+
+ /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+ .octa 0x0000000109c7f1900000000042db1d98
+
+ /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+ .octa 0x0000000056ff931000000001c905bae6
+
+ /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+ .octa 0x00000001594513aa00000000069d40ea
+
+ /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+ .octa 0x00000001e3b5b1e8000000008e4fbad0
+
+ /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+ .octa 0x000000011dd5fc080000000047bedd46
+
+ /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+ .octa 0x00000001675f0cc20000000026396bf8
+
+ /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+ .octa 0x00000000d1c8dd4400000000379beb92
+
+ /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+ .octa 0x0000000115ebd3d8000000000abae54a
+
+ /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+ .octa 0x00000001ecbd0dac0000000007e6a128
+
+ /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+ .octa 0x00000000cdf67af2000000000ade29d2
+
+ /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+ .octa 0x000000004c01ff4c00000000f974c45c
+
+ /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+ .octa 0x00000000f2d8657e00000000e77ac60a
+
+ /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+ .octa 0x000000006bae74c40000000145895816
+
+ /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+ .octa 0x0000000152af8aa00000000038e362be
+
+ /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+ .octa 0x0000000004663802000000007f991a64
+
+ /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+ .octa 0x00000001ab2f5afc00000000fa366d3a
+
+ /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+ .octa 0x0000000074a4ebd400000001a2bb34f0
+
+ /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+ .octa 0x00000001d7ab3a4c0000000028a9981e
+
+ /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+ .octa 0x00000001a8da60c600000001dbc672be
+
+ /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+ .octa 0x000000013cf6382000000000b04d77f6
+
+ /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+ .octa 0x00000000bec12e1e0000000124400d96
+
+ /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+ .octa 0x00000001c6368010000000014ca4b414
+
+ /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+ .octa 0x00000001e6e78758000000012fe2c938
+
+ /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+ .octa 0x000000008d7f2b3c00000001faed01e6
+
+ /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+ .octa 0x000000016b4a156e000000007e80ecfe
+
+ /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+ .octa 0x00000001c63cfeb60000000098daee94
+
+ /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+ .octa 0x000000015f902670000000010a04edea
+
+ /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+ .octa 0x00000001cd5de11e00000001c00b4524
+
+ /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+ .octa 0x000000001acaec540000000170296550
+
+ /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+ .octa 0x000000002bd0ca780000000181afaa48
+
+ /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+ .octa 0x0000000032d63d5c0000000185a31ffa
+
+ /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+ .octa 0x000000001c6d4e4c000000002469f608
+
+ /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+ .octa 0x0000000106a60b92000000006980102a
+
+ /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+ .octa 0x00000000d3855e120000000111ea9ca8
+
+ /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+ .octa 0x00000000e312563600000001bd1d29ce
+
+ /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+ .octa 0x000000009e8f7ea400000001b34b9580
+
+ /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+ .octa 0x00000001c82e562c000000003076054e
+
+ /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+ .octa 0x00000000ca9f09ce000000012a608ea4
+
+ /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+ .octa 0x00000000c63764e600000000784d05fe
+
+ /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+ .octa 0x0000000168d2e49e000000016ef0d82a
+
+ /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+ .octa 0x00000000e986c1480000000075bda454
+
+ /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+ .octa 0x00000000cfb65894000000003dc0a1c4
+
+ /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+ .octa 0x0000000111cadee400000000e9a5d8be
+
+ /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+ .octa 0x0000000171fb63ce00000001609bc4b4
+
+.short_constants:
+
+ /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+ /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
+ .octa 0x7fec2963e5bf80485cf015c388e56f72
+
+ /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
+ .octa 0x38e888d4844752a9963a18920246e2e6
+
+ /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
+ .octa 0x42316c00730206ad419a441956993a31
+
+ /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
+ .octa 0x543d5c543e65ddf9924752ba2b830011
+
+ /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
+ .octa 0x78e87aaf56767c9255bd7f9518e4a304
+
+ /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
+ .octa 0x8f68fcec1903da7f6d76739fe0553f1e
+
+ /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
+ .octa 0x3f4840246791d588c133722b1fe0b5c3
+
+ /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
+ .octa 0x34c96751b04de25a64b67ee0e55ef1f3
+
+ /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
+ .octa 0x156c8e180b4a395b069db049b8fdb1e7
+
+ /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
+ .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
+
+ /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
+ .octa 0x041d37768cd75659817cdc5119b29a35
+
+ /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
+ .octa 0x3a0777818cfaa9651ce9d94b36c41f1c
+
+ /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
+ .octa 0x0e148e8252377a554f256efcb82be955
+
+ /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
+ .octa 0x9c25531d19e65ddeec1631edb2dea967
+
+ /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
+ .octa 0x790606ff9957c0a65d27e147510ac59a
+
+ /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
+ .octa 0x82f63b786ea2d55ca66805eb18b8ea18
+
+
+.barrett_constants:
+ /* 33 bit reflected Barrett constant m - (4^32)/n */
+ .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */
+ /* 33 bit reflected Barrett constant n */
+ .octa 0x00000000000000000000000105ec76f1
+
+#define CRC_FUNCTION_NAME __crc32c_vpmsum
+#define REFLECT
+#include "crc32-vpmsum_core.S"
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
new file mode 100644
index 000000000..fd1d6c83f
--- /dev/null
+++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
@@ -0,0 +1,172 @@
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/switch_to.h>
+
+#define CHKSUM_BLOCK_SIZE 1
+#define CHKSUM_DIGEST_SIZE 4
+
+#define VMX_ALIGN 16
+#define VMX_ALIGN_MASK (VMX_ALIGN-1)
+
+#define VECTOR_BREAKPOINT 512
+
+u32 __crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len);
+
+static u32 crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len)
+{
+ unsigned int prealign;
+ unsigned int tail;
+
+ if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt())
+ return __crc32c_le(crc, p, len);
+
+ if ((unsigned long)p & VMX_ALIGN_MASK) {
+ prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+ crc = __crc32c_le(crc, p, prealign);
+ len -= prealign;
+ p += prealign;
+ }
+
+ if (len & ~VMX_ALIGN_MASK) {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_altivec();
+ crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+ disable_kernel_altivec();
+ pagefault_enable();
+ preempt_enable();
+ }
+
+ tail = len & VMX_ALIGN_MASK;
+ if (tail) {
+ p += len & ~VMX_ALIGN_MASK;
+ crc = __crc32c_le(crc, p, tail);
+ }
+
+ return crc;
+}
+
+static int crc32c_vpmsum_cra_init(struct crypto_tfm *tfm)
+{
+ u32 *key = crypto_tfm_ctx(tfm);
+
+ *key = ~0;
+
+ return 0;
+}
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int crc32c_vpmsum_setkey(struct crypto_shash *hash, const u8 *key,
+ unsigned int keylen)
+{
+ u32 *mctx = crypto_shash_ctx(hash);
+
+ if (keylen != sizeof(u32)) {
+ crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ *mctx = le32_to_cpup((__le32 *)key);
+ return 0;
+}
+
+static int crc32c_vpmsum_init(struct shash_desc *desc)
+{
+ u32 *mctx = crypto_shash_ctx(desc->tfm);
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *crcp = *mctx;
+
+ return 0;
+}
+
+static int crc32c_vpmsum_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *crcp = crc32c_vpmsum(*crcp, data, len);
+
+ return 0;
+}
+
+static int __crc32c_vpmsum_finup(u32 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ *(__le32 *)out = ~cpu_to_le32(crc32c_vpmsum(*crcp, data, len));
+
+ return 0;
+}
+
+static int crc32c_vpmsum_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32c_vpmsum_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32c_vpmsum_final(struct shash_desc *desc, u8 *out)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *(__le32 *)out = ~cpu_to_le32p(crcp);
+
+ return 0;
+}
+
+static int crc32c_vpmsum_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32c_vpmsum_finup(crypto_shash_ctx(desc->tfm), data, len,
+ out);
+}
+
+static struct shash_alg alg = {
+ .setkey = crc32c_vpmsum_setkey,
+ .init = crc32c_vpmsum_init,
+ .update = crc32c_vpmsum_update,
+ .final = crc32c_vpmsum_final,
+ .finup = crc32c_vpmsum_finup,
+ .digest = crc32c_vpmsum_digest,
+ .descsize = sizeof(u32),
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32c",
+ .cra_driver_name = "crc32c-vpmsum",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32c_vpmsum_cra_init,
+ }
+};
+
+static int __init crc32c_vpmsum_mod_init(void)
+{
+ if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+ return -ENODEV;
+
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crc32c_vpmsum_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crc32c_vpmsum_mod_init);
+module_exit(crc32c_vpmsum_mod_fini);
+
+MODULE_AUTHOR("Anton Blanchard <anton@samba.org>");
+MODULE_DESCRIPTION("CRC32C using vector polynomial multiply-sum instructions");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("crc32c");
+MODULE_ALIAS_CRYPTO("crc32c-vpmsum");
diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_asm.S b/arch/powerpc/crypto/crct10dif-vpmsum_asm.S
new file mode 100644
index 000000000..5e3d81a0a
--- /dev/null
+++ b/arch/powerpc/crypto/crct10dif-vpmsum_asm.S
@@ -0,0 +1,850 @@
+/*
+ * Calculate a CRC T10DIF with vpmsum acceleration
+ *
+ * Constants generated by crc32-vpmsum, available at
+ * https://github.com/antonblanchard/crc32-vpmsum
+ *
+ * crc32-vpmsum is
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ * and is available under the GPL v2 or later.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+ .section .rodata
+.balign 16
+
+.byteswap_constant:
+ /* byte reverse permute constant */
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
+.constants:
+
+ /* Reduce 262144 kbits to 1024 bits */
+ /* x^261184 mod p(x), x^261120 mod p(x) */
+ .octa 0x0000000056d300000000000052550000
+
+ /* x^260160 mod p(x), x^260096 mod p(x) */
+ .octa 0x00000000ee67000000000000a1e40000
+
+ /* x^259136 mod p(x), x^259072 mod p(x) */
+ .octa 0x0000000060830000000000004ad10000
+
+ /* x^258112 mod p(x), x^258048 mod p(x) */
+ .octa 0x000000008cfe0000000000009ab40000
+
+ /* x^257088 mod p(x), x^257024 mod p(x) */
+ .octa 0x000000003e93000000000000fdb50000
+
+ /* x^256064 mod p(x), x^256000 mod p(x) */
+ .octa 0x000000003c2000000000000045480000
+
+ /* x^255040 mod p(x), x^254976 mod p(x) */
+ .octa 0x00000000b1fc0000000000008d690000
+
+ /* x^254016 mod p(x), x^253952 mod p(x) */
+ .octa 0x00000000f82b00000000000024ad0000
+
+ /* x^252992 mod p(x), x^252928 mod p(x) */
+ .octa 0x0000000044420000000000009f1a0000
+
+ /* x^251968 mod p(x), x^251904 mod p(x) */
+ .octa 0x00000000e88c00000000000066ec0000
+
+ /* x^250944 mod p(x), x^250880 mod p(x) */
+ .octa 0x00000000385c000000000000c87d0000
+
+ /* x^249920 mod p(x), x^249856 mod p(x) */
+ .octa 0x000000003227000000000000c8ff0000
+
+ /* x^248896 mod p(x), x^248832 mod p(x) */
+ .octa 0x00000000a9a900000000000033440000
+
+ /* x^247872 mod p(x), x^247808 mod p(x) */
+ .octa 0x00000000abaa00000000000066eb0000
+
+ /* x^246848 mod p(x), x^246784 mod p(x) */
+ .octa 0x000000001ac3000000000000c4ef0000
+
+ /* x^245824 mod p(x), x^245760 mod p(x) */
+ .octa 0x0000000063f000000000000056f30000
+
+ /* x^244800 mod p(x), x^244736 mod p(x) */
+ .octa 0x0000000032cc00000000000002050000
+
+ /* x^243776 mod p(x), x^243712 mod p(x) */
+ .octa 0x00000000f8b5000000000000568e0000
+
+ /* x^242752 mod p(x), x^242688 mod p(x) */
+ .octa 0x000000008db100000000000064290000
+
+ /* x^241728 mod p(x), x^241664 mod p(x) */
+ .octa 0x0000000059ca0000000000006b660000
+
+ /* x^240704 mod p(x), x^240640 mod p(x) */
+ .octa 0x000000005f5c00000000000018f80000
+
+ /* x^239680 mod p(x), x^239616 mod p(x) */
+ .octa 0x0000000061af000000000000b6090000
+
+ /* x^238656 mod p(x), x^238592 mod p(x) */
+ .octa 0x00000000e29e000000000000099a0000
+
+ /* x^237632 mod p(x), x^237568 mod p(x) */
+ .octa 0x000000000975000000000000a8360000
+
+ /* x^236608 mod p(x), x^236544 mod p(x) */
+ .octa 0x0000000043900000000000004f570000
+
+ /* x^235584 mod p(x), x^235520 mod p(x) */
+ .octa 0x00000000f9cd000000000000134c0000
+
+ /* x^234560 mod p(x), x^234496 mod p(x) */
+ .octa 0x000000007c29000000000000ec380000
+
+ /* x^233536 mod p(x), x^233472 mod p(x) */
+ .octa 0x000000004c6a000000000000b0d10000
+
+ /* x^232512 mod p(x), x^232448 mod p(x) */
+ .octa 0x00000000e7290000000000007d3e0000
+
+ /* x^231488 mod p(x), x^231424 mod p(x) */
+ .octa 0x00000000f1ab000000000000f0b20000
+
+ /* x^230464 mod p(x), x^230400 mod p(x) */
+ .octa 0x0000000039db0000000000009c270000
+
+ /* x^229440 mod p(x), x^229376 mod p(x) */
+ .octa 0x000000005e2800000000000092890000
+
+ /* x^228416 mod p(x), x^228352 mod p(x) */
+ .octa 0x00000000d44e000000000000d5ee0000
+
+ /* x^227392 mod p(x), x^227328 mod p(x) */
+ .octa 0x00000000cd0a00000000000041f50000
+
+ /* x^226368 mod p(x), x^226304 mod p(x) */
+ .octa 0x00000000c5b400000000000010520000
+
+ /* x^225344 mod p(x), x^225280 mod p(x) */
+ .octa 0x00000000fd2100000000000042170000
+
+ /* x^224320 mod p(x), x^224256 mod p(x) */
+ .octa 0x000000002f2500000000000095c20000
+
+ /* x^223296 mod p(x), x^223232 mod p(x) */
+ .octa 0x000000001b0100000000000001ce0000
+
+ /* x^222272 mod p(x), x^222208 mod p(x) */
+ .octa 0x000000000d430000000000002aca0000
+
+ /* x^221248 mod p(x), x^221184 mod p(x) */
+ .octa 0x0000000030a6000000000000385e0000
+
+ /* x^220224 mod p(x), x^220160 mod p(x) */
+ .octa 0x00000000e37b0000000000006f7a0000
+
+ /* x^219200 mod p(x), x^219136 mod p(x) */
+ .octa 0x00000000873600000000000024320000
+
+ /* x^218176 mod p(x), x^218112 mod p(x) */
+ .octa 0x00000000e9fb000000000000bd9c0000
+
+ /* x^217152 mod p(x), x^217088 mod p(x) */
+ .octa 0x000000003b9500000000000054bc0000
+
+ /* x^216128 mod p(x), x^216064 mod p(x) */
+ .octa 0x00000000133e000000000000a4660000
+
+ /* x^215104 mod p(x), x^215040 mod p(x) */
+ .octa 0x00000000784500000000000079930000
+
+ /* x^214080 mod p(x), x^214016 mod p(x) */
+ .octa 0x00000000b9800000000000001bb80000
+
+ /* x^213056 mod p(x), x^212992 mod p(x) */
+ .octa 0x00000000687600000000000024400000
+
+ /* x^212032 mod p(x), x^211968 mod p(x) */
+ .octa 0x00000000aff300000000000029e10000
+
+ /* x^211008 mod p(x), x^210944 mod p(x) */
+ .octa 0x0000000024b50000000000005ded0000
+
+ /* x^209984 mod p(x), x^209920 mod p(x) */
+ .octa 0x0000000017e8000000000000b12e0000
+
+ /* x^208960 mod p(x), x^208896 mod p(x) */
+ .octa 0x00000000128400000000000026d20000
+
+ /* x^207936 mod p(x), x^207872 mod p(x) */
+ .octa 0x000000002115000000000000a32a0000
+
+ /* x^206912 mod p(x), x^206848 mod p(x) */
+ .octa 0x000000009595000000000000a1210000
+
+ /* x^205888 mod p(x), x^205824 mod p(x) */
+ .octa 0x00000000281e000000000000ee8b0000
+
+ /* x^204864 mod p(x), x^204800 mod p(x) */
+ .octa 0x0000000006010000000000003d0d0000
+
+ /* x^203840 mod p(x), x^203776 mod p(x) */
+ .octa 0x00000000e2b600000000000034e90000
+
+ /* x^202816 mod p(x), x^202752 mod p(x) */
+ .octa 0x000000001bd40000000000004cdb0000
+
+ /* x^201792 mod p(x), x^201728 mod p(x) */
+ .octa 0x00000000df2800000000000030e90000
+
+ /* x^200768 mod p(x), x^200704 mod p(x) */
+ .octa 0x0000000049c200000000000042590000
+
+ /* x^199744 mod p(x), x^199680 mod p(x) */
+ .octa 0x000000009b97000000000000df950000
+
+ /* x^198720 mod p(x), x^198656 mod p(x) */
+ .octa 0x000000006184000000000000da7b0000
+
+ /* x^197696 mod p(x), x^197632 mod p(x) */
+ .octa 0x00000000461700000000000012510000
+
+ /* x^196672 mod p(x), x^196608 mod p(x) */
+ .octa 0x000000009b40000000000000f37e0000
+
+ /* x^195648 mod p(x), x^195584 mod p(x) */
+ .octa 0x00000000eeb2000000000000ecf10000
+
+ /* x^194624 mod p(x), x^194560 mod p(x) */
+ .octa 0x00000000b2e800000000000050f20000
+
+ /* x^193600 mod p(x), x^193536 mod p(x) */
+ .octa 0x00000000f59a000000000000e0b30000
+
+ /* x^192576 mod p(x), x^192512 mod p(x) */
+ .octa 0x00000000467f0000000000004d5a0000
+
+ /* x^191552 mod p(x), x^191488 mod p(x) */
+ .octa 0x00000000da92000000000000bb010000
+
+ /* x^190528 mod p(x), x^190464 mod p(x) */
+ .octa 0x000000001e1000000000000022a40000
+
+ /* x^189504 mod p(x), x^189440 mod p(x) */
+ .octa 0x0000000058fe000000000000836f0000
+
+ /* x^188480 mod p(x), x^188416 mod p(x) */
+ .octa 0x00000000b9ce000000000000d78d0000
+
+ /* x^187456 mod p(x), x^187392 mod p(x) */
+ .octa 0x0000000022210000000000004f8d0000
+
+ /* x^186432 mod p(x), x^186368 mod p(x) */
+ .octa 0x00000000744600000000000033760000
+
+ /* x^185408 mod p(x), x^185344 mod p(x) */
+ .octa 0x000000001c2e000000000000a1e50000
+
+ /* x^184384 mod p(x), x^184320 mod p(x) */
+ .octa 0x00000000dcc8000000000000a1a40000
+
+ /* x^183360 mod p(x), x^183296 mod p(x) */
+ .octa 0x00000000910f00000000000019a20000
+
+ /* x^182336 mod p(x), x^182272 mod p(x) */
+ .octa 0x0000000055d5000000000000f6ae0000
+
+ /* x^181312 mod p(x), x^181248 mod p(x) */
+ .octa 0x00000000c8ba000000000000a7ac0000
+
+ /* x^180288 mod p(x), x^180224 mod p(x) */
+ .octa 0x0000000031f8000000000000eea20000
+
+ /* x^179264 mod p(x), x^179200 mod p(x) */
+ .octa 0x000000001966000000000000c4d90000
+
+ /* x^178240 mod p(x), x^178176 mod p(x) */
+ .octa 0x00000000b9810000000000002b470000
+
+ /* x^177216 mod p(x), x^177152 mod p(x) */
+ .octa 0x000000008303000000000000f7cf0000
+
+ /* x^176192 mod p(x), x^176128 mod p(x) */
+ .octa 0x000000002ce500000000000035b30000
+
+ /* x^175168 mod p(x), x^175104 mod p(x) */
+ .octa 0x000000002fae0000000000000c7c0000
+
+ /* x^174144 mod p(x), x^174080 mod p(x) */
+ .octa 0x00000000f50c0000000000009edf0000
+
+ /* x^173120 mod p(x), x^173056 mod p(x) */
+ .octa 0x00000000714f00000000000004cd0000
+
+ /* x^172096 mod p(x), x^172032 mod p(x) */
+ .octa 0x00000000c161000000000000541b0000
+
+ /* x^171072 mod p(x), x^171008 mod p(x) */
+ .octa 0x0000000021c8000000000000e2700000
+
+ /* x^170048 mod p(x), x^169984 mod p(x) */
+ .octa 0x00000000b93d00000000000009a60000
+
+ /* x^169024 mod p(x), x^168960 mod p(x) */
+ .octa 0x00000000fbcf000000000000761c0000
+
+ /* x^168000 mod p(x), x^167936 mod p(x) */
+ .octa 0x0000000026350000000000009db30000
+
+ /* x^166976 mod p(x), x^166912 mod p(x) */
+ .octa 0x00000000b64f0000000000003e9f0000
+
+ /* x^165952 mod p(x), x^165888 mod p(x) */
+ .octa 0x00000000bd0e00000000000078590000
+
+ /* x^164928 mod p(x), x^164864 mod p(x) */
+ .octa 0x00000000d9360000000000008bc80000
+
+ /* x^163904 mod p(x), x^163840 mod p(x) */
+ .octa 0x000000002f140000000000008c9f0000
+
+ /* x^162880 mod p(x), x^162816 mod p(x) */
+ .octa 0x000000006a270000000000006af70000
+
+ /* x^161856 mod p(x), x^161792 mod p(x) */
+ .octa 0x000000006685000000000000e5210000
+
+ /* x^160832 mod p(x), x^160768 mod p(x) */
+ .octa 0x0000000062da00000000000008290000
+
+ /* x^159808 mod p(x), x^159744 mod p(x) */
+ .octa 0x00000000bb4b000000000000e4d00000
+
+ /* x^158784 mod p(x), x^158720 mod p(x) */
+ .octa 0x00000000d2490000000000004ae10000
+
+ /* x^157760 mod p(x), x^157696 mod p(x) */
+ .octa 0x00000000c85b00000000000000e70000
+
+ /* x^156736 mod p(x), x^156672 mod p(x) */
+ .octa 0x00000000c37a00000000000015650000
+
+ /* x^155712 mod p(x), x^155648 mod p(x) */
+ .octa 0x0000000018530000000000001c2f0000
+
+ /* x^154688 mod p(x), x^154624 mod p(x) */
+ .octa 0x00000000b46600000000000037bd0000
+
+ /* x^153664 mod p(x), x^153600 mod p(x) */
+ .octa 0x00000000439b00000000000012190000
+
+ /* x^152640 mod p(x), x^152576 mod p(x) */
+ .octa 0x00000000b1260000000000005ece0000
+
+ /* x^151616 mod p(x), x^151552 mod p(x) */
+ .octa 0x00000000d8110000000000002a5e0000
+
+ /* x^150592 mod p(x), x^150528 mod p(x) */
+ .octa 0x00000000099f00000000000052330000
+
+ /* x^149568 mod p(x), x^149504 mod p(x) */
+ .octa 0x00000000f9f9000000000000f9120000
+
+ /* x^148544 mod p(x), x^148480 mod p(x) */
+ .octa 0x000000005cc00000000000000ddc0000
+
+ /* x^147520 mod p(x), x^147456 mod p(x) */
+ .octa 0x00000000343b00000000000012200000
+
+ /* x^146496 mod p(x), x^146432 mod p(x) */
+ .octa 0x000000009222000000000000d12b0000
+
+ /* x^145472 mod p(x), x^145408 mod p(x) */
+ .octa 0x00000000d781000000000000eb2d0000
+
+ /* x^144448 mod p(x), x^144384 mod p(x) */
+ .octa 0x000000000bf400000000000058970000
+
+ /* x^143424 mod p(x), x^143360 mod p(x) */
+ .octa 0x00000000094200000000000013690000
+
+ /* x^142400 mod p(x), x^142336 mod p(x) */
+ .octa 0x00000000d55100000000000051950000
+
+ /* x^141376 mod p(x), x^141312 mod p(x) */
+ .octa 0x000000008f11000000000000954b0000
+
+ /* x^140352 mod p(x), x^140288 mod p(x) */
+ .octa 0x00000000140f000000000000b29e0000
+
+ /* x^139328 mod p(x), x^139264 mod p(x) */
+ .octa 0x00000000c6db000000000000db5d0000
+
+ /* x^138304 mod p(x), x^138240 mod p(x) */
+ .octa 0x00000000715b000000000000dfaf0000
+
+ /* x^137280 mod p(x), x^137216 mod p(x) */
+ .octa 0x000000000dea000000000000e3b60000
+
+ /* x^136256 mod p(x), x^136192 mod p(x) */
+ .octa 0x000000006f94000000000000ddaf0000
+
+ /* x^135232 mod p(x), x^135168 mod p(x) */
+ .octa 0x0000000024e1000000000000e4f70000
+
+ /* x^134208 mod p(x), x^134144 mod p(x) */
+ .octa 0x000000008810000000000000aa110000
+
+ /* x^133184 mod p(x), x^133120 mod p(x) */
+ .octa 0x0000000030c2000000000000a8e60000
+
+ /* x^132160 mod p(x), x^132096 mod p(x) */
+ .octa 0x00000000e6d0000000000000ccf30000
+
+ /* x^131136 mod p(x), x^131072 mod p(x) */
+ .octa 0x000000004da000000000000079bf0000
+
+ /* x^130112 mod p(x), x^130048 mod p(x) */
+ .octa 0x000000007759000000000000b3a30000
+
+ /* x^129088 mod p(x), x^129024 mod p(x) */
+ .octa 0x00000000597400000000000028790000
+
+ /* x^128064 mod p(x), x^128000 mod p(x) */
+ .octa 0x000000007acd000000000000b5820000
+
+ /* x^127040 mod p(x), x^126976 mod p(x) */
+ .octa 0x00000000e6e400000000000026ad0000
+
+ /* x^126016 mod p(x), x^125952 mod p(x) */
+ .octa 0x000000006d49000000000000985b0000
+
+ /* x^124992 mod p(x), x^124928 mod p(x) */
+ .octa 0x000000000f0800000000000011520000
+
+ /* x^123968 mod p(x), x^123904 mod p(x) */
+ .octa 0x000000002c7f000000000000846c0000
+
+ /* x^122944 mod p(x), x^122880 mod p(x) */
+ .octa 0x000000005ce7000000000000ae1d0000
+
+ /* x^121920 mod p(x), x^121856 mod p(x) */
+ .octa 0x00000000d4cb000000000000e21d0000
+
+ /* x^120896 mod p(x), x^120832 mod p(x) */
+ .octa 0x000000003a2300000000000019bb0000
+
+ /* x^119872 mod p(x), x^119808 mod p(x) */
+ .octa 0x000000000e1700000000000095290000
+
+ /* x^118848 mod p(x), x^118784 mod p(x) */
+ .octa 0x000000006e6400000000000050d20000
+
+ /* x^117824 mod p(x), x^117760 mod p(x) */
+ .octa 0x000000008d5c0000000000000cd10000
+
+ /* x^116800 mod p(x), x^116736 mod p(x) */
+ .octa 0x00000000ef310000000000007b570000
+
+ /* x^115776 mod p(x), x^115712 mod p(x) */
+ .octa 0x00000000645d00000000000053d60000
+
+ /* x^114752 mod p(x), x^114688 mod p(x) */
+ .octa 0x0000000018fc00000000000077510000
+
+ /* x^113728 mod p(x), x^113664 mod p(x) */
+ .octa 0x000000000cb3000000000000a7b70000
+
+ /* x^112704 mod p(x), x^112640 mod p(x) */
+ .octa 0x00000000991b000000000000d0780000
+
+ /* x^111680 mod p(x), x^111616 mod p(x) */
+ .octa 0x00000000845a000000000000be3c0000
+
+ /* x^110656 mod p(x), x^110592 mod p(x) */
+ .octa 0x00000000d3a9000000000000df020000
+
+ /* x^109632 mod p(x), x^109568 mod p(x) */
+ .octa 0x0000000017d7000000000000063e0000
+
+ /* x^108608 mod p(x), x^108544 mod p(x) */
+ .octa 0x000000007a860000000000008ab40000
+
+ /* x^107584 mod p(x), x^107520 mod p(x) */
+ .octa 0x00000000fd7c000000000000c7bd0000
+
+ /* x^106560 mod p(x), x^106496 mod p(x) */
+ .octa 0x00000000a56b000000000000efd60000
+
+ /* x^105536 mod p(x), x^105472 mod p(x) */
+ .octa 0x0000000010e400000000000071380000
+
+ /* x^104512 mod p(x), x^104448 mod p(x) */
+ .octa 0x00000000994500000000000004d30000
+
+ /* x^103488 mod p(x), x^103424 mod p(x) */
+ .octa 0x00000000b83c0000000000003b0e0000
+
+ /* x^102464 mod p(x), x^102400 mod p(x) */
+ .octa 0x00000000d6c10000000000008b020000
+
+ /* x^101440 mod p(x), x^101376 mod p(x) */
+ .octa 0x000000009efc000000000000da940000
+
+ /* x^100416 mod p(x), x^100352 mod p(x) */
+ .octa 0x000000005e87000000000000f9f70000
+
+ /* x^99392 mod p(x), x^99328 mod p(x) */
+ .octa 0x000000006c9b00000000000045e40000
+
+ /* x^98368 mod p(x), x^98304 mod p(x) */
+ .octa 0x00000000178a00000000000083940000
+
+ /* x^97344 mod p(x), x^97280 mod p(x) */
+ .octa 0x00000000f0c8000000000000f0a00000
+
+ /* x^96320 mod p(x), x^96256 mod p(x) */
+ .octa 0x00000000f699000000000000b74b0000
+
+ /* x^95296 mod p(x), x^95232 mod p(x) */
+ .octa 0x00000000316d000000000000c1cf0000
+
+ /* x^94272 mod p(x), x^94208 mod p(x) */
+ .octa 0x00000000987e00000000000072680000
+
+ /* x^93248 mod p(x), x^93184 mod p(x) */
+ .octa 0x00000000acff000000000000e0ab0000
+
+ /* x^92224 mod p(x), x^92160 mod p(x) */
+ .octa 0x00000000a1f6000000000000c5a80000
+
+ /* x^91200 mod p(x), x^91136 mod p(x) */
+ .octa 0x0000000061bd000000000000cf690000
+
+ /* x^90176 mod p(x), x^90112 mod p(x) */
+ .octa 0x00000000c9f2000000000000cbcc0000
+
+ /* x^89152 mod p(x), x^89088 mod p(x) */
+ .octa 0x000000005a33000000000000de050000
+
+ /* x^88128 mod p(x), x^88064 mod p(x) */
+ .octa 0x00000000e416000000000000ccd70000
+
+ /* x^87104 mod p(x), x^87040 mod p(x) */
+ .octa 0x0000000058930000000000002f670000
+
+ /* x^86080 mod p(x), x^86016 mod p(x) */
+ .octa 0x00000000a9d3000000000000152f0000
+
+ /* x^85056 mod p(x), x^84992 mod p(x) */
+ .octa 0x00000000c114000000000000ecc20000
+
+ /* x^84032 mod p(x), x^83968 mod p(x) */
+ .octa 0x00000000b9270000000000007c890000
+
+ /* x^83008 mod p(x), x^82944 mod p(x) */
+ .octa 0x000000002e6000000000000006ee0000
+
+ /* x^81984 mod p(x), x^81920 mod p(x) */
+ .octa 0x00000000dfc600000000000009100000
+
+ /* x^80960 mod p(x), x^80896 mod p(x) */
+ .octa 0x000000004911000000000000ad4e0000
+
+ /* x^79936 mod p(x), x^79872 mod p(x) */
+ .octa 0x00000000ae1b000000000000b04d0000
+
+ /* x^78912 mod p(x), x^78848 mod p(x) */
+ .octa 0x0000000005fa000000000000e9900000
+
+ /* x^77888 mod p(x), x^77824 mod p(x) */
+ .octa 0x0000000004a1000000000000cc6f0000
+
+ /* x^76864 mod p(x), x^76800 mod p(x) */
+ .octa 0x00000000af73000000000000ed110000
+
+ /* x^75840 mod p(x), x^75776 mod p(x) */
+ .octa 0x0000000082530000000000008f7e0000
+
+ /* x^74816 mod p(x), x^74752 mod p(x) */
+ .octa 0x00000000cfdc000000000000594f0000
+
+ /* x^73792 mod p(x), x^73728 mod p(x) */
+ .octa 0x00000000a6b6000000000000a8750000
+
+ /* x^72768 mod p(x), x^72704 mod p(x) */
+ .octa 0x00000000fd76000000000000aa0c0000
+
+ /* x^71744 mod p(x), x^71680 mod p(x) */
+ .octa 0x0000000006f500000000000071db0000
+
+ /* x^70720 mod p(x), x^70656 mod p(x) */
+ .octa 0x0000000037ca000000000000ab0c0000
+
+ /* x^69696 mod p(x), x^69632 mod p(x) */
+ .octa 0x00000000d7ab000000000000b7a00000
+
+ /* x^68672 mod p(x), x^68608 mod p(x) */
+ .octa 0x00000000440800000000000090d30000
+
+ /* x^67648 mod p(x), x^67584 mod p(x) */
+ .octa 0x00000000186100000000000054730000
+
+ /* x^66624 mod p(x), x^66560 mod p(x) */
+ .octa 0x000000007368000000000000a3a20000
+
+ /* x^65600 mod p(x), x^65536 mod p(x) */
+ .octa 0x0000000026d0000000000000f9040000
+
+ /* x^64576 mod p(x), x^64512 mod p(x) */
+ .octa 0x00000000fe770000000000009c0a0000
+
+ /* x^63552 mod p(x), x^63488 mod p(x) */
+ .octa 0x000000002cba000000000000d1e70000
+
+ /* x^62528 mod p(x), x^62464 mod p(x) */
+ .octa 0x00000000f8bd0000000000005ac10000
+
+ /* x^61504 mod p(x), x^61440 mod p(x) */
+ .octa 0x000000007372000000000000d68d0000
+
+ /* x^60480 mod p(x), x^60416 mod p(x) */
+ .octa 0x00000000f37f00000000000089f60000
+
+ /* x^59456 mod p(x), x^59392 mod p(x) */
+ .octa 0x00000000078400000000000008a90000
+
+ /* x^58432 mod p(x), x^58368 mod p(x) */
+ .octa 0x00000000d3e400000000000042360000
+
+ /* x^57408 mod p(x), x^57344 mod p(x) */
+ .octa 0x00000000eba800000000000092d50000
+
+ /* x^56384 mod p(x), x^56320 mod p(x) */
+ .octa 0x00000000afbe000000000000b4d50000
+
+ /* x^55360 mod p(x), x^55296 mod p(x) */
+ .octa 0x00000000d8ca000000000000c9060000
+
+ /* x^54336 mod p(x), x^54272 mod p(x) */
+ .octa 0x00000000c2d00000000000008f4f0000
+
+ /* x^53312 mod p(x), x^53248 mod p(x) */
+ .octa 0x00000000373200000000000028690000
+
+ /* x^52288 mod p(x), x^52224 mod p(x) */
+ .octa 0x0000000046ae000000000000c3b30000
+
+ /* x^51264 mod p(x), x^51200 mod p(x) */
+ .octa 0x00000000b243000000000000f8700000
+
+ /* x^50240 mod p(x), x^50176 mod p(x) */
+ .octa 0x00000000f7f500000000000029eb0000
+
+ /* x^49216 mod p(x), x^49152 mod p(x) */
+ .octa 0x000000000c7e000000000000fe730000
+
+ /* x^48192 mod p(x), x^48128 mod p(x) */
+ .octa 0x00000000c38200000000000096000000
+
+ /* x^47168 mod p(x), x^47104 mod p(x) */
+ .octa 0x000000008956000000000000683c0000
+
+ /* x^46144 mod p(x), x^46080 mod p(x) */
+ .octa 0x00000000422d0000000000005f1e0000
+
+ /* x^45120 mod p(x), x^45056 mod p(x) */
+ .octa 0x00000000ac0f0000000000006f810000
+
+ /* x^44096 mod p(x), x^44032 mod p(x) */
+ .octa 0x00000000ce30000000000000031f0000
+
+ /* x^43072 mod p(x), x^43008 mod p(x) */
+ .octa 0x000000003d43000000000000455a0000
+
+ /* x^42048 mod p(x), x^41984 mod p(x) */
+ .octa 0x000000007ebe000000000000a6050000
+
+ /* x^41024 mod p(x), x^40960 mod p(x) */
+ .octa 0x00000000976e00000000000077eb0000
+
+ /* x^40000 mod p(x), x^39936 mod p(x) */
+ .octa 0x000000000872000000000000389c0000
+
+ /* x^38976 mod p(x), x^38912 mod p(x) */
+ .octa 0x000000008979000000000000c7b20000
+
+ /* x^37952 mod p(x), x^37888 mod p(x) */
+ .octa 0x000000005c1e0000000000001d870000
+
+ /* x^36928 mod p(x), x^36864 mod p(x) */
+ .octa 0x00000000aebb00000000000045810000
+
+ /* x^35904 mod p(x), x^35840 mod p(x) */
+ .octa 0x000000004f7e0000000000006d4a0000
+
+ /* x^34880 mod p(x), x^34816 mod p(x) */
+ .octa 0x00000000ea98000000000000b9200000
+
+ /* x^33856 mod p(x), x^33792 mod p(x) */
+ .octa 0x00000000f39600000000000022f20000
+
+ /* x^32832 mod p(x), x^32768 mod p(x) */
+ .octa 0x000000000bc500000000000041ca0000
+
+ /* x^31808 mod p(x), x^31744 mod p(x) */
+ .octa 0x00000000786400000000000078500000
+
+ /* x^30784 mod p(x), x^30720 mod p(x) */
+ .octa 0x00000000be970000000000009e7e0000
+
+ /* x^29760 mod p(x), x^29696 mod p(x) */
+ .octa 0x00000000dd6d000000000000a53c0000
+
+ /* x^28736 mod p(x), x^28672 mod p(x) */
+ .octa 0x000000004c3f00000000000039340000
+
+ /* x^27712 mod p(x), x^27648 mod p(x) */
+ .octa 0x0000000093a4000000000000b58e0000
+
+ /* x^26688 mod p(x), x^26624 mod p(x) */
+ .octa 0x0000000050fb00000000000062d40000
+
+ /* x^25664 mod p(x), x^25600 mod p(x) */
+ .octa 0x00000000f505000000000000a26f0000
+
+ /* x^24640 mod p(x), x^24576 mod p(x) */
+ .octa 0x0000000064f900000000000065e60000
+
+ /* x^23616 mod p(x), x^23552 mod p(x) */
+ .octa 0x00000000e8c2000000000000aad90000
+
+ /* x^22592 mod p(x), x^22528 mod p(x) */
+ .octa 0x00000000720b000000000000a3b00000
+
+ /* x^21568 mod p(x), x^21504 mod p(x) */
+ .octa 0x00000000e992000000000000d2680000
+
+ /* x^20544 mod p(x), x^20480 mod p(x) */
+ .octa 0x000000009132000000000000cf4c0000
+
+ /* x^19520 mod p(x), x^19456 mod p(x) */
+ .octa 0x00000000608a00000000000076610000
+
+ /* x^18496 mod p(x), x^18432 mod p(x) */
+ .octa 0x000000009948000000000000fb9f0000
+
+ /* x^17472 mod p(x), x^17408 mod p(x) */
+ .octa 0x00000000173000000000000003770000
+
+ /* x^16448 mod p(x), x^16384 mod p(x) */
+ .octa 0x000000006fe300000000000004880000
+
+ /* x^15424 mod p(x), x^15360 mod p(x) */
+ .octa 0x00000000e15300000000000056a70000
+
+ /* x^14400 mod p(x), x^14336 mod p(x) */
+ .octa 0x0000000092d60000000000009dfd0000
+
+ /* x^13376 mod p(x), x^13312 mod p(x) */
+ .octa 0x0000000002fd00000000000074c80000
+
+ /* x^12352 mod p(x), x^12288 mod p(x) */
+ .octa 0x00000000c78b000000000000a3ec0000
+
+ /* x^11328 mod p(x), x^11264 mod p(x) */
+ .octa 0x000000009262000000000000b3530000
+
+ /* x^10304 mod p(x), x^10240 mod p(x) */
+ .octa 0x0000000084f200000000000047bf0000
+
+ /* x^9280 mod p(x), x^9216 mod p(x) */
+ .octa 0x0000000067ee000000000000e97c0000
+
+ /* x^8256 mod p(x), x^8192 mod p(x) */
+ .octa 0x00000000535b00000000000091e10000
+
+ /* x^7232 mod p(x), x^7168 mod p(x) */
+ .octa 0x000000007ebb00000000000055060000
+
+ /* x^6208 mod p(x), x^6144 mod p(x) */
+ .octa 0x00000000c6a1000000000000fd360000
+
+ /* x^5184 mod p(x), x^5120 mod p(x) */
+ .octa 0x000000001be500000000000055860000
+
+ /* x^4160 mod p(x), x^4096 mod p(x) */
+ .octa 0x00000000ae0e0000000000005bd00000
+
+ /* x^3136 mod p(x), x^3072 mod p(x) */
+ .octa 0x0000000022040000000000008db20000
+
+ /* x^2112 mod p(x), x^2048 mod p(x) */
+ .octa 0x00000000c9eb000000000000efe20000
+
+ /* x^1088 mod p(x), x^1024 mod p(x) */
+ .octa 0x0000000039b400000000000051d10000
+
+.short_constants:
+
+ /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+ /* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */
+ .octa 0xefe20000dccf00009440000033590000
+
+ /* x^1920 mod p(x), x^1888 mod p(x), x^1856 mod p(x), x^1824 mod p(x) */
+ .octa 0xee6300002f3f000062180000e0ed0000
+
+ /* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */
+ .octa 0xcf5f000017ef0000ccbe000023d30000
+
+ /* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */
+ .octa 0x6d0c0000a30e00000920000042630000
+
+ /* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */
+ .octa 0x21d30000932b0000a7a00000efcc0000
+
+ /* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */
+ .octa 0x10be00000b310000666f00000d1c0000
+
+ /* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */
+ .octa 0x1f240000ce9e0000caad0000589e0000
+
+ /* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */
+ .octa 0x29610000d02b000039b400007cf50000
+
+ /* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */
+ .octa 0x51d100009d9d00003c0e0000bfd60000
+
+ /* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */
+ .octa 0xda390000ceae000013830000713c0000
+
+ /* x^768 mod p(x), x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */
+ .octa 0xb67800001e16000085c0000080a60000
+
+ /* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */
+ .octa 0x0db40000f7f90000371d0000e6580000
+
+ /* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */
+ .octa 0x87e70000044c0000aadb0000a4970000
+
+ /* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */
+ .octa 0x1f990000ad180000d8b30000e7b50000
+
+ /* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */
+ .octa 0xbe6c00006ee300004c1a000006df0000
+
+ /* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */
+ .octa 0xfb0b00002d560000136800008bb70000
+
+
+.barrett_constants:
+ /* Barrett constant m - (4^32)/n */
+ .octa 0x000000000000000000000001f65a57f8 /* x^64 div p(x) */
+ /* Barrett constant n */
+ .octa 0x0000000000000000000000018bb70000
+
+#define CRC_FUNCTION_NAME __crct10dif_vpmsum
+#include "crc32-vpmsum_core.S"
diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
new file mode 100644
index 000000000..02ea27786
--- /dev/null
+++ b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
@@ -0,0 +1,128 @@
+/*
+ * Calculate a CRC T10-DIF with vpmsum acceleration
+ *
+ * Copyright 2017, Daniel Axtens, IBM Corporation.
+ * [based on crc32c-vpmsum_glue.c]
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/switch_to.h>
+
+#define VMX_ALIGN 16
+#define VMX_ALIGN_MASK (VMX_ALIGN-1)
+
+#define VECTOR_BREAKPOINT 64
+
+u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len);
+
+static u16 crct10dif_vpmsum(u16 crci, unsigned char const *p, size_t len)
+{
+ unsigned int prealign;
+ unsigned int tail;
+ u32 crc = crci;
+
+ if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt())
+ return crc_t10dif_generic(crc, p, len);
+
+ if ((unsigned long)p & VMX_ALIGN_MASK) {
+ prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+ crc = crc_t10dif_generic(crc, p, prealign);
+ len -= prealign;
+ p += prealign;
+ }
+
+ if (len & ~VMX_ALIGN_MASK) {
+ crc <<= 16;
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_altivec();
+ crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+ disable_kernel_altivec();
+ pagefault_enable();
+ preempt_enable();
+ crc >>= 16;
+ }
+
+ tail = len & VMX_ALIGN_MASK;
+ if (tail) {
+ p += len & ~VMX_ALIGN_MASK;
+ crc = crc_t10dif_generic(crc, p, tail);
+ }
+
+ return crc & 0xffff;
+}
+
+static int crct10dif_vpmsum_init(struct shash_desc *desc)
+{
+ u16 *crc = shash_desc_ctx(desc);
+
+ *crc = 0;
+ return 0;
+}
+
+static int crct10dif_vpmsum_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ u16 *crc = shash_desc_ctx(desc);
+
+ *crc = crct10dif_vpmsum(*crc, data, length);
+
+ return 0;
+}
+
+
+static int crct10dif_vpmsum_final(struct shash_desc *desc, u8 *out)
+{
+ u16 *crcp = shash_desc_ctx(desc);
+
+ *(u16 *)out = *crcp;
+ return 0;
+}
+
+static struct shash_alg alg = {
+ .init = crct10dif_vpmsum_init,
+ .update = crct10dif_vpmsum_update,
+ .final = crct10dif_vpmsum_final,
+ .descsize = CRC_T10DIF_DIGEST_SIZE,
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crct10dif",
+ .cra_driver_name = "crct10dif-vpmsum",
+ .cra_priority = 200,
+ .cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init crct10dif_vpmsum_mod_init(void)
+{
+ if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+ return -ENODEV;
+
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crct10dif_vpmsum_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crct10dif_vpmsum_mod_init);
+module_exit(crct10dif_vpmsum_mod_fini);
+
+MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
+MODULE_DESCRIPTION("CRCT10DIF using vector polynomial multiply-sum instructions");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-vpmsum");
diff --git a/arch/powerpc/crypto/md5-asm.S b/arch/powerpc/crypto/md5-asm.S
new file mode 100644
index 000000000..183406536
--- /dev/null
+++ b/arch/powerpc/crypto/md5-asm.S
@@ -0,0 +1,244 @@
+/*
+ * Fast MD5 implementation for PPC
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+
+#define rHP r3
+#define rWP r4
+
+#define rH0 r0
+#define rH1 r6
+#define rH2 r7
+#define rH3 r5
+
+#define rW00 r8
+#define rW01 r9
+#define rW02 r10
+#define rW03 r11
+#define rW04 r12
+#define rW05 r14
+#define rW06 r15
+#define rW07 r16
+#define rW08 r17
+#define rW09 r18
+#define rW10 r19
+#define rW11 r20
+#define rW12 r21
+#define rW13 r22
+#define rW14 r23
+#define rW15 r24
+
+#define rT0 r25
+#define rT1 r26
+
+#define INITIALIZE \
+ PPC_STLU r1,-INT_FRAME_SIZE(r1); \
+ SAVE_8GPRS(14, r1); /* push registers onto stack */ \
+ SAVE_4GPRS(22, r1); \
+ SAVE_GPR(26, r1)
+
+#define FINALIZE \
+ REST_8GPRS(14, r1); /* pop registers from stack */ \
+ REST_4GPRS(22, r1); \
+ REST_GPR(26, r1); \
+ addi r1,r1,INT_FRAME_SIZE;
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+ lwbrx reg,0,rWP; /* load data */
+#define INC_PTR \
+ addi rWP,rWP,4; /* increment per word */
+#define NEXT_BLOCK /* nothing to do */
+#else
+#define LOAD_DATA(reg, off) \
+ lwz reg,off(rWP); /* load data */
+#define INC_PTR /* nothing to do */
+#define NEXT_BLOCK \
+ addi rWP,rWP,64; /* increment per block */
+#endif
+
+#define R_00_15(a, b, c, d, w0, w1, p, q, off, k0h, k0l, k1h, k1l) \
+ LOAD_DATA(w0, off) /* W */ \
+ and rT0,b,c; /* 1: f = b and c */ \
+ INC_PTR /* ptr++ */ \
+ andc rT1,d,b; /* 1: f' = ~b and d */ \
+ LOAD_DATA(w1, off+4) /* W */ \
+ or rT0,rT0,rT1; /* 1: f = f or f' */ \
+ addi w0,w0,k0l; /* 1: wk = w + k */ \
+ add a,a,rT0; /* 1: a = a + f */ \
+ addis w0,w0,k0h; /* 1: wk = w + k' */ \
+ addis w1,w1,k1h; /* 2: wk = w + k */ \
+ add a,a,w0; /* 1: a = a + wk */ \
+ addi w1,w1,k1l; /* 2: wk = w + k' */ \
+ rotrwi a,a,p; /* 1: a = a rotl x */ \
+ add d,d,w1; /* 2: a = a + wk */ \
+ add a,a,b; /* 1: a = a + b */ \
+ and rT0,a,b; /* 2: f = b and c */ \
+ andc rT1,c,a; /* 2: f' = ~b and d */ \
+ or rT0,rT0,rT1; /* 2: f = f or f' */ \
+ add d,d,rT0; /* 2: a = a + f */ \
+ INC_PTR /* ptr++ */ \
+ rotrwi d,d,q; /* 2: a = a rotl x */ \
+ add d,d,a; /* 2: a = a + b */
+
+#define R_16_31(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
+ andc rT0,c,d; /* 1: f = c and ~d */ \
+ and rT1,b,d; /* 1: f' = b and d */ \
+ addi w0,w0,k0l; /* 1: wk = w + k */ \
+ or rT0,rT0,rT1; /* 1: f = f or f' */ \
+ addis w0,w0,k0h; /* 1: wk = w + k' */ \
+ add a,a,rT0; /* 1: a = a + f */ \
+ addi w1,w1,k1l; /* 2: wk = w + k */ \
+ add a,a,w0; /* 1: a = a + wk */ \
+ addis w1,w1,k1h; /* 2: wk = w + k' */ \
+ andc rT0,b,c; /* 2: f = c and ~d */ \
+ rotrwi a,a,p; /* 1: a = a rotl x */ \
+ add a,a,b; /* 1: a = a + b */ \
+ add d,d,w1; /* 2: a = a + wk */ \
+ and rT1,a,c; /* 2: f' = b and d */ \
+ or rT0,rT0,rT1; /* 2: f = f or f' */ \
+ add d,d,rT0; /* 2: a = a + f */ \
+ rotrwi d,d,q; /* 2: a = a rotl x */ \
+ add d,d,a; /* 2: a = a +b */
+
+#define R_32_47(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
+ xor rT0,b,c; /* 1: f' = b xor c */ \
+ addi w0,w0,k0l; /* 1: wk = w + k */ \
+ xor rT1,rT0,d; /* 1: f = f xor f' */ \
+ addis w0,w0,k0h; /* 1: wk = w + k' */ \
+ add a,a,rT1; /* 1: a = a + f */ \
+ addi w1,w1,k1l; /* 2: wk = w + k */ \
+ add a,a,w0; /* 1: a = a + wk */ \
+ addis w1,w1,k1h; /* 2: wk = w + k' */ \
+ rotrwi a,a,p; /* 1: a = a rotl x */ \
+ add d,d,w1; /* 2: a = a + wk */ \
+ add a,a,b; /* 1: a = a + b */ \
+ xor rT1,rT0,a; /* 2: f = b xor f' */ \
+ add d,d,rT1; /* 2: a = a + f */ \
+ rotrwi d,d,q; /* 2: a = a rotl x */ \
+ add d,d,a; /* 2: a = a + b */
+
+#define R_48_63(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
+ addi w0,w0,k0l; /* 1: w = w + k */ \
+ orc rT0,b,d; /* 1: f = b or ~d */ \
+ addis w0,w0,k0h; /* 1: w = w + k' */ \
+ xor rT0,rT0,c; /* 1: f = f xor c */ \
+ add a,a,w0; /* 1: a = a + wk */ \
+ addi w1,w1,k1l; /* 2: w = w + k */ \
+ add a,a,rT0; /* 1: a = a + f */ \
+ addis w1,w1,k1h; /* 2: w = w + k' */ \
+ rotrwi a,a,p; /* 1: a = a rotl x */ \
+ add a,a,b; /* 1: a = a + b */ \
+ orc rT0,a,c; /* 2: f = b or ~d */ \
+ add d,d,w1; /* 2: a = a + wk */ \
+ xor rT0,rT0,b; /* 2: f = f xor c */ \
+ add d,d,rT0; /* 2: a = a + f */ \
+ rotrwi d,d,q; /* 2: a = a rotl x */ \
+ add d,d,a; /* 2: a = a + b */
+
+_GLOBAL(ppc_md5_transform)
+ INITIALIZE
+
+ mtctr r5
+ lwz rH0,0(rHP)
+ lwz rH1,4(rHP)
+ lwz rH2,8(rHP)
+ lwz rH3,12(rHP)
+
+ppc_md5_main:
+ R_00_15(rH0, rH1, rH2, rH3, rW00, rW01, 25, 20, 0,
+ 0xd76b, -23432, 0xe8c8, -18602)
+ R_00_15(rH2, rH3, rH0, rH1, rW02, rW03, 15, 10, 8,
+ 0x2420, 0x70db, 0xc1be, -12562)
+ R_00_15(rH0, rH1, rH2, rH3, rW04, rW05, 25, 20, 16,
+ 0xf57c, 0x0faf, 0x4788, -14806)
+ R_00_15(rH2, rH3, rH0, rH1, rW06, rW07, 15, 10, 24,
+ 0xa830, 0x4613, 0xfd47, -27391)
+ R_00_15(rH0, rH1, rH2, rH3, rW08, rW09, 25, 20, 32,
+ 0x6981, -26408, 0x8b45, -2129)
+ R_00_15(rH2, rH3, rH0, rH1, rW10, rW11, 15, 10, 40,
+ 0xffff, 0x5bb1, 0x895d, -10306)
+ R_00_15(rH0, rH1, rH2, rH3, rW12, rW13, 25, 20, 48,
+ 0x6b90, 0x1122, 0xfd98, 0x7193)
+ R_00_15(rH2, rH3, rH0, rH1, rW14, rW15, 15, 10, 56,
+ 0xa679, 0x438e, 0x49b4, 0x0821)
+
+ R_16_31(rH0, rH1, rH2, rH3, rW01, rW06, 27, 23,
+ 0x0d56, 0x6e0c, 0x1810, 0x6d2d)
+ R_16_31(rH2, rH3, rH0, rH1, rW11, rW00, 18, 12,
+ 0x9d02, -32109, 0x124c, 0x2332)
+ R_16_31(rH0, rH1, rH2, rH3, rW05, rW10, 27, 23,
+ 0x8ea7, 0x4a33, 0x0245, -18270)
+ R_16_31(rH2, rH3, rH0, rH1, rW15, rW04, 18, 12,
+ 0x8eee, -8608, 0xf258, -5095)
+ R_16_31(rH0, rH1, rH2, rH3, rW09, rW14, 27, 23,
+ 0x969d, -10697, 0x1cbe, -15288)
+ R_16_31(rH2, rH3, rH0, rH1, rW03, rW08, 18, 12,
+ 0x3317, 0x3e99, 0xdbd9, 0x7c15)
+ R_16_31(rH0, rH1, rH2, rH3, rW13, rW02, 27, 23,
+ 0xac4b, 0x7772, 0xd8cf, 0x331d)
+ R_16_31(rH2, rH3, rH0, rH1, rW07, rW12, 18, 12,
+ 0x6a28, 0x6dd8, 0x219a, 0x3b68)
+
+ R_32_47(rH0, rH1, rH2, rH3, rW05, rW08, 28, 21,
+ 0x29cb, 0x28e5, 0x4218, -7788)
+ R_32_47(rH2, rH3, rH0, rH1, rW11, rW14, 16, 9,
+ 0x473f, 0x06d1, 0x3aae, 0x3036)
+ R_32_47(rH0, rH1, rH2, rH3, rW01, rW04, 28, 21,
+ 0xaea1, -15134, 0x640b, -11295)
+ R_32_47(rH2, rH3, rH0, rH1, rW07, rW10, 16, 9,
+ 0x8f4c, 0x4887, 0xbc7c, -22499)
+ R_32_47(rH0, rH1, rH2, rH3, rW13, rW00, 28, 21,
+ 0x7eb8, -27199, 0x00ea, 0x6050)
+ R_32_47(rH2, rH3, rH0, rH1, rW03, rW06, 16, 9,
+ 0xe01a, 0x22fe, 0x4447, 0x69c5)
+ R_32_47(rH0, rH1, rH2, rH3, rW09, rW12, 28, 21,
+ 0xb7f3, 0x0253, 0x59b1, 0x4d5b)
+ R_32_47(rH2, rH3, rH0, rH1, rW15, rW02, 16, 9,
+ 0x4701, -27017, 0xc7bd, -19859)
+
+ R_48_63(rH0, rH1, rH2, rH3, rW00, rW07, 26, 22,
+ 0x0988, -1462, 0x4c70, -19401)
+ R_48_63(rH2, rH3, rH0, rH1, rW14, rW05, 17, 11,
+ 0xadaf, -5221, 0xfc99, 0x66f7)
+ R_48_63(rH0, rH1, rH2, rH3, rW12, rW03, 26, 22,
+ 0x7e80, -16418, 0xba1e, -25587)
+ R_48_63(rH2, rH3, rH0, rH1, rW10, rW01, 17, 11,
+ 0x4130, 0x380d, 0xe0c5, 0x738d)
+ lwz rW00,0(rHP)
+ R_48_63(rH0, rH1, rH2, rH3, rW08, rW15, 26, 22,
+ 0xe837, -30770, 0xde8a, 0x69e8)
+ lwz rW14,4(rHP)
+ R_48_63(rH2, rH3, rH0, rH1, rW06, rW13, 17, 11,
+ 0x9e79, 0x260f, 0x256d, -27941)
+ lwz rW12,8(rHP)
+ R_48_63(rH0, rH1, rH2, rH3, rW04, rW11, 26, 22,
+ 0xab75, -20775, 0x4f9e, -28397)
+ lwz rW10,12(rHP)
+ R_48_63(rH2, rH3, rH0, rH1, rW02, rW09, 17, 11,
+ 0x662b, 0x7c56, 0x11b2, 0x0358)
+
+ add rH0,rH0,rW00
+ stw rH0,0(rHP)
+ add rH1,rH1,rW14
+ stw rH1,4(rHP)
+ add rH2,rH2,rW12
+ stw rH2,8(rHP)
+ add rH3,rH3,rW10
+ stw rH3,12(rHP)
+ NEXT_BLOCK
+
+ bdnz ppc_md5_main
+
+ FINALIZE
+ blr
diff --git a/arch/powerpc/crypto/md5-glue.c b/arch/powerpc/crypto/md5-glue.c
new file mode 100644
index 000000000..7e44cec37
--- /dev/null
+++ b/arch/powerpc/crypto/md5-glue.c
@@ -0,0 +1,164 @@
+/*
+ * Glue code for MD5 implementation for PPC assembler
+ *
+ * Based on generic implementation.
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/md5.h>
+#include <asm/byteorder.h>
+
+extern void ppc_md5_transform(u32 *state, const u8 *src, u32 blocks);
+
+static inline void ppc_md5_clear_context(struct md5_state *sctx)
+{
+ int count = sizeof(struct md5_state) >> 2;
+ u32 *ptr = (u32 *)sctx;
+
+ /* make sure we can clear the fast way */
+ BUILD_BUG_ON(sizeof(struct md5_state) % 4);
+ do { *ptr++ = 0; } while (--count);
+}
+
+static int ppc_md5_init(struct shash_desc *desc)
+{
+ struct md5_state *sctx = shash_desc_ctx(desc);
+
+ sctx->hash[0] = MD5_H0;
+ sctx->hash[1] = MD5_H1;
+ sctx->hash[2] = MD5_H2;
+ sctx->hash[3] = MD5_H3;
+ sctx->byte_count = 0;
+
+ return 0;
+}
+
+static int ppc_md5_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct md5_state *sctx = shash_desc_ctx(desc);
+ const unsigned int offset = sctx->byte_count & 0x3f;
+ unsigned int avail = 64 - offset;
+ const u8 *src = data;
+
+ sctx->byte_count += len;
+
+ if (avail > len) {
+ memcpy((char *)sctx->block + offset, src, len);
+ return 0;
+ }
+
+ if (offset) {
+ memcpy((char *)sctx->block + offset, src, avail);
+ ppc_md5_transform(sctx->hash, (const u8 *)sctx->block, 1);
+ len -= avail;
+ src += avail;
+ }
+
+ if (len > 63) {
+ ppc_md5_transform(sctx->hash, src, len >> 6);
+ src += len & ~0x3f;
+ len &= 0x3f;
+ }
+
+ memcpy((char *)sctx->block, src, len);
+ return 0;
+}
+
+static int ppc_md5_final(struct shash_desc *desc, u8 *out)
+{
+ struct md5_state *sctx = shash_desc_ctx(desc);
+ const unsigned int offset = sctx->byte_count & 0x3f;
+ const u8 *src = (const u8 *)sctx->block;
+ u8 *p = (u8 *)src + offset;
+ int padlen = 55 - offset;
+ __le64 *pbits = (__le64 *)((char *)sctx->block + 56);
+ __le32 *dst = (__le32 *)out;
+
+ *p++ = 0x80;
+
+ if (padlen < 0) {
+ memset(p, 0x00, padlen + sizeof (u64));
+ ppc_md5_transform(sctx->hash, src, 1);
+ p = (char *)sctx->block;
+ padlen = 56;
+ }
+
+ memset(p, 0, padlen);
+ *pbits = cpu_to_le64(sctx->byte_count << 3);
+ ppc_md5_transform(sctx->hash, src, 1);
+
+ dst[0] = cpu_to_le32(sctx->hash[0]);
+ dst[1] = cpu_to_le32(sctx->hash[1]);
+ dst[2] = cpu_to_le32(sctx->hash[2]);
+ dst[3] = cpu_to_le32(sctx->hash[3]);
+
+ ppc_md5_clear_context(sctx);
+ return 0;
+}
+
+static int ppc_md5_export(struct shash_desc *desc, void *out)
+{
+ struct md5_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+ return 0;
+}
+
+static int ppc_md5_import(struct shash_desc *desc, const void *in)
+{
+ struct md5_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+ return 0;
+}
+
+static struct shash_alg alg = {
+ .digestsize = MD5_DIGEST_SIZE,
+ .init = ppc_md5_init,
+ .update = ppc_md5_update,
+ .final = ppc_md5_final,
+ .export = ppc_md5_export,
+ .import = ppc_md5_import,
+ .descsize = sizeof(struct md5_state),
+ .statesize = sizeof(struct md5_state),
+ .base = {
+ .cra_name = "md5",
+ .cra_driver_name= "md5-ppc",
+ .cra_priority = 200,
+ .cra_blocksize = MD5_HMAC_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init ppc_md5_mod_init(void)
+{
+ return crypto_register_shash(&alg);
+}
+
+static void __exit ppc_md5_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(ppc_md5_mod_init);
+module_exit(ppc_md5_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MD5 Secure Hash Algorithm, PPC assembler");
+
+MODULE_ALIAS_CRYPTO("md5");
+MODULE_ALIAS_CRYPTO("md5-ppc");
diff --git a/arch/powerpc/crypto/sha1-powerpc-asm.S b/arch/powerpc/crypto/sha1-powerpc-asm.S
new file mode 100644
index 000000000..23e248bef
--- /dev/null
+++ b/arch/powerpc/crypto/sha1-powerpc-asm.S
@@ -0,0 +1,190 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SHA-1 implementation for PowerPC.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+
+#ifdef __BIG_ENDIAN__
+#define LWZ(rt, d, ra) \
+ lwz rt,d(ra)
+#else
+#define LWZ(rt, d, ra) \
+ li rt,d; \
+ lwbrx rt,rt,ra
+#endif
+
+/*
+ * We roll the registers for T, A, B, C, D, E around on each
+ * iteration; T on iteration t is A on iteration t+1, and so on.
+ * We use registers 7 - 12 for this.
+ */
+#define RT(t) ((((t)+5)%6)+7)
+#define RA(t) ((((t)+4)%6)+7)
+#define RB(t) ((((t)+3)%6)+7)
+#define RC(t) ((((t)+2)%6)+7)
+#define RD(t) ((((t)+1)%6)+7)
+#define RE(t) ((((t)+0)%6)+7)
+
+/* We use registers 16 - 31 for the W values */
+#define W(t) (((t)%16)+16)
+
+#define LOADW(t) \
+ LWZ(W(t),(t)*4,r4)
+
+#define STEPD0_LOAD(t) \
+ andc r0,RD(t),RB(t); \
+ and r6,RB(t),RC(t); \
+ rotlwi RT(t),RA(t),5; \
+ or r6,r6,r0; \
+ add r0,RE(t),r15; \
+ add RT(t),RT(t),r6; \
+ add r14,r0,W(t); \
+ LWZ(W((t)+4),((t)+4)*4,r4); \
+ rotlwi RB(t),RB(t),30; \
+ add RT(t),RT(t),r14
+
+#define STEPD0_UPDATE(t) \
+ and r6,RB(t),RC(t); \
+ andc r0,RD(t),RB(t); \
+ rotlwi RT(t),RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ or r6,r6,r0; \
+ add r0,RE(t),r15; \
+ xor r5,W((t)+4-3),W((t)+4-8); \
+ add RT(t),RT(t),r6; \
+ xor W((t)+4),W((t)+4-16),W((t)+4-14); \
+ add r0,r0,W(t); \
+ xor W((t)+4),W((t)+4),r5; \
+ add RT(t),RT(t),r0; \
+ rotlwi W((t)+4),W((t)+4),1
+
+#define STEPD1(t) \
+ xor r6,RB(t),RC(t); \
+ rotlwi RT(t),RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ xor r6,r6,RD(t); \
+ add r0,RE(t),r15; \
+ add RT(t),RT(t),r6; \
+ add r0,r0,W(t); \
+ add RT(t),RT(t),r0
+
+#define STEPD1_UPDATE(t) \
+ xor r6,RB(t),RC(t); \
+ rotlwi RT(t),RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ xor r6,r6,RD(t); \
+ add r0,RE(t),r15; \
+ xor r5,W((t)+4-3),W((t)+4-8); \
+ add RT(t),RT(t),r6; \
+ xor W((t)+4),W((t)+4-16),W((t)+4-14); \
+ add r0,r0,W(t); \
+ xor W((t)+4),W((t)+4),r5; \
+ add RT(t),RT(t),r0; \
+ rotlwi W((t)+4),W((t)+4),1
+
+#define STEPD2_UPDATE(t) \
+ and r6,RB(t),RC(t); \
+ and r0,RB(t),RD(t); \
+ rotlwi RT(t),RA(t),5; \
+ or r6,r6,r0; \
+ rotlwi RB(t),RB(t),30; \
+ and r0,RC(t),RD(t); \
+ xor r5,W((t)+4-3),W((t)+4-8); \
+ or r6,r6,r0; \
+ xor W((t)+4),W((t)+4-16),W((t)+4-14); \
+ add r0,RE(t),r15; \
+ add RT(t),RT(t),r6; \
+ add r0,r0,W(t); \
+ xor W((t)+4),W((t)+4),r5; \
+ add RT(t),RT(t),r0; \
+ rotlwi W((t)+4),W((t)+4),1
+
+#define STEP0LD4(t) \
+ STEPD0_LOAD(t); \
+ STEPD0_LOAD((t)+1); \
+ STEPD0_LOAD((t)+2); \
+ STEPD0_LOAD((t)+3)
+
+#define STEPUP4(t, fn) \
+ STEP##fn##_UPDATE(t); \
+ STEP##fn##_UPDATE((t)+1); \
+ STEP##fn##_UPDATE((t)+2); \
+ STEP##fn##_UPDATE((t)+3)
+
+#define STEPUP20(t, fn) \
+ STEPUP4(t, fn); \
+ STEPUP4((t)+4, fn); \
+ STEPUP4((t)+8, fn); \
+ STEPUP4((t)+12, fn); \
+ STEPUP4((t)+16, fn)
+
+_GLOBAL(powerpc_sha_transform)
+ PPC_STLU r1,-INT_FRAME_SIZE(r1)
+ SAVE_8GPRS(14, r1)
+ SAVE_10GPRS(22, r1)
+
+ /* Load up A - E */
+ lwz RA(0),0(r3) /* A */
+ lwz RB(0),4(r3) /* B */
+ lwz RC(0),8(r3) /* C */
+ lwz RD(0),12(r3) /* D */
+ lwz RE(0),16(r3) /* E */
+
+ LOADW(0)
+ LOADW(1)
+ LOADW(2)
+ LOADW(3)
+
+ lis r15,0x5a82 /* K0-19 */
+ ori r15,r15,0x7999
+ STEP0LD4(0)
+ STEP0LD4(4)
+ STEP0LD4(8)
+ STEPUP4(12, D0)
+ STEPUP4(16, D0)
+
+ lis r15,0x6ed9 /* K20-39 */
+ ori r15,r15,0xeba1
+ STEPUP20(20, D1)
+
+ lis r15,0x8f1b /* K40-59 */
+ ori r15,r15,0xbcdc
+ STEPUP20(40, D2)
+
+ lis r15,0xca62 /* K60-79 */
+ ori r15,r15,0xc1d6
+ STEPUP4(60, D1)
+ STEPUP4(64, D1)
+ STEPUP4(68, D1)
+ STEPUP4(72, D1)
+ lwz r20,16(r3)
+ STEPD1(76)
+ lwz r19,12(r3)
+ STEPD1(77)
+ lwz r18,8(r3)
+ STEPD1(78)
+ lwz r17,4(r3)
+ STEPD1(79)
+
+ lwz r16,0(r3)
+ add r20,RE(80),r20
+ add RD(0),RD(80),r19
+ add RC(0),RC(80),r18
+ add RB(0),RB(80),r17
+ add RA(0),RA(80),r16
+ mr RE(0),r20
+ stw RA(0),0(r3)
+ stw RB(0),4(r3)
+ stw RC(0),8(r3)
+ stw RD(0),12(r3)
+ stw RE(0),16(r3)
+
+ REST_8GPRS(14, r1)
+ REST_10GPRS(22, r1)
+ addi r1,r1,INT_FRAME_SIZE
+ blr
diff --git a/arch/powerpc/crypto/sha1-spe-asm.S b/arch/powerpc/crypto/sha1-spe-asm.S
new file mode 100644
index 000000000..fcb6cf002
--- /dev/null
+++ b/arch/powerpc/crypto/sha1-spe-asm.S
@@ -0,0 +1,299 @@
+/*
+ * Fast SHA-1 implementation for SPE instruction set (PPC)
+ *
+ * This code makes use of the SPE SIMD instruction set as defined in
+ * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
+ * Implementation is based on optimization guide notes from
+ * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+#define rHP r3 /* pointer to hash value */
+#define rWP r4 /* pointer to input */
+#define rKP r5 /* pointer to constants */
+
+#define rW0 r14 /* 64 bit round words */
+#define rW1 r15
+#define rW2 r16
+#define rW3 r17
+#define rW4 r18
+#define rW5 r19
+#define rW6 r20
+#define rW7 r21
+
+#define rH0 r6 /* 32 bit hash values */
+#define rH1 r7
+#define rH2 r8
+#define rH3 r9
+#define rH4 r10
+
+#define rT0 r22 /* 64 bit temporary */
+#define rT1 r0 /* 32 bit temporaries */
+#define rT2 r11
+#define rT3 r12
+
+#define rK r23 /* 64 bit constant in volatile register */
+
+#define LOAD_K01
+
+#define LOAD_K11 \
+ evlwwsplat rK,0(rKP);
+
+#define LOAD_K21 \
+ evlwwsplat rK,4(rKP);
+
+#define LOAD_K31 \
+ evlwwsplat rK,8(rKP);
+
+#define LOAD_K41 \
+ evlwwsplat rK,12(rKP);
+
+#define INITIALIZE \
+ stwu r1,-128(r1); /* create stack frame */ \
+ evstdw r14,8(r1); /* We must save non volatile */ \
+ evstdw r15,16(r1); /* registers. Take the chance */ \
+ evstdw r16,24(r1); /* and save the SPE part too */ \
+ evstdw r17,32(r1); \
+ evstdw r18,40(r1); \
+ evstdw r19,48(r1); \
+ evstdw r20,56(r1); \
+ evstdw r21,64(r1); \
+ evstdw r22,72(r1); \
+ evstdw r23,80(r1);
+
+
+#define FINALIZE \
+ evldw r14,8(r1); /* restore SPE registers */ \
+ evldw r15,16(r1); \
+ evldw r16,24(r1); \
+ evldw r17,32(r1); \
+ evldw r18,40(r1); \
+ evldw r19,48(r1); \
+ evldw r20,56(r1); \
+ evldw r21,64(r1); \
+ evldw r22,72(r1); \
+ evldw r23,80(r1); \
+ xor r0,r0,r0; \
+ stw r0,8(r1); /* Delete sensitive data */ \
+ stw r0,16(r1); /* that we might have pushed */ \
+ stw r0,24(r1); /* from other context that runs */ \
+ stw r0,32(r1); /* the same code. Assume that */ \
+ stw r0,40(r1); /* the lower part of the GPRs */ \
+ stw r0,48(r1); /* were already overwritten on */ \
+ stw r0,56(r1); /* the way down to here */ \
+ stw r0,64(r1); \
+ stw r0,72(r1); \
+ stw r0,80(r1); \
+ addi r1,r1,128; /* cleanup stack frame */
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+ lwz reg,off(rWP); /* load data */
+#define NEXT_BLOCK \
+ addi rWP,rWP,64; /* increment per block */
+#else
+#define LOAD_DATA(reg, off) \
+ lwbrx reg,0,rWP; /* load data */ \
+ addi rWP,rWP,4; /* increment per word */
+#define NEXT_BLOCK /* nothing to do */
+#endif
+
+#define R_00_15(a, b, c, d, e, w0, w1, k, off) \
+ LOAD_DATA(w0, off) /* 1: W */ \
+ and rT2,b,c; /* 1: F' = B and C */ \
+ LOAD_K##k##1 \
+ andc rT1,d,b; /* 1: F" = ~B and D */ \
+ rotrwi rT0,a,27; /* 1: A' = A rotl 5 */ \
+ or rT2,rT2,rT1; /* 1: F = F' or F" */ \
+ add e,e,rT0; /* 1: E = E + A' */ \
+ rotrwi b,b,2; /* 1: B = B rotl 30 */ \
+ add e,e,w0; /* 1: E = E + W */ \
+ LOAD_DATA(w1, off+4) /* 2: W */ \
+ add e,e,rT2; /* 1: E = E + F */ \
+ and rT1,a,b; /* 2: F' = B and C */ \
+ add e,e,rK; /* 1: E = E + K */ \
+ andc rT2,c,a; /* 2: F" = ~B and D */ \
+ add d,d,rK; /* 2: E = E + K */ \
+ or rT2,rT2,rT1; /* 2: F = F' or F" */ \
+ rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
+ add d,d,w1; /* 2: E = E + W */ \
+ rotrwi a,a,2; /* 2: B = B rotl 30 */ \
+ add d,d,rT0; /* 2: E = E + A' */ \
+ evmergelo w1,w1,w0; /* mix W[0]/W[1] */ \
+ add d,d,rT2 /* 2: E = E + F */
+
+#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+ and rT2,b,c; /* 1: F' = B and C */ \
+ evmergelohi rT0,w7,w6; /* W[-3] */ \
+ andc rT1,d,b; /* 1: F" = ~B and D */ \
+ evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
+ or rT1,rT1,rT2; /* 1: F = F' or F" */ \
+ evxor w0,w0,w4; /* W = W xor W[-8] */ \
+ add e,e,rT1; /* 1: E = E + F */ \
+ evxor w0,w0,w1; /* W = W xor W[-14] */ \
+ rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
+ evrlwi w0,w0,1; /* W = W rotl 1 */ \
+ add e,e,rT2; /* 1: E = E + A' */ \
+ evaddw rT0,w0,rK; /* WK = W + K */ \
+ rotrwi b,b,2; /* 1: B = B rotl 30 */ \
+ LOAD_K##k##1 \
+ evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
+ add e,e,rT0; /* 1: E = E + WK */ \
+ add d,d,rT1; /* 2: E = E + WK */ \
+ and rT2,a,b; /* 2: F' = B and C */ \
+ andc rT1,c,a; /* 2: F" = ~B and D */ \
+ rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
+ or rT1,rT1,rT2; /* 2: F = F' or F" */ \
+ add d,d,rT0; /* 2: E = E + A' */ \
+ rotrwi a,a,2; /* 2: B = B rotl 30 */ \
+ add d,d,rT1 /* 2: E = E + F */
+
+#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+ evmergelohi rT0,w7,w6; /* W[-3] */ \
+ xor rT2,b,c; /* 1: F' = B xor C */ \
+ evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
+ xor rT2,rT2,d; /* 1: F = F' xor D */ \
+ evxor w0,w0,w4; /* W = W xor W[-8] */ \
+ add e,e,rT2; /* 1: E = E + F */ \
+ evxor w0,w0,w1; /* W = W xor W[-14] */ \
+ rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
+ evrlwi w0,w0,1; /* W = W rotl 1 */ \
+ add e,e,rT2; /* 1: E = E + A' */ \
+ evaddw rT0,w0,rK; /* WK = W + K */ \
+ rotrwi b,b,2; /* 1: B = B rotl 30 */ \
+ LOAD_K##k##1 \
+ evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
+ add e,e,rT0; /* 1: E = E + WK */ \
+ xor rT2,a,b; /* 2: F' = B xor C */ \
+ add d,d,rT1; /* 2: E = E + WK */ \
+ xor rT2,rT2,c; /* 2: F = F' xor D */ \
+ rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
+ add d,d,rT2; /* 2: E = E + F */ \
+ rotrwi a,a,2; /* 2: B = B rotl 30 */ \
+ add d,d,rT0 /* 2: E = E + A' */
+
+#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+ and rT2,b,c; /* 1: F' = B and C */ \
+ evmergelohi rT0,w7,w6; /* W[-3] */ \
+ or rT1,b,c; /* 1: F" = B or C */ \
+ evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
+ and rT1,d,rT1; /* 1: F" = F" and D */ \
+ evxor w0,w0,w4; /* W = W xor W[-8] */ \
+ or rT2,rT2,rT1; /* 1: F = F' or F" */ \
+ evxor w0,w0,w1; /* W = W xor W[-14] */ \
+ add e,e,rT2; /* 1: E = E + F */ \
+ evrlwi w0,w0,1; /* W = W rotl 1 */ \
+ rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
+ evaddw rT0,w0,rK; /* WK = W + K */ \
+ add e,e,rT2; /* 1: E = E + A' */ \
+ LOAD_K##k##1 \
+ evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
+ rotrwi b,b,2; /* 1: B = B rotl 30 */ \
+ add e,e,rT0; /* 1: E = E + WK */ \
+ and rT2,a,b; /* 2: F' = B and C */ \
+ or rT0,a,b; /* 2: F" = B or C */ \
+ add d,d,rT1; /* 2: E = E + WK */ \
+ and rT0,c,rT0; /* 2: F" = F" and D */ \
+ rotrwi a,a,2; /* 2: B = B rotl 30 */ \
+ or rT2,rT2,rT0; /* 2: F = F' or F" */ \
+ rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
+ add d,d,rT2; /* 2: E = E + F */ \
+ add d,d,rT0 /* 2: E = E + A' */
+
+#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+ R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
+
+_GLOBAL(ppc_spe_sha1_transform)
+ INITIALIZE
+
+ lwz rH0,0(rHP)
+ lwz rH1,4(rHP)
+ mtctr r5
+ lwz rH2,8(rHP)
+ lis rKP,PPC_SPE_SHA1_K@h
+ lwz rH3,12(rHP)
+ ori rKP,rKP,PPC_SPE_SHA1_K@l
+ lwz rH4,16(rHP)
+
+ppc_spe_sha1_main:
+ R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
+ R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
+ R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
+ R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
+ R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
+ R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
+ R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
+ R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
+
+ R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
+ R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
+
+ R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
+ R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
+ R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
+ R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
+ R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
+ R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
+ R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
+ R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
+ R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
+ R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
+
+ R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
+ R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
+ R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
+ R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
+ R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
+ R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
+ R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
+ R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
+ R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
+ R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
+
+ R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
+ R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
+ R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
+ R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
+ R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
+ R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
+ R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
+ lwz rT3,0(rHP)
+ R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
+ lwz rW1,4(rHP)
+ R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
+ lwz rW2,8(rHP)
+ R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
+ lwz rW3,12(rHP)
+ NEXT_BLOCK
+ lwz rW4,16(rHP)
+
+ add rH0,rH0,rT3
+ stw rH0,0(rHP)
+ add rH1,rH1,rW1
+ stw rH1,4(rHP)
+ add rH2,rH2,rW2
+ stw rH2,8(rHP)
+ add rH3,rH3,rW3
+ stw rH3,12(rHP)
+ add rH4,rH4,rW4
+ stw rH4,16(rHP)
+
+ bdnz ppc_spe_sha1_main
+
+ FINALIZE
+ blr
+
+.data
+.align 4
+PPC_SPE_SHA1_K:
+ .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c
new file mode 100644
index 000000000..9e1814d99
--- /dev/null
+++ b/arch/powerpc/crypto/sha1-spe-glue.c
@@ -0,0 +1,210 @@
+/*
+ * Glue code for SHA-1 implementation for SPE instructions (PPC)
+ *
+ * Based on generic implementation.
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/switch_to.h>
+#include <linux/hardirq.h>
+
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA1 takes ~1000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 2KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses included. Even with the low end model clocked
+ * at 667 MHz this equals to a critical time window of less than 27us.
+ *
+ */
+#define MAX_BYTES 2048
+
+extern void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks);
+
+static void spe_begin(void)
+{
+ /* We just start SPE operations and will save SPE registers later. */
+ preempt_disable();
+ enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+ disable_kernel_spe();
+ /* reenable preemption */
+ preempt_enable();
+}
+
+static inline void ppc_sha1_clear_context(struct sha1_state *sctx)
+{
+ int count = sizeof(struct sha1_state) >> 2;
+ u32 *ptr = (u32 *)sctx;
+
+ /* make sure we can clear the fast way */
+ BUILD_BUG_ON(sizeof(struct sha1_state) % 4);
+ do { *ptr++ = 0; } while (--count);
+}
+
+static int ppc_spe_sha1_init(struct shash_desc *desc)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA1_H0;
+ sctx->state[1] = SHA1_H1;
+ sctx->state[2] = SHA1_H2;
+ sctx->state[3] = SHA1_H3;
+ sctx->state[4] = SHA1_H4;
+ sctx->count = 0;
+
+ return 0;
+}
+
+static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ const unsigned int offset = sctx->count & 0x3f;
+ const unsigned int avail = 64 - offset;
+ unsigned int bytes;
+ const u8 *src = data;
+
+ if (avail > len) {
+ sctx->count += len;
+ memcpy((char *)sctx->buffer + offset, src, len);
+ return 0;
+ }
+
+ sctx->count += len;
+
+ if (offset) {
+ memcpy((char *)sctx->buffer + offset, src, avail);
+
+ spe_begin();
+ ppc_spe_sha1_transform(sctx->state, (const u8 *)sctx->buffer, 1);
+ spe_end();
+
+ len -= avail;
+ src += avail;
+ }
+
+ while (len > 63) {
+ bytes = (len > MAX_BYTES) ? MAX_BYTES : len;
+ bytes = bytes & ~0x3f;
+
+ spe_begin();
+ ppc_spe_sha1_transform(sctx->state, src, bytes >> 6);
+ spe_end();
+
+ src += bytes;
+ len -= bytes;
+ };
+
+ memcpy((char *)sctx->buffer, src, len);
+ return 0;
+}
+
+static int ppc_spe_sha1_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ const unsigned int offset = sctx->count & 0x3f;
+ char *p = (char *)sctx->buffer + offset;
+ int padlen;
+ __be64 *pbits = (__be64 *)(((char *)&sctx->buffer) + 56);
+ __be32 *dst = (__be32 *)out;
+
+ padlen = 55 - offset;
+ *p++ = 0x80;
+
+ spe_begin();
+
+ if (padlen < 0) {
+ memset(p, 0x00, padlen + sizeof (u64));
+ ppc_spe_sha1_transform(sctx->state, sctx->buffer, 1);
+ p = (char *)sctx->buffer;
+ padlen = 56;
+ }
+
+ memset(p, 0, padlen);
+ *pbits = cpu_to_be64(sctx->count << 3);
+ ppc_spe_sha1_transform(sctx->state, sctx->buffer, 1);
+
+ spe_end();
+
+ dst[0] = cpu_to_be32(sctx->state[0]);
+ dst[1] = cpu_to_be32(sctx->state[1]);
+ dst[2] = cpu_to_be32(sctx->state[2]);
+ dst[3] = cpu_to_be32(sctx->state[3]);
+ dst[4] = cpu_to_be32(sctx->state[4]);
+
+ ppc_sha1_clear_context(sctx);
+ return 0;
+}
+
+static int ppc_spe_sha1_export(struct shash_desc *desc, void *out)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+ return 0;
+}
+
+static int ppc_spe_sha1_import(struct shash_desc *desc, const void *in)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+ return 0;
+}
+
+static struct shash_alg alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = ppc_spe_sha1_init,
+ .update = ppc_spe_sha1_update,
+ .final = ppc_spe_sha1_final,
+ .export = ppc_spe_sha1_export,
+ .import = ppc_spe_sha1_import,
+ .descsize = sizeof(struct sha1_state),
+ .statesize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name= "sha1-ppc-spe",
+ .cra_priority = 300,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init ppc_spe_sha1_mod_init(void)
+{
+ return crypto_register_shash(&alg);
+}
+
+static void __exit ppc_spe_sha1_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(ppc_spe_sha1_mod_init);
+module_exit(ppc_spe_sha1_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, SPE optimized");
+
+MODULE_ALIAS_CRYPTO("sha1");
+MODULE_ALIAS_CRYPTO("sha1-ppc-spe");
diff --git a/arch/powerpc/crypto/sha1.c b/arch/powerpc/crypto/sha1.c
new file mode 100644
index 000000000..3911d5c25
--- /dev/null
+++ b/arch/powerpc/crypto/sha1.c
@@ -0,0 +1,157 @@
+/*
+ * Cryptographic API.
+ *
+ * powerpc implementation of the SHA1 Secure Hash Algorithm.
+ *
+ * Derived from cryptoapi implementation, adapted for in-place
+ * scatterlist interface.
+ *
+ * Derived from "crypto/sha1.c"
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+
+extern void powerpc_sha_transform(u32 *state, const u8 *src, u32 *temp);
+
+static int sha1_init(struct shash_desc *desc)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ *sctx = (struct sha1_state){
+ .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+ };
+
+ return 0;
+}
+
+static int sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial, done;
+ const u8 *src;
+
+ partial = sctx->count & 0x3f;
+ sctx->count += len;
+ done = 0;
+ src = data;
+
+ if ((partial + len) > 63) {
+ u32 temp[SHA_WORKSPACE_WORDS];
+
+ if (partial) {
+ done = -partial;
+ memcpy(sctx->buffer + partial, data, done + 64);
+ src = sctx->buffer;
+ }
+
+ do {
+ powerpc_sha_transform(sctx->state, src, temp);
+ done += 64;
+ src = data + done;
+ } while (done + 63 < len);
+
+ memzero_explicit(temp, sizeof(temp));
+ partial = 0;
+ }
+ memcpy(sctx->buffer + partial, src, len - done);
+
+ return 0;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha1_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ __be32 *dst = (__be32 *)out;
+ u32 i, index, padlen;
+ __be64 bits;
+ static const u8 padding[64] = { 0x80, };
+
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 */
+ index = sctx->count & 0x3f;
+ padlen = (index < 56) ? (56 - index) : ((64+56) - index);
+ sha1_update(desc, padding, padlen);
+
+ /* Append length */
+ sha1_update(desc, (const u8 *)&bits, sizeof(bits));
+
+ /* Store state in digest */
+ for (i = 0; i < 5; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof *sctx);
+
+ return 0;
+}
+
+static int sha1_export(struct shash_desc *desc, void *out)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+ return 0;
+}
+
+static int sha1_import(struct shash_desc *desc, const void *in)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+ return 0;
+}
+
+static struct shash_alg alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_init,
+ .update = sha1_update,
+ .final = sha1_final,
+ .export = sha1_export,
+ .import = sha1_import,
+ .descsize = sizeof(struct sha1_state),
+ .statesize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name= "sha1-powerpc",
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init sha1_powerpc_mod_init(void)
+{
+ return crypto_register_shash(&alg);
+}
+
+static void __exit sha1_powerpc_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(sha1_powerpc_mod_init);
+module_exit(sha1_powerpc_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm");
+
+MODULE_ALIAS_CRYPTO("sha1");
+MODULE_ALIAS_CRYPTO("sha1-powerpc");
diff --git a/arch/powerpc/crypto/sha256-spe-asm.S b/arch/powerpc/crypto/sha256-spe-asm.S
new file mode 100644
index 000000000..2d10e4c08
--- /dev/null
+++ b/arch/powerpc/crypto/sha256-spe-asm.S
@@ -0,0 +1,323 @@
+/*
+ * Fast SHA-256 implementation for SPE instruction set (PPC)
+ *
+ * This code makes use of the SPE SIMD instruction set as defined in
+ * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
+ * Implementation is based on optimization guide notes from
+ * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+#define rHP r3 /* pointer to hash values in memory */
+#define rKP r24 /* pointer to round constants */
+#define rWP r4 /* pointer to input data */
+
+#define rH0 r5 /* 8 32 bit hash values in 8 registers */
+#define rH1 r6
+#define rH2 r7
+#define rH3 r8
+#define rH4 r9
+#define rH5 r10
+#define rH6 r11
+#define rH7 r12
+
+#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */
+#define rW1 r15
+#define rW2 r16
+#define rW3 r17
+#define rW4 r18
+#define rW5 r19
+#define rW6 r20
+#define rW7 r21
+
+#define rT0 r22 /* 64 bit temporaries */
+#define rT1 r23
+#define rT2 r0 /* 32 bit temporaries */
+#define rT3 r25
+
+#define CMP_KN_LOOP
+#define CMP_KC_LOOP \
+ cmpwi rT1,0;
+
+#define INITIALIZE \
+ stwu r1,-128(r1); /* create stack frame */ \
+ evstdw r14,8(r1); /* We must save non volatile */ \
+ evstdw r15,16(r1); /* registers. Take the chance */ \
+ evstdw r16,24(r1); /* and save the SPE part too */ \
+ evstdw r17,32(r1); \
+ evstdw r18,40(r1); \
+ evstdw r19,48(r1); \
+ evstdw r20,56(r1); \
+ evstdw r21,64(r1); \
+ evstdw r22,72(r1); \
+ evstdw r23,80(r1); \
+ stw r24,88(r1); /* save normal registers */ \
+ stw r25,92(r1);
+
+
+#define FINALIZE \
+ evldw r14,8(r1); /* restore SPE registers */ \
+ evldw r15,16(r1); \
+ evldw r16,24(r1); \
+ evldw r17,32(r1); \
+ evldw r18,40(r1); \
+ evldw r19,48(r1); \
+ evldw r20,56(r1); \
+ evldw r21,64(r1); \
+ evldw r22,72(r1); \
+ evldw r23,80(r1); \
+ lwz r24,88(r1); /* restore normal registers */ \
+ lwz r25,92(r1); \
+ xor r0,r0,r0; \
+ stw r0,8(r1); /* Delete sensitive data */ \
+ stw r0,16(r1); /* that we might have pushed */ \
+ stw r0,24(r1); /* from other context that runs */ \
+ stw r0,32(r1); /* the same code. Assume that */ \
+ stw r0,40(r1); /* the lower part of the GPRs */ \
+ stw r0,48(r1); /* was already overwritten on */ \
+ stw r0,56(r1); /* the way down to here */ \
+ stw r0,64(r1); \
+ stw r0,72(r1); \
+ stw r0,80(r1); \
+ addi r1,r1,128; /* cleanup stack frame */
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+ lwz reg,off(rWP); /* load data */
+#define NEXT_BLOCK \
+ addi rWP,rWP,64; /* increment per block */
+#else
+#define LOAD_DATA(reg, off) \
+ lwbrx reg,0,rWP; /* load data */ \
+ addi rWP,rWP,4; /* increment per word */
+#define NEXT_BLOCK /* nothing to do */
+#endif
+
+#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
+ LOAD_DATA(w, off) /* 1: W */ \
+ rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \
+ rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \
+ rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \
+ xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \
+ and rT3,e,f; /* 1: ch = e and f */ \
+ xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \
+ andc rT1,g,e; /* 1: ch' = ~e and g */ \
+ lwz rT2,off(rKP); /* 1: K */ \
+ xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \
+ add h,h,rT0; /* 1: temp1 = h + S1 */ \
+ add rT3,rT3,w; /* 1: temp1' = ch + w */ \
+ rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \
+ add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \
+ rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \
+ add h,h,rT2; /* 1: temp1 = temp1 + K */ \
+ rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \
+ xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \
+ add d,d,h; /* 1: d = d + temp1 */ \
+ xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \
+ evmergelo w,w,w; /* shift W */ \
+ or rT2,a,b; /* 1: maj = a or b */ \
+ and rT1,a,b; /* 1: maj' = a and b */ \
+ and rT2,rT2,c; /* 1: maj = maj and c */ \
+ LOAD_DATA(w, off+4) /* 2: W */ \
+ or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \
+ rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
+ add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \
+ rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
+ add h,h,rT3; /* 1: h = temp1 + temp2 */ \
+ rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \
+ xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
+ and rT3,d,e; /* 2: ch = e and f */ \
+ xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
+ andc rT1,f,d; /* 2: ch' = ~e and g */ \
+ lwz rT2,off+4(rKP); /* 2: K */ \
+ xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
+ add g,g,rT0; /* 2: temp1 = h + S1 */ \
+ add rT3,rT3,w; /* 2: temp1' = ch + w */ \
+ rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
+ add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \
+ rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
+ add g,g,rT2; /* 2: temp1 = temp1 + K */ \
+ rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
+ xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
+ or rT2,h,a; /* 2: maj = a or b */ \
+ xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
+ and rT1,h,a; /* 2: maj' = a and b */ \
+ and rT2,rT2,b; /* 2: maj = maj and c */ \
+ add c,c,g; /* 2: d = d + temp1 */ \
+ or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
+ add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
+ add g,g,rT3 /* 2: h = temp1 + temp2 */
+
+#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
+ rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \
+ evmergelohi rT0,w0,w1; /* w[-15] */ \
+ rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \
+ evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \
+ xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
+ evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \
+ rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \
+ evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \
+ xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
+ evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \
+ add h,h,rT2; /* 1: temp1 = h + S1 */ \
+ evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \
+ and rT2,e,f; /* 1: ch = e and f */ \
+ evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \
+ andc rT3,g,e; /* 1: ch' = ~e and g */ \
+ evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \
+ xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \
+ evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \
+ add h,h,rT2; /* 1: temp1 = temp1 + ch */ \
+ evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
+ rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \
+ evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \
+ rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \
+ evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
+ xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
+ evldw rT1,off(rKP); /* k */ \
+ rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \
+ evaddw w0,w0,rT0; /* w = w + s1 */ \
+ xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
+ evmergelohi rT0,w4,w5; /* w[-7] */ \
+ and rT3,a,b; /* 1: maj = a and b */ \
+ evaddw w0,w0,rT0; /* w = w + w[-7] */ \
+ CMP_K##k##_LOOP \
+ add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \
+ evaddw rT1,rT1,w0; /* wk = w + k */ \
+ xor rT3,a,b; /* 1: maj = a xor b */ \
+ evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \
+ and rT3,rT3,c; /* 1: maj = maj and c */ \
+ add h,h,rT0; /* 1: temp1 = temp1 + wk */ \
+ add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \
+ add g,g,rT1; /* 2: temp1 = temp1 + wk */ \
+ add d,d,h; /* 1: d = d + temp1 */ \
+ rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
+ add h,h,rT2; /* 1: h = temp1 + temp2 */ \
+ rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
+ rotrwi rT2,d,25; /* 2: S" = e rotr 25 */ \
+ xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
+ and rT3,d,e; /* 2: ch = e and f */ \
+ xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
+ andc rT1,f,d; /* 2: ch' = ~e and g */ \
+ add g,g,rT0; /* 2: temp1 = h + S1 */ \
+ xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
+ rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
+ add g,g,rT3; /* 2: temp1 = temp1 + ch */ \
+ rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
+ rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
+ xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
+ or rT2,h,a; /* 2: maj = a or b */ \
+ and rT1,h,a; /* 2: maj' = a and b */ \
+ and rT2,rT2,b; /* 2: maj = maj and c */ \
+ xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
+ or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
+ add c,c,g; /* 2: d = d + temp1 */ \
+ add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
+ add g,g,rT3 /* 2: h = temp1 + temp2 */
+
+_GLOBAL(ppc_spe_sha256_transform)
+ INITIALIZE
+
+ mtctr r5
+ lwz rH0,0(rHP)
+ lwz rH1,4(rHP)
+ lwz rH2,8(rHP)
+ lwz rH3,12(rHP)
+ lwz rH4,16(rHP)
+ lwz rH5,20(rHP)
+ lwz rH6,24(rHP)
+ lwz rH7,28(rHP)
+
+ppc_spe_sha256_main:
+ lis rKP,PPC_SPE_SHA256_K@ha
+ addi rKP,rKP,PPC_SPE_SHA256_K@l
+
+ R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
+ R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
+ R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
+ R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
+ R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
+ R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
+ R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
+ R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
+ppc_spe_sha256_16_rounds:
+ addi rKP,rKP,64
+ R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
+ rW0, rW1, rW4, rW5, rW7, N, 0)
+ R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
+ rW1, rW2, rW5, rW6, rW0, N, 8)
+ R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
+ rW2, rW3, rW6, rW7, rW1, N, 16)
+ R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
+ rW3, rW4, rW7, rW0, rW2, N, 24)
+ R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
+ rW4, rW5, rW0, rW1, rW3, N, 32)
+ R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
+ rW5, rW6, rW1, rW2, rW4, N, 40)
+ R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
+ rW6, rW7, rW2, rW3, rW5, N, 48)
+ R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
+ rW7, rW0, rW3, rW4, rW6, C, 56)
+ bt gt,ppc_spe_sha256_16_rounds
+
+ lwz rW0,0(rHP)
+ NEXT_BLOCK
+ lwz rW1,4(rHP)
+ lwz rW2,8(rHP)
+ lwz rW3,12(rHP)
+ lwz rW4,16(rHP)
+ lwz rW5,20(rHP)
+ lwz rW6,24(rHP)
+ lwz rW7,28(rHP)
+
+ add rH0,rH0,rW0
+ stw rH0,0(rHP)
+ add rH1,rH1,rW1
+ stw rH1,4(rHP)
+ add rH2,rH2,rW2
+ stw rH2,8(rHP)
+ add rH3,rH3,rW3
+ stw rH3,12(rHP)
+ add rH4,rH4,rW4
+ stw rH4,16(rHP)
+ add rH5,rH5,rW5
+ stw rH5,20(rHP)
+ add rH6,rH6,rW6
+ stw rH6,24(rHP)
+ add rH7,rH7,rW7
+ stw rH7,28(rHP)
+
+ bdnz ppc_spe_sha256_main
+
+ FINALIZE
+ blr
+
+.data
+.align 5
+PPC_SPE_SHA256_K:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
diff --git a/arch/powerpc/crypto/sha256-spe-glue.c b/arch/powerpc/crypto/sha256-spe-glue.c
new file mode 100644
index 000000000..6227888dc
--- /dev/null
+++ b/arch/powerpc/crypto/sha256-spe-glue.c
@@ -0,0 +1,274 @@
+/*
+ * Glue code for SHA-256 implementation for SPE instructions (PPC)
+ *
+ * Based on generic implementation. The assembler module takes care
+ * about the SPE registers so it can run from interrupt context.
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/switch_to.h>
+#include <linux/hardirq.h>
+
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 1KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses included. Even with the low end model clocked
+ * at 667 MHz this equals to a critical time window of less than 27us.
+ *
+ */
+#define MAX_BYTES 1024
+
+extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks);
+
+static void spe_begin(void)
+{
+ /* We just start SPE operations and will save SPE registers later. */
+ preempt_disable();
+ enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+ disable_kernel_spe();
+ /* reenable preemption */
+ preempt_enable();
+}
+
+static inline void ppc_sha256_clear_context(struct sha256_state *sctx)
+{
+ int count = sizeof(struct sha256_state) >> 2;
+ u32 *ptr = (u32 *)sctx;
+
+ /* make sure we can clear the fast way */
+ BUILD_BUG_ON(sizeof(struct sha256_state) % 4);
+ do { *ptr++ = 0; } while (--count);
+}
+
+static int ppc_spe_sha256_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA256_H0;
+ sctx->state[1] = SHA256_H1;
+ sctx->state[2] = SHA256_H2;
+ sctx->state[3] = SHA256_H3;
+ sctx->state[4] = SHA256_H4;
+ sctx->state[5] = SHA256_H5;
+ sctx->state[6] = SHA256_H6;
+ sctx->state[7] = SHA256_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+static int ppc_spe_sha224_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA224_H0;
+ sctx->state[1] = SHA224_H1;
+ sctx->state[2] = SHA224_H2;
+ sctx->state[3] = SHA224_H3;
+ sctx->state[4] = SHA224_H4;
+ sctx->state[5] = SHA224_H5;
+ sctx->state[6] = SHA224_H6;
+ sctx->state[7] = SHA224_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+static int ppc_spe_sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ const unsigned int offset = sctx->count & 0x3f;
+ const unsigned int avail = 64 - offset;
+ unsigned int bytes;
+ const u8 *src = data;
+
+ if (avail > len) {
+ sctx->count += len;
+ memcpy((char *)sctx->buf + offset, src, len);
+ return 0;
+ }
+
+ sctx->count += len;
+
+ if (offset) {
+ memcpy((char *)sctx->buf + offset, src, avail);
+
+ spe_begin();
+ ppc_spe_sha256_transform(sctx->state, (const u8 *)sctx->buf, 1);
+ spe_end();
+
+ len -= avail;
+ src += avail;
+ }
+
+ while (len > 63) {
+ /* cut input data into smaller blocks */
+ bytes = (len > MAX_BYTES) ? MAX_BYTES : len;
+ bytes = bytes & ~0x3f;
+
+ spe_begin();
+ ppc_spe_sha256_transform(sctx->state, src, bytes >> 6);
+ spe_end();
+
+ src += bytes;
+ len -= bytes;
+ };
+
+ memcpy((char *)sctx->buf, src, len);
+ return 0;
+}
+
+static int ppc_spe_sha256_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ const unsigned int offset = sctx->count & 0x3f;
+ char *p = (char *)sctx->buf + offset;
+ int padlen;
+ __be64 *pbits = (__be64 *)(((char *)&sctx->buf) + 56);
+ __be32 *dst = (__be32 *)out;
+
+ padlen = 55 - offset;
+ *p++ = 0x80;
+
+ spe_begin();
+
+ if (padlen < 0) {
+ memset(p, 0x00, padlen + sizeof (u64));
+ ppc_spe_sha256_transform(sctx->state, sctx->buf, 1);
+ p = (char *)sctx->buf;
+ padlen = 56;
+ }
+
+ memset(p, 0, padlen);
+ *pbits = cpu_to_be64(sctx->count << 3);
+ ppc_spe_sha256_transform(sctx->state, sctx->buf, 1);
+
+ spe_end();
+
+ dst[0] = cpu_to_be32(sctx->state[0]);
+ dst[1] = cpu_to_be32(sctx->state[1]);
+ dst[2] = cpu_to_be32(sctx->state[2]);
+ dst[3] = cpu_to_be32(sctx->state[3]);
+ dst[4] = cpu_to_be32(sctx->state[4]);
+ dst[5] = cpu_to_be32(sctx->state[5]);
+ dst[6] = cpu_to_be32(sctx->state[6]);
+ dst[7] = cpu_to_be32(sctx->state[7]);
+
+ ppc_sha256_clear_context(sctx);
+ return 0;
+}
+
+static int ppc_spe_sha224_final(struct shash_desc *desc, u8 *out)
+{
+ u32 D[SHA256_DIGEST_SIZE >> 2];
+ __be32 *dst = (__be32 *)out;
+
+ ppc_spe_sha256_final(desc, (u8 *)D);
+
+ /* avoid bytewise memcpy */
+ dst[0] = D[0];
+ dst[1] = D[1];
+ dst[2] = D[2];
+ dst[3] = D[3];
+ dst[4] = D[4];
+ dst[5] = D[5];
+ dst[6] = D[6];
+
+ /* clear sensitive data */
+ memzero_explicit(D, SHA256_DIGEST_SIZE);
+ return 0;
+}
+
+static int ppc_spe_sha256_export(struct shash_desc *desc, void *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+ return 0;
+}
+
+static int ppc_spe_sha256_import(struct shash_desc *desc, const void *in)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+ return 0;
+}
+
+static struct shash_alg algs[2] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = ppc_spe_sha256_init,
+ .update = ppc_spe_sha256_update,
+ .final = ppc_spe_sha256_final,
+ .export = ppc_spe_sha256_export,
+ .import = ppc_spe_sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name= "sha256-ppc-spe",
+ .cra_priority = 300,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = ppc_spe_sha224_init,
+ .update = ppc_spe_sha256_update,
+ .final = ppc_spe_sha224_final,
+ .export = ppc_spe_sha256_export,
+ .import = ppc_spe_sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name= "sha224-ppc-spe",
+ .cra_priority = 300,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int __init ppc_spe_sha256_mod_init(void)
+{
+ return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit ppc_spe_sha256_mod_fini(void)
+{
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_init(ppc_spe_sha256_mod_init);
+module_exit(ppc_spe_sha256_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm, SPE optimized");
+
+MODULE_ALIAS_CRYPTO("sha224");
+MODULE_ALIAS_CRYPTO("sha224-ppc-spe");
+MODULE_ALIAS_CRYPTO("sha256");
+MODULE_ALIAS_CRYPTO("sha256-ppc-spe");