Diffstat (limited to 'comm/third_party/libgcrypt/cipher/rijndael-s390x.c')
-rw-r--r-- | comm/third_party/libgcrypt/cipher/rijndael-s390x.c | 1155
1 files changed, 1155 insertions, 0 deletions
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-s390x.c b/comm/third_party/libgcrypt/cipher/rijndael-s390x.c
new file mode 100644
index 0000000000..aea65c5a3d
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-s390x.c
@@ -0,0 +1,1155 @@
+/* Rijndael (AES) for GnuPG - s390x/zSeries AES implementation
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "rijndael-internal.h"
+#include "cipher-internal.h"
+#include "bufhelp.h"
+
+#ifdef USE_S390X_CRYPTO
+
+#include "asm-inline-s390x.h"
+
+#define NO_INLINE __attribute__((noinline))
+
+struct aes_s390x_gcm_params_s
+{
+  u32 reserved[3];
+  u32 counter_value;
+  u64 tag[2];
+  u64 hash_subkey[2];
+  u64 total_aad_length;
+  u64 total_cipher_length;
+  u32 initial_counter_value[4];
+  u64 key[4];
+};
+
+#define DECL_QUERY_FUNC(instruction, opcode) \
+  static u128_t instruction ##_query(void) \
+  { \
+    static u128_t function_codes = 0; \
+    static int initialized = 0; \
+    register unsigned long reg0 asm("0") = 0; \
+    register void *reg1 asm("1") = &function_codes; \
+    u128_t r1, r2; \
+    \
+    if (initialized) \
+      return function_codes; \
+    \
+    asm volatile ("0: .insn rre," #opcode " << 16, %[r1], %[r2]\n\t" \
+                  "   brc 1,0b\n\t" \
+                  : [r1] "=a" (r1), [r2] "=a" (r2) \
+                  : [reg0] "r" (reg0), [reg1] "r" (reg1) \
+                  : "cc", "memory"); \
+    \
+    initialized = 1; \
+    return function_codes; \
+  }
+
+#define DECL_EXECUTE_FUNC(instruction, opcode, param_const) \
+  static ALWAYS_INLINE size_t \
+  instruction ##_execute(unsigned int func, param_const void *param_block, \
+                         void *dst, const void *src, size_t src_len) \
+  { \
+    register unsigned long reg0 asm("0") = func; \
+    register param_const byte *reg1 asm("1") = param_block; \
+    u128_t r1 = ((u128_t)(uintptr_t)dst << 64); \
+    u128_t r2 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len; \
+    \
+    asm volatile ("0: .insn rre," #opcode " << 16, %[r1], %[r2]\n\t" \
+                  "   brc 1,0b\n\t" \
+                  : [r1] "+a" (r1), [r2] "+a" (r2) \
+                  : [func] "r" (reg0), [param_ptr] "r" (reg1) \
+                  : "cc", "memory"); \
+    \
+    return (u64)r2; \
+  }
+
+DECL_QUERY_FUNC(km, 0xb92e);
+DECL_QUERY_FUNC(kmc, 0xb92f);
+DECL_QUERY_FUNC(kmac, 0xb91e);
+DECL_QUERY_FUNC(kmf, 0xb92a);
+DECL_QUERY_FUNC(kmo, 0xb92b);
+
+DECL_EXECUTE_FUNC(km, 0xb92e, const);
+DECL_EXECUTE_FUNC(kmc, 0xb92f, );
+DECL_EXECUTE_FUNC(kmac, 0xb91e, );
+DECL_EXECUTE_FUNC(kmf, 0xb92a, );
+DECL_EXECUTE_FUNC(kmo, 0xb92b, );
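[Each *_query() helper above returns the CPACF 128-bit status word, in which bit N, counting from the most significant bit, reports availability of function code N. A minimal sketch of how such a bit is tested; example_function_to_mask() is a hypothetical stand-in for km_function_to_mask() from asm-inline-s390x.h:

  #include <stdint.h>

  typedef unsigned __int128 u128_t;

  /* Hypothetical mirror of km_function_to_mask(): function code 0 maps
   * to the most significant bit of the 128-bit status word. */
  static inline u128_t example_function_to_mask (unsigned int func)
  {
    return (u128_t)1 << (127 - func);
  }

  /* Usage sketch: test a function-code bit in the query result, e.g.
   *   if (km_query () & example_function_to_mask (KM_FUNCTION_AES_128))
   *     ... KM-AES-128 is available ...
   */
]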
[reg0] "r" (reg0), [reg1] "r" (reg1) + : "cc", "memory"); + + initialized = 1; + return function_codes; +} + +static ALWAYS_INLINE void +kma_execute(unsigned int func, void *param_block, byte *dst, const byte *src, + size_t src_len, const byte *aad, size_t aad_len) +{ + register unsigned long reg0 asm("0") = func; + register byte *reg1 asm("1") = param_block; + u128_t r1 = ((u128_t)(uintptr_t)dst << 64); + u128_t r2 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len; + u128_t r3 = ((u128_t)(uintptr_t)aad << 64) | (u64)aad_len; + + asm volatile ("0: .insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 0\n\t" + " brc 1,0b\n\t" + : [r1] "+a" (r1), [r2] "+a" (r2), [r3] "+a" (r3), + [func] "+r" (reg0) + : [param_ptr] "r" (reg1) + : "cc", "memory"); +} + +unsigned int _gcry_aes_s390x_encrypt(const RIJNDAEL_context *ctx, + unsigned char *dst, + const unsigned char *src) +{ + km_execute (ctx->km_func | KM_ENCRYPT, ctx->keyschenc, dst, src, + BLOCKSIZE); + return 0; +} + +unsigned int _gcry_aes_s390x_decrypt(const RIJNDAEL_context *ctx, + unsigned char *dst, + const unsigned char *src) +{ + km_execute (ctx->km_func | KM_DECRYPT, ctx->keyschenc, dst, src, + BLOCKSIZE); + return 0; +} + +static void aes_s390x_cbc_enc(void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int cbc_mac) +{ + RIJNDAEL_context *ctx = context; + byte *out = outbuf_arg; + const byte *in = inbuf_arg; + u128_t params[3]; + + /* Prepare parameter block. */ + memcpy (¶ms[0], iv, BLOCKSIZE); + memcpy (¶ms[1], ctx->keyschenc, 32); + + if (cbc_mac) + { + kmac_execute (ctx->kmac_func | KM_ENCRYPT, ¶ms, NULL, in, + nblocks * BLOCKSIZE); + memcpy (out, ¶ms[0], BLOCKSIZE); + } + else + { + kmc_execute (ctx->kmc_func | KM_ENCRYPT, ¶ms, out, in, + nblocks * BLOCKSIZE); + } + + /* Update IV with OCV. */ + memcpy (iv, ¶ms[0], BLOCKSIZE); + + wipememory (¶ms, sizeof(params)); +} + +static void aes_s390x_cbc_dec(void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + byte *out = outbuf_arg; + const byte *in = inbuf_arg; + u128_t params[3]; + + /* Prepare parameter block (ICV & key). */ + memcpy (¶ms[0], iv, BLOCKSIZE); + memcpy (¶ms[1], ctx->keyschenc, 32); + + kmc_execute (ctx->kmc_func | KM_DECRYPT, ¶ms, out, in, + nblocks * BLOCKSIZE); + + /* Update IV with OCV. */ + memcpy (iv, ¶ms[0], BLOCKSIZE); + + wipememory (¶ms, sizeof(params)); +} + +static void aes_s390x_cfb128_enc(void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + byte *out = outbuf_arg; + const byte *in = inbuf_arg; + unsigned int function; + u128_t params[3]; + + /* Prepare parameter block. */ + memcpy (¶ms[0], iv, BLOCKSIZE); + memcpy (¶ms[1], ctx->keyschenc, 32); + + function = ctx->kmf_func | KM_ENCRYPT | KMF_LCFB_16; + kmf_execute (function, ¶ms, out, in, nblocks * BLOCKSIZE); + + /* Update IV with OCV. */ + memcpy (iv, ¶ms[0], BLOCKSIZE); + + wipememory (¶ms, sizeof(params)); +} + +static void aes_s390x_cfb128_dec(void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + u128_t blocks[64]; + byte *out = outbuf_arg; + const byte *in = inbuf_arg; + size_t max_blocks_used = 0; + + /* AES128-CFB128 decryption speed using KMF was observed to be the same as + * the KMF encryption, ~1.03 cpb. 
+
+static void aes_s390x_cfb128_dec(void *context, unsigned char *iv,
+                                 void *outbuf_arg, const void *inbuf_arg,
+                                 size_t nblocks)
+{
+  RIJNDAEL_context *ctx = context;
+  u128_t blocks[64];
+  byte *out = outbuf_arg;
+  const byte *in = inbuf_arg;
+  size_t max_blocks_used = 0;
+
+  /* AES128-CFB128 decryption speed using KMF was observed to be the same as
+   * KMF encryption, ~1.03 cpb. The expectation was to see performance
+   * similar to AES128-CBC decryption, as decryption in both modes should be
+   * parallelizable (CBC shows ~0.22 cpb). There is therefore quite a bit
+   * of room for improvement, and the implementation below using the KM
+   * instruction shows ~0.70 cpb, a ~30% improvement over the KMF
+   * instruction. */
+
+  while (nblocks >= 64)
+    {
+      /* Copy IV to encrypt buffer, copy (nblocks - 1) input blocks to
+       * encrypt buffer and update IV. */
+      asm volatile ("mvc 0(16, %[blocks]), 0(%[iv])\n\t"
+                    "mvc 16(240, %[blocks]), 0(%[in])\n\t"
+                    "mvc 256(256, %[blocks]), 240(%[in])\n\t"
+                    "mvc 512(256, %[blocks]), 496(%[in])\n\t"
+                    "mvc 768(256, %[blocks]), 752(%[in])\n\t"
+                    "mvc 0(16, %[iv]), 1008(%[in])\n\t"
+                    :
+                    : [in] "a" (in), [out] "a" (out), [blocks] "a" (blocks),
+                      [iv] "a" (iv)
+                    : "memory");
+
+      /* Perform encryption of temporary buffer. */
+      km_execute (ctx->km_func | KM_ENCRYPT, ctx->keyschenc, blocks, blocks,
+                  64 * BLOCKSIZE);
+
+      /* Xor encrypt buffer with input blocks and store to output blocks. */
+      asm volatile ("xc 0(256, %[blocks]), 0(%[in])\n\t"
+                    "xc 256(256, %[blocks]), 256(%[in])\n\t"
+                    "xc 512(256, %[blocks]), 512(%[in])\n\t"
+                    "xc 768(256, %[blocks]), 768(%[in])\n\t"
+                    "mvc 0(256, %[out]), 0(%[blocks])\n\t"
+                    "mvc 256(256, %[out]), 256(%[blocks])\n\t"
+                    "mvc 512(256, %[out]), 512(%[blocks])\n\t"
+                    "mvc 768(256, %[out]), 768(%[blocks])\n\t"
+                    :
+                    : [in] "a" (in), [out] "a" (out), [blocks] "a" (blocks)
+                    : "memory");
+
+      max_blocks_used = 64;
+      in += 64 * BLOCKSIZE;
+      out += 64 * BLOCKSIZE;
+      nblocks -= 64;
+    }
+
+  if (nblocks)
+    {
+      unsigned int pos = 0;
+      size_t in_nblocks = nblocks;
+      size_t num_in = 0;
+
+      max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used;
+
+      /* Copy IV to encrypt buffer. */
+      asm volatile ("mvc 0(16, %[blocks]), 0(%[iv])\n\t"
+                    :
+                    : [blocks] "a" (blocks), [iv] "a" (iv)
+                    : "memory");
+      pos += 1;
+
+#define CFB_MOVE_BLOCKS(block_oper, move_nbytes) \
+      block_oper (in_nblocks - 1 >= move_nbytes / BLOCKSIZE) \
+        { \
+          unsigned int move_nblocks = move_nbytes / BLOCKSIZE; \
+          asm volatile ("mvc 0(" #move_nbytes ", %[blocks_x]), 0(%[in])\n\t" \
+                        : \
+                        : [blocks_x] "a" (&blocks[pos]), [in] "a" (in) \
+                        : "memory"); \
+          num_in += move_nblocks; \
+          in += move_nblocks * BLOCKSIZE; \
+          pos += move_nblocks; \
+          in_nblocks -= move_nblocks; \
+        }
+
+      /* Copy (nblocks - 1) input blocks to encrypt buffer. */
+      CFB_MOVE_BLOCKS(while, 256);
+      CFB_MOVE_BLOCKS(if, 128);
+      CFB_MOVE_BLOCKS(if, 64);
+      CFB_MOVE_BLOCKS(if, 32);
+      CFB_MOVE_BLOCKS(if, 16);
+
+#undef CFB_MOVE_BLOCKS
+
+      /* Update IV. */
+      asm volatile ("mvc 0(16, %[iv]), 0(%[in])\n\t"
+                    :
+                    : [iv] "a" (iv), [in] "a" (in)
+                    : "memory");
+      num_in += 1;
+      in += BLOCKSIZE;
+
+      /* Perform encryption of temporary buffer. */
+      km_execute (ctx->km_func | KM_ENCRYPT, ctx->keyschenc, blocks, blocks,
+                  nblocks * BLOCKSIZE);
+
+      /* Xor encrypt buffer with input blocks and store to output blocks. */
+      pos = 0;
+      in -= nblocks * BLOCKSIZE;
+
+#define CFB_XOR_BLOCKS(block_oper, xor_nbytes) \
+      block_oper (nblocks >= xor_nbytes / BLOCKSIZE) \
+        { \
+          unsigned int xor_nblocks = xor_nbytes / BLOCKSIZE; \
+          asm volatile ("xc 0(" #xor_nbytes ", %[blocks_x]), 0(%[in])\n\t" \
+                        "mvc 0(" #xor_nbytes ", %[out]), 0(%[blocks_x])\n\t" \
+                        : \
+                        : [blocks_x] "a" (&blocks[pos]), [out] "a" (out), \
+                          [in] "a" (in) \
+                        : "memory"); \
+          out += xor_nblocks * BLOCKSIZE; \
+          in += xor_nblocks * BLOCKSIZE; \
+          nblocks -= xor_nblocks; \
+          pos += xor_nblocks; \
+        }
+
+      CFB_XOR_BLOCKS(while, 256);
+      CFB_XOR_BLOCKS(if, 128);
+      CFB_XOR_BLOCKS(if, 64);
+      CFB_XOR_BLOCKS(if, 32);
+      CFB_XOR_BLOCKS(if, 16);
+
+#undef CFB_XOR_BLOCKS
+    }
+
+  if (max_blocks_used)
+    wipememory (&blocks, max_blocks_used * BLOCKSIZE);
+}
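[For reference, the KM-based path above computes ordinary CFB128 decryption, P_i = E_K(C_{i-1}) xor C_i with C_0 = IV; it merely batches all the E_K() calls into one KM invocation over a shifted copy of the ciphertext. A scalar reference model, with the block cipher abstracted behind a hypothetical callback:

  #include <stddef.h>
  #include <string.h>

  #define EX_BLOCKSIZE 16

  /* Hypothetical single-block encryption callback, for illustration. */
  typedef void (*example_blockcipher_fn) (const void *key,
                                          unsigned char *dst,
                                          const unsigned char *src);

  /* Reference CFB128 decryption: P_i = E_K(C_{i-1}) xor C_i. */
  static void example_cfb128_dec (example_blockcipher_fn enc_fn,
                                  const void *key, unsigned char *iv,
                                  unsigned char *out,
                                  const unsigned char *in, size_t nblocks)
  {
    unsigned char ks[EX_BLOCKSIZE];
    size_t j;

    while (nblocks--)
      {
        enc_fn (key, ks, iv);            /* E_K(C_{i-1}) */
        memcpy (iv, in, EX_BLOCKSIZE);   /* next chaining value is C_i */
        for (j = 0; j < EX_BLOCKSIZE; j++)
          out[j] = ks[j] ^ in[j];        /* P_i */
        in += EX_BLOCKSIZE;
        out += EX_BLOCKSIZE;
      }
  }
]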
+
+static void aes_s390x_ofb_enc(void *context, unsigned char *iv,
+                              void *outbuf_arg, const void *inbuf_arg,
+                              size_t nblocks)
+{
+  RIJNDAEL_context *ctx = context;
+  byte *out = outbuf_arg;
+  const byte *in = inbuf_arg;
+  unsigned int function;
+  u128_t params[3];
+
+  /* Prepare parameter block. */
+  memcpy (&params[0], iv, BLOCKSIZE);
+  memcpy (&params[1], ctx->keyschenc, 32);
+
+  function = ctx->kmo_func | KM_ENCRYPT;
+  kmo_execute (function, &params, out, in, nblocks * BLOCKSIZE);
+
+  /* Update IV with OCV. */
+  memcpy (iv, &params[0], BLOCKSIZE);
+
+  wipememory (&params, sizeof(params));
+}
+
+static void aes_s390x_ctr128_enc(void *context, unsigned char *ctr,
+                                 void *outbuf_arg, const void *inbuf_arg,
+                                 size_t nblocks)
+{
+  RIJNDAEL_context *ctx = context;
+  byte *out = outbuf_arg;
+  const byte *in = inbuf_arg;
+  unsigned int function;
+  struct aes_s390x_gcm_params_s params;
+
+  memset (&params.hash_subkey, 0, sizeof(params.hash_subkey));
+  memcpy (&params.key, ctx->keyschenc, 32);
+
+  function = ctx->kma_func | KM_DECRYPT | KMA_HS | KMA_LAAD;
+
+  while (nblocks)
+    {
+      u64 to_overflow = (u64)0xFFFFFFFFU + 1 - buf_get_be32 (ctr + 12);
+      u64 ncurr = nblocks > to_overflow ? to_overflow : nblocks;
+
+      /* Prepare parameter block. */
+      memset (&params.reserved, 0, sizeof(params.reserved));
+      buf_put_be32 (&params.counter_value, buf_get_be32(ctr + 12) - 1);
+      memcpy (&params.initial_counter_value, ctr, 16);
+      params.initial_counter_value[3] = params.counter_value;
+      memset (&params.tag, 0, sizeof(params.tag));
+      params.total_aad_length = 0;
+      params.total_cipher_length = 0;
+
+      /* Update counter. */
+      cipher_block_add (ctr, ncurr, BLOCKSIZE);
+      if (ncurr == (u64)0xFFFFFFFFU + 1)
+        cipher_block_add (ctr, 1, BLOCKSIZE);
+
+      /* Perform CTR using KMA-GCM. */
+      kma_execute (function, &params, out, in, ncurr * BLOCKSIZE, NULL, 0);
+
+      out += ncurr * BLOCKSIZE;
+      in += ncurr * BLOCKSIZE;
+      nblocks -= ncurr;
+    }
+
+  wipememory (&params, sizeof(params));
+}
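[KMA only increments the low 32 bits of the counter, so aes_s390x_ctr128_enc above splits the request at each 32-bit wrap and performs the carry into the upper 96 bits itself via cipher_block_add(). A standalone worked example of the chunking arithmetic:

  #include <stdint.h>
  #include <stdio.h>

  int main (void)
  {
    uint64_t ctr32 = 0xFFFFFFFEu; /* low 32 bits of the counter block */
    uint64_t nblocks = 5;

    while (nblocks)
      {
        uint64_t to_overflow = (uint64_t)0xFFFFFFFFu + 1 - ctr32;
        uint64_t ncurr = nblocks > to_overflow ? to_overflow : nblocks;

        printf ("process %llu block(s)\n", (unsigned long long)ncurr);

        ctr32 = (ctr32 + ncurr) & 0xFFFFFFFFu; /* carry handled separately */
        nblocks -= ncurr;
      }
    /* Prints "process 2 block(s)" then "process 3 block(s)". */
    return 0;
  }
]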
+
+static size_t aes_s390x_gcm_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
+                                  const void *inbuf_arg, size_t nblocks,
+                                  int encrypt)
+{
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  byte *out = outbuf_arg;
+  const byte *in = inbuf_arg;
+  byte *ctr = c->u_ctr.ctr;
+  unsigned int function;
+  struct aes_s390x_gcm_params_s params;
+
+  function = ctx->kma_func | (encrypt ? KM_ENCRYPT : KM_DECRYPT)
+              | KMA_HS | KMA_LAAD;
+
+  /* Prepare parameter block. */
+  memset (&params.reserved, 0, sizeof(params.reserved));
+  buf_put_be32 (&params.counter_value, buf_get_be32(ctr + 12) - 1);
+  memcpy (&params.tag, c->u_mode.gcm.u_tag.tag, 16);
+  memcpy (&params.hash_subkey, c->u_mode.gcm.u_ghash_key.key, 16);
+  params.total_aad_length = 0;
+  params.total_cipher_length = 0;
+  memcpy (&params.initial_counter_value, ctr, 12);
+  params.initial_counter_value[3] = params.counter_value;
+  memcpy (&params.key, ctx->keyschenc, 32);
+
+  /* Update counter (CTR32). */
+  buf_put_be32(ctr + 12, buf_get_be32(ctr + 12) + nblocks);
+
+  /* Perform KMA-GCM. */
+  kma_execute (function, &params, out, in, nblocks * BLOCKSIZE, NULL, 0);
+
+  /* Update tag. */
+  memcpy (c->u_mode.gcm.u_tag.tag, &params.tag, 16);
+
+  wipememory (&params, sizeof(params));
+
+  return 0;
+}
+
+static void aes_s390x_xts_crypt(void *context, unsigned char *tweak,
+                                void *outbuf_arg, const void *inbuf_arg,
+                                size_t nblocks, int encrypt)
+{
+  RIJNDAEL_context *ctx = context;
+  byte *out = outbuf_arg;
+  const byte *in = inbuf_arg;
+  unsigned int function;
+  u128_t params[3];
+  u128_t *params_tweak;
+
+  if (ctx->rounds < 12)
+    {
+      memcpy (&params[0], ctx->keyschenc, 16);
+      params_tweak = &params[1];
+      memcpy (params_tweak, tweak, BLOCKSIZE);
+    }
+  else if (ctx->rounds == 12)
+    {
+      BUG(); /* KM-XTS-AES-192 not defined. */
+    }
+  else
+    {
+      memcpy (&params[0], ctx->keyschenc, 32);
+      params_tweak = &params[2];
+      memcpy (params_tweak, tweak, BLOCKSIZE);
+    }
+
+  function = ctx->km_func_xts | (encrypt ? KM_ENCRYPT : KM_DECRYPT);
+  km_execute (function, &params, out, in, nblocks * BLOCKSIZE);
+
+  /* Update tweak with XTSP. */
+  memcpy (tweak, params_tweak, BLOCKSIZE);
+
+  wipememory (&params, sizeof(params));
+}
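[KM-XTS takes the key followed by the current tweak in its parameter block, which is why the tweak lands at params[1] for AES-128 but params[2] for AES-256 above. The implied layouts, as illustrative structs inferred from the code rather than copied from a header:

  /* Implied KM-XTS parameter block layouts (sketch): */
  struct example_xts_aes128_param_block
  {
    unsigned char key[16];
    unsigned char tweak[16]; /* XTSP: next tweak is written back here */
  };

  struct example_xts_aes256_param_block
  {
    unsigned char key[32];
    unsigned char tweak[16];
  };
]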
+
+static NO_INLINE void
+aes_s390x_ocb_prepare_Ls (gcry_cipher_hd_t c, u64 blkn, const void *Ls[64],
+                          const void ***pl)
+{
+  unsigned int n = 64 - (blkn % 64);
+  int i;
+
+  /* Prepare L pointers. */
+  *pl = &Ls[(63 + n) % 64];
+  for (i = 0; i < 64; i += 8, n = (n + 8) % 64)
+    {
+      static const int lastL[8] = { 3, 4, 3, 5, 3, 4, 3, 0 };
+
+      Ls[(0 + n) % 64] = c->u_mode.ocb.L[0];
+      Ls[(1 + n) % 64] = c->u_mode.ocb.L[1];
+      Ls[(2 + n) % 64] = c->u_mode.ocb.L[0];
+      Ls[(3 + n) % 64] = c->u_mode.ocb.L[2];
+      Ls[(4 + n) % 64] = c->u_mode.ocb.L[0];
+      Ls[(5 + n) % 64] = c->u_mode.ocb.L[1];
+      Ls[(6 + n) % 64] = c->u_mode.ocb.L[0];
+      Ls[(7 + n) % 64] = c->u_mode.ocb.L[lastL[i / 8]];
+    }
+}
+
+static NO_INLINE void
+aes_s390x_ocb_checksum (unsigned char *checksum, const void *plainbuf_arg,
+                        size_t nblks)
+{
+  const char *plainbuf = plainbuf_arg;
+  u64 tmp0[2];
+  u64 tmp1[2] = { 0, 0 };
+  u64 tmp2[2] = { 0, 0 };
+  u64 tmp3[2] = { 0, 0 };
+
+  cipher_block_cpy (tmp0, checksum, BLOCKSIZE);
+
+  if (nblks >= 4)
+    {
+      while (nblks >= 4)
+        {
+          /* Checksum_i = Checksum_{i-1} xor P_i  */
+          cipher_block_xor_1 (tmp0, plainbuf + 0 * BLOCKSIZE, BLOCKSIZE);
+          cipher_block_xor_1 (tmp1, plainbuf + 1 * BLOCKSIZE, BLOCKSIZE);
+          cipher_block_xor_1 (tmp2, plainbuf + 2 * BLOCKSIZE, BLOCKSIZE);
+          cipher_block_xor_1 (tmp3, plainbuf + 3 * BLOCKSIZE, BLOCKSIZE);
+
+          plainbuf += 4 * BLOCKSIZE;
+          nblks -= 4;
+        }
+
+      cipher_block_xor_1 (tmp0, tmp1, BLOCKSIZE);
+      cipher_block_xor_1 (tmp2, tmp3, BLOCKSIZE);
+      cipher_block_xor_1 (tmp0, tmp2, BLOCKSIZE);
+
+      wipememory (tmp1, sizeof(tmp1));
+      wipememory (tmp2, sizeof(tmp2));
+      wipememory (tmp3, sizeof(tmp3));
+    }
+
+  while (nblks > 0)
+    {
+      /* Checksum_i = Checksum_{i-1} xor P_i  */
+      cipher_block_xor_1 (tmp0, plainbuf, BLOCKSIZE);
+
+      plainbuf += BLOCKSIZE;
+      nblks--;
+    }
+
+  cipher_block_cpy (checksum, tmp0, BLOCKSIZE);
+
+  wipememory (tmp0, sizeof(tmp0));
+}
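[OCB advances its offset per block as Offset_i = Offset_{i-1} xor L_{ntz(i)}, where ntz(i) is the number of trailing zero bits of the block index. Over any aligned window of 64 blocks the ntz sequence is fixed, which is what lets aes_s390x_ocb_prepare_Ls() fill the whole 64-entry pointer table up front: within each 8-entry stride the L indices run 0,1,0,2,0,1,0,x with x taken from lastL[], and the one slot needing ntz >= 6 is patched at run time through *pl. A small sketch verifying the pattern:

  #include <stdio.h>

  /* ntz(i): number of trailing zero bits; selects OCB's L value. */
  static int example_ntz (unsigned long long i)
  {
    int n = 0;
    while ((i & 1) == 0)
      {
        n++;
        i >>= 1;
      }
    return n;
  }

  int main (void)
  {
    unsigned long long i;
    /* Prints "0 1 0 2 0 1 0 3 0 1 0 2 0 1 0 4": the repeating pattern
     * that the table-filling loop above hard-codes. */
    for (i = 1; i <= 16; i++)
      printf ("%d ", example_ntz (i));
    printf ("\n");
    return 0;
  }
]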
+
+static NO_INLINE size_t
+aes_s390x_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
+                   const void *inbuf_arg, size_t nblocks_arg)
+{
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  size_t nblocks = nblocks_arg;
+  u128_t blocks[64];
+  u128_t offset;
+  size_t max_blocks_used = 0;
+  u64 blkn = c->u_mode.ocb.data_nblocks;
+  unsigned int function = ctx->km_func | KM_ENCRYPT;
+  const void *Ls[64];
+  const void **pl;
+
+  aes_s390x_ocb_prepare_Ls (c, blkn, Ls, &pl);
+
+  /* Checksumming could be done inline in the OCB_INPUT macros, but register
+   * pressure becomes too heavy and performance would end up being worse.
+   * For decryption, checksumming is part of the OCB_OUTPUT macros, as
+   * output handling is less demanding and can handle the additional
+   * computation. */
+  aes_s390x_ocb_checksum (c->u_ctr.ctr, inbuf_arg, nblocks_arg);
+
+  cipher_block_cpy (&offset, &c->u_iv.iv, BLOCKSIZE);
+
+#define OCB_INPUT(n) \
+  cipher_block_xor_2dst (&blocks[n], &offset, Ls[n], BLOCKSIZE); \
+  cipher_block_xor (outbuf + (n) * BLOCKSIZE, inbuf + (n) * BLOCKSIZE, \
+                    &offset, BLOCKSIZE)
+
+#define OCB_INPUT_4(n) \
+  OCB_INPUT((n) + 0); OCB_INPUT((n) + 1); OCB_INPUT((n) + 2); \
+  OCB_INPUT((n) + 3)
+
+#define OCB_INPUT_16(n) \
+  OCB_INPUT_4((n) + 0); OCB_INPUT_4((n) + 4); OCB_INPUT_4((n) + 8); \
+  OCB_INPUT_4((n) + 12);
+
+#define OCB_OUTPUT(n) \
+  cipher_block_xor_1 (outbuf + (n) * BLOCKSIZE, &blocks[n], BLOCKSIZE)
+
+#define OCB_OUTPUT_4(n) \
+  OCB_OUTPUT((n) + 0); OCB_OUTPUT((n) + 1); OCB_OUTPUT((n) + 2); \
+  OCB_OUTPUT((n) + 3)
+
+#define OCB_OUTPUT_16(n) \
+  OCB_OUTPUT_4((n) + 0); OCB_OUTPUT_4((n) + 4); OCB_OUTPUT_4((n) + 8); \
+  OCB_OUTPUT_4((n) + 12);
+
+  while (nblocks >= 64)
+    {
+      blkn += 64;
+      *pl = ocb_get_l(c, blkn - blkn % 64);
+
+      OCB_INPUT_16(0);
+      OCB_INPUT_16(16);
+      OCB_INPUT_16(32);
+      OCB_INPUT_16(48);
+
+      km_execute (function, ctx->keyschenc, outbuf, outbuf, 64 * BLOCKSIZE);
+
+      asm volatile ("xc 0(256, %[out]), 0(%[blocks])\n\t"
+                    "xc 256(256, %[out]), 256(%[blocks])\n\t"
+                    "xc 512(256, %[out]), 512(%[blocks])\n\t"
+                    "xc 768(256, %[out]), 768(%[blocks])\n\t"
+                    :
+                    : [out] "a" (outbuf), [blocks] "a" (blocks)
+                    : "memory");
+
+      max_blocks_used = 64;
+      inbuf += 64 * BLOCKSIZE;
+      outbuf += 64 * BLOCKSIZE;
+      nblocks -= 64;
+    }
+
+  if (nblocks)
+    {
+      unsigned int pos = 0;
+
+      max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used;
+
+      blkn += nblocks;
+      *pl = ocb_get_l(c, blkn - blkn % 64);
+
+      while (nblocks >= 16)
+        {
+          OCB_INPUT_16(pos + 0);
+          pos += 16;
+          nblocks -= 16;
+        }
+      while (nblocks >= 4)
+        {
+          OCB_INPUT_4(pos + 0);
+          pos += 4;
+          nblocks -= 4;
+        }
+      if (nblocks >= 2)
+        {
+          OCB_INPUT(pos + 0);
+          OCB_INPUT(pos + 1);
+          pos += 2;
+          nblocks -= 2;
+        }
+      if (nblocks >= 1)
+        {
+          OCB_INPUT(pos + 0);
+          pos += 1;
+          nblocks -= 1;
+        }
+
+      nblocks = pos;
+      pos = 0;
+      km_execute (function, ctx->keyschenc, outbuf, outbuf,
+                  nblocks * BLOCKSIZE);
+
+      while (nblocks >= 16)
+        {
+          OCB_OUTPUT_16(pos + 0);
+          pos += 16;
+          nblocks -= 16;
+        }
+      while (nblocks >= 4)
+        {
+          OCB_OUTPUT_4(pos + 0);
+          pos += 4;
+          nblocks -= 4;
+        }
+      if (nblocks >= 2)
+        {
+          OCB_OUTPUT(pos + 0);
+          OCB_OUTPUT(pos + 1);
+          pos += 2;
+          nblocks -= 2;
+        }
+      if (nblocks >= 1)
+        {
+          OCB_OUTPUT(pos + 0);
+          pos += 1;
+          nblocks -= 1;
+        }
+    }
+
+#undef OCB_INPUT
+#undef OCB_INPUT_4
+#undef OCB_INPUT_16
+#undef OCB_OUTPUT
+#undef OCB_OUTPUT_4
+#undef OCB_OUTPUT_16
+
+  c->u_mode.ocb.data_nblocks = blkn;
+  cipher_block_cpy (&c->u_iv.iv, &offset, BLOCKSIZE);
+
+  if (max_blocks_used)
+    wipememory (&blocks, max_blocks_used * BLOCKSIZE);
+
+  return 0;
+}
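[Each batch in aes_s390x_ocb_enc implements the OCB core C_i = Offset_i xor E_K(P_i xor Offset_i): OCB_INPUT leaves Offset_i in blocks[] and P_i xor Offset_i in outbuf, KM then encrypts outbuf in place, and the xc sweep (or OCB_OUTPUT for the tail) folds the saved offsets back in. A single-block reference step, again with a hypothetical callback standing in for the cipher:

  /* Reference single-block OCB encryption step (sketch). */
  typedef void (*example_cipher_fn) (const void *key, unsigned char *dst,
                                     const unsigned char *src);

  static void example_ocb_enc_block (example_cipher_fn enc_fn,
                                     const void *key,
                                     unsigned char offset[16],
                                     const unsigned char l_ntz[16],
                                     unsigned char out[16],
                                     const unsigned char in[16])
  {
    unsigned char tmp[16];
    int i;

    for (i = 0; i < 16; i++)
      offset[i] ^= l_ntz[i];       /* Offset_i = Offset_{i-1} ^ L_ntz(i) */
    for (i = 0; i < 16; i++)
      tmp[i] = in[i] ^ offset[i];  /* P_i ^ Offset_i */
    enc_fn (key, out, tmp);        /* E_K(P_i ^ Offset_i) */
    for (i = 0; i < 16; i++)
      out[i] ^= offset[i];         /* C_i */
  }
]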
+
+static NO_INLINE size_t
+aes_s390x_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
+                   const void *inbuf_arg, size_t nblocks_arg)
+{
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  size_t nblocks = nblocks_arg;
+  u128_t blocks[64];
+  u128_t offset;
+  size_t max_blocks_used = 0;
+  u64 blkn = c->u_mode.ocb.data_nblocks;
+  unsigned int function = ctx->km_func | KM_DECRYPT;
+  const void *Ls[64];
+  const void **pl;
+
+  aes_s390x_ocb_prepare_Ls (c, blkn, Ls, &pl);
+
+  cipher_block_cpy (&offset, &c->u_iv.iv, BLOCKSIZE);
+
+#define OCB_INPUT(n) \
+  cipher_block_xor_2dst (&blocks[n], &offset, Ls[n], BLOCKSIZE); \
+  cipher_block_xor (outbuf + (n) * BLOCKSIZE, inbuf + (n) * BLOCKSIZE, \
+                    &offset, BLOCKSIZE)
+
+#define OCB_INPUT_4(n) \
+  OCB_INPUT((n) + 0); OCB_INPUT((n) + 1); OCB_INPUT((n) + 2); \
+  OCB_INPUT((n) + 3)
+
+#define OCB_INPUT_16(n) \
+  OCB_INPUT_4((n) + 0); OCB_INPUT_4((n) + 4); OCB_INPUT_4((n) + 8); \
+  OCB_INPUT_4((n) + 12);
+
+#define OCB_OUTPUT(n) \
+  cipher_block_xor_1 (&blocks[n], outbuf + (n) * BLOCKSIZE, BLOCKSIZE); \
+  cipher_block_xor_1 (c->u_ctr.ctr, &blocks[n], BLOCKSIZE); \
+  cipher_block_cpy (outbuf + (n) * BLOCKSIZE, &blocks[n], BLOCKSIZE);
+
+#define OCB_OUTPUT_4(n) \
+  OCB_OUTPUT((n) + 0); OCB_OUTPUT((n) + 1); OCB_OUTPUT((n) + 2); \
+  OCB_OUTPUT((n) + 3)
+
+#define OCB_OUTPUT_16(n) \
+  OCB_OUTPUT_4((n) + 0); OCB_OUTPUT_4((n) + 4); OCB_OUTPUT_4((n) + 8); \
+  OCB_OUTPUT_4((n) + 12);
+
+  while (nblocks >= 64)
+    {
+      blkn += 64;
+      *pl = ocb_get_l(c, blkn - blkn % 64);
+
+      OCB_INPUT_16(0);
+      OCB_INPUT_16(16);
+      OCB_INPUT_16(32);
+      OCB_INPUT_16(48);
+
+      km_execute (function, ctx->keyschenc, outbuf, outbuf, 64 * BLOCKSIZE);
+
+      asm volatile ("xc 0(256, %[out]), 0(%[blocks])\n\t"
+                    "xc 256(256, %[out]), 256(%[blocks])\n\t"
+                    "xc 512(256, %[out]), 512(%[blocks])\n\t"
+                    "xc 768(256, %[out]), 768(%[blocks])\n\t"
+                    :
+                    : [out] "a" (outbuf), [blocks] "a" (blocks)
+                    : "memory");
+
+      /* Checksum the decrypted plaintext of this batch. */
+      aes_s390x_ocb_checksum (c->u_ctr.ctr, outbuf, 64);
+
+      max_blocks_used = 64;
+      inbuf += 64 * BLOCKSIZE;
+      outbuf += 64 * BLOCKSIZE;
+      nblocks -= 64;
+    }
+
+  if (nblocks)
+    {
+      unsigned int pos = 0;
+
+      max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used;
+
+      blkn += nblocks;
+      *pl = ocb_get_l(c, blkn - blkn % 64);
+
+      while (nblocks >= 16)
+        {
+          OCB_INPUT_16(pos + 0);
+          pos += 16;
+          nblocks -= 16;
+        }
+      while (nblocks >= 4)
+        {
+          OCB_INPUT_4(pos + 0);
+          pos += 4;
+          nblocks -= 4;
+        }
+      if (nblocks >= 2)
+        {
+          OCB_INPUT(pos + 0);
+          OCB_INPUT(pos + 1);
+          pos += 2;
+          nblocks -= 2;
+        }
+      if (nblocks >= 1)
+        {
+          OCB_INPUT(pos + 0);
+          pos += 1;
+          nblocks -= 1;
+        }
+
+      nblocks = pos;
+      pos = 0;
+      km_execute (function, ctx->keyschenc, outbuf, outbuf,
+                  nblocks * BLOCKSIZE);
+
+      while (nblocks >= 16)
+        {
+          OCB_OUTPUT_16(pos + 0);
+          pos += 16;
+          nblocks -= 16;
+        }
+      while (nblocks >= 4)
+        {
+          OCB_OUTPUT_4(pos + 0);
+          pos += 4;
+          nblocks -= 4;
+        }
+      if (nblocks >= 2)
+        {
+          OCB_OUTPUT(pos + 0);
+          OCB_OUTPUT(pos + 1);
+          pos += 2;
+          nblocks -= 2;
+        }
+      if (nblocks >= 1)
+        {
+          OCB_OUTPUT(pos + 0);
+          pos += 1;
+          nblocks -= 1;
+        }
+    }
+
+#undef OCB_INPUT
+#undef OCB_INPUT_4
+#undef OCB_INPUT_16
+#undef OCB_OUTPUT
+#undef OCB_OUTPUT_4
+#undef OCB_OUTPUT_16
+
+  c->u_mode.ocb.data_nblocks = blkn;
+  cipher_block_cpy (&c->u_iv.iv, &offset, BLOCKSIZE);
+
+  if (max_blocks_used)
+    wipememory (&blocks, max_blocks_used * BLOCKSIZE);
+
+  return 0;
+}
+
+static size_t
+aes_s390x_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+                     const void *inbuf_arg, size_t nblocks_arg, int encrypt)
+{
+  if (encrypt)
+    return aes_s390x_ocb_enc (c, outbuf_arg, inbuf_arg, nblocks_arg);
+  else
+    return aes_s390x_ocb_dec (c, outbuf_arg, inbuf_arg, nblocks_arg);
+}
+
+static size_t
+aes_s390x_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+                    size_t nblocks_arg)
+{
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  const unsigned char *abuf = abuf_arg;
+  u128_t blocks[64];
+  u128_t offset;
+  size_t max_blocks_used = 0;
+  u64 blkn = c->u_mode.ocb.aad_nblocks;
+  unsigned int function = ctx->km_func | KM_ENCRYPT;
+  const void *Ls[64];
+  const void **pl;
+
+  aes_s390x_ocb_prepare_Ls (c, blkn, Ls, &pl);
+
+  cipher_block_cpy (&offset, c->u_mode.ocb.aad_offset, BLOCKSIZE);
+
+#define OCB_INPUT(n) \
+  cipher_block_xor_2dst (&blocks[n], &offset, Ls[n], BLOCKSIZE); \
+  cipher_block_xor_1 (&blocks[n], abuf + (n) * BLOCKSIZE, BLOCKSIZE)
+
+#define OCB_INPUT_4(n) \
+  OCB_INPUT((n) + 0); OCB_INPUT((n) + 1); OCB_INPUT((n) + 2); \
+  OCB_INPUT((n) + 3)
+
+#define OCB_INPUT_16(n) \
+  OCB_INPUT_4((n) + 0); OCB_INPUT_4((n) + 4); OCB_INPUT_4((n) + 8); \
+  OCB_INPUT_4((n) + 12);
+
+  while (nblocks_arg >= 64)
+    {
+      blkn += 64;
+      *pl = ocb_get_l(c, blkn - blkn % 64);
+
+      OCB_INPUT_16(0);
+      OCB_INPUT_16(16);
+      OCB_INPUT_16(32);
+      OCB_INPUT_16(48);
+
+      km_execute (function, ctx->keyschenc, blocks, blocks, 64 * BLOCKSIZE);
+
+      aes_s390x_ocb_checksum (c->u_mode.ocb.aad_sum, blocks, 64);
+
+      max_blocks_used = 64;
+      abuf += 64 * BLOCKSIZE;
+      nblocks_arg -= 64;
+    }
+
+  if (nblocks_arg > 0)
+    {
+      size_t nblocks = nblocks_arg;
+      unsigned int pos = 0;
+
+      max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used;
+
+      blkn += nblocks;
+      *pl = ocb_get_l(c, blkn - blkn % 64);
+
+      while (nblocks >= 16)
+        {
+          OCB_INPUT_16(pos + 0);
+          pos += 16;
+          nblocks -= 16;
+        }
+      while (nblocks >= 4)
+        {
+          OCB_INPUT_4(pos + 0);
+          pos += 4;
+          nblocks -= 4;
+        }
+      if (nblocks >= 2)
+        {
+          OCB_INPUT(pos + 0);
+          OCB_INPUT(pos + 1);
+          pos += 2;
+          nblocks -= 2;
+        }
+      if (nblocks >= 1)
+        {
+          OCB_INPUT(pos + 0);
+          pos += 1;
+          nblocks -= 1;
+        }
+
+      nblocks = pos;
+      nblocks_arg -= pos;
+      pos = 0;
+      km_execute (function, ctx->keyschenc, blocks, blocks,
+                  nblocks * BLOCKSIZE);
+
+      aes_s390x_ocb_checksum (c->u_mode.ocb.aad_sum, blocks, nblocks);
+    }
+
+#undef OCB_INPUT
+#undef OCB_INPUT_4
+#undef OCB_INPUT_16
+
+  c->u_mode.ocb.aad_nblocks = blkn;
+  cipher_block_cpy (c->u_mode.ocb.aad_offset, &offset, BLOCKSIZE);
+
+  if (max_blocks_used)
+    wipememory (&blocks, max_blocks_used * BLOCKSIZE);
+
+  return 0;
+}
+
+int _gcry_aes_s390x_setup_acceleration(RIJNDAEL_context *ctx,
+                                       unsigned int keylen,
+                                       unsigned int hwfeatures,
+                                       cipher_bulk_ops_t *bulk_ops)
+{
+  unsigned int func;
+  unsigned int func_xts;
+  u128_t func_mask;
+  u128_t func_xts_mask;
+
+  if (!(hwfeatures & HWF_S390X_MSA))
+    return 0;
+
+  switch (keylen)
+    {
+    default:
+    case 16:
+      func = KM_FUNCTION_AES_128;
+      func_xts = KM_FUNCTION_XTS_AES_128;
+      func_mask = km_function_to_mask(KM_FUNCTION_AES_128);
+      func_xts_mask = km_function_to_mask(KM_FUNCTION_XTS_AES_128);
+      break;
+    case 24:
+      func = KM_FUNCTION_AES_192;
+      func_xts = 0;
+      func_mask = km_function_to_mask(KM_FUNCTION_AES_192);
+      func_xts_mask = 0; /* XTS-AES192 not available. */
+      break;
+    case 32:
+      func = KM_FUNCTION_AES_256;
+      func_xts = KM_FUNCTION_XTS_AES_256;
+      func_mask = km_function_to_mask(KM_FUNCTION_AES_256);
+      func_xts_mask = km_function_to_mask(KM_FUNCTION_XTS_AES_256);
+      break;
+    }
+
+  /* Query KM for supported algorithms and check if acceleration for
+   * requested key-length is available. */
+  if (!(km_query () & func_mask))
+    return 0;
+
+  ctx->km_func = func;
+
+  /* Query KM for supported XTS algorithms. */
+  if (km_query () & func_xts_mask)
+    ctx->km_func_xts = func_xts;
+
+  /* Query KMC for supported algorithms. */
+  if (kmc_query () & func_mask)
+    ctx->kmc_func = func;
+
+  /* Query KMAC for supported algorithms. */
+  if (kmac_query () & func_mask)
+    ctx->kmac_func = func;
+
+  if (hwfeatures & HWF_S390X_MSA_4)
+    {
+      /* Query KMF for supported algorithms. */
+      if (kmf_query () & func_mask)
+        ctx->kmf_func = func;
+
+      /* Query KMO for supported algorithms. */
+      if (kmo_query () & func_mask)
+        ctx->kmo_func = func;
+    }
+
+  if (hwfeatures & HWF_S390X_MSA_8)
+    {
+      /* Query KMA for supported algorithms. */
+      if (kma_query () & func_mask)
+        ctx->kma_func = func;
+    }
+
+  /* Setup zSeries bulk encryption/decryption routines. */
+
+  if (ctx->km_func)
+    {
+      bulk_ops->ocb_crypt = aes_s390x_ocb_crypt;
+      bulk_ops->ocb_auth = aes_s390x_ocb_auth;
+
+      /* CFB128 decryption uses the KM instruction, instead of KMF. */
+      bulk_ops->cfb_dec = aes_s390x_cfb128_dec;
+    }
+
+  if (ctx->km_func_xts)
+    {
+      bulk_ops->xts_crypt = aes_s390x_xts_crypt;
+    }
+
+  if (ctx->kmc_func)
+    {
+      if (ctx->kmac_func)
+        {
+          /* Either KMC or KMAC is used depending on the 'cbc_mac'
+           * parameter. */
+          bulk_ops->cbc_enc = aes_s390x_cbc_enc;
+        }
+
+      bulk_ops->cbc_dec = aes_s390x_cbc_dec;
+    }
+
+  if (ctx->kmf_func)
+    {
+      bulk_ops->cfb_enc = aes_s390x_cfb128_enc;
+    }
+
+  if (ctx->kmo_func)
+    {
+      bulk_ops->ofb_enc = aes_s390x_ofb_enc;
+    }
+
+  if (ctx->kma_func)
+    {
+      bulk_ops->ctr_enc = aes_s390x_ctr128_enc;
+
+      if (kimd_query () & km_function_to_mask (KMID_FUNCTION_GHASH))
+        {
+          /* A KIMD based GHASH implementation is required with AES-GCM
+           * acceleration. */
+          bulk_ops->gcm_crypt = aes_s390x_gcm_crypt;
+        }
+    }
+
+  return 1;
+}
+
+void _gcry_aes_s390x_setkey(RIJNDAEL_context *ctx, const byte *key)
+{
+  unsigned int keylen = 16 + (ctx->rounds - 10) * 4;
+  memcpy (ctx->keyschenc, key, keylen);
+}
+
+void _gcry_aes_s390x_prepare_decryption(RIJNDAEL_context *ctx)
+{
+  /* Do nothing. */
+  (void)ctx;
+}
+
+#endif /* USE_S390X_CRYPTO */
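[For context, a plausible sketch of how the generic rijndael.c key-setup path would use these entry points. The call site itself is not part of this diff, and the encrypt_fn/decrypt_fn field names are taken from rijndael-internal.h on the assumption that they apply here:

  /* Sketch only; the real wiring lives in rijndael.c's key setup. */
  if (_gcry_aes_s390x_setup_acceleration (ctx, keylen, hwfeatures,
                                          bulk_ops))
    {
      ctx->encrypt_fn = _gcry_aes_s390x_encrypt;
      ctx->decrypt_fn = _gcry_aes_s390x_decrypt;
      _gcry_aes_s390x_setkey (ctx, key);
    }
]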