diff options
Diffstat (limited to 'comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch32-ce.S')
-rw-r--r-- | comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch32-ce.S | 1867 |
1 files changed, 1867 insertions, 0 deletions
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch32-ce.S b/comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch32-ce.S new file mode 100644 index 0000000000..66440bd4eb --- /dev/null +++ b/comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch32-ce.S @@ -0,0 +1,1867 @@ +/* rijndael-armv8-aarch32-ce.S - ARMv8/CE accelerated AES + * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <config.h> + +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ + defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) + +.syntax unified +.arch armv8-a +.fpu crypto-neon-fp-armv8 +.arm + +.text + +#ifdef __PIC__ +# define GET_DATA_POINTER(reg, name, rtmp) \ + ldr reg, 1f; \ + ldr rtmp, 2f; \ + b 3f; \ + 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \ + 2: .word name(GOT); \ + 3: add reg, pc, reg; \ + ldr reg, [reg, rtmp]; +#else +# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name +#endif + + +/* AES macros */ + +#define aes_preload_keys(keysched, rekeysched) \ + vldmia keysched!, {q5-q7}; \ + mov rekeysched, keysched; \ + vldmialo keysched!, {q8-q15}; /* 128-bit */ \ + addeq keysched, #(2*16); \ + vldmiaeq keysched!, {q10-q15}; /* 192-bit */ \ + addhi keysched, #(4*16); \ + vldmiahi keysched!, {q12-q15}; /* 256-bit */ \ + +#define do_aes_one128(ed, mcimc, qo, qb) \ + aes##ed.8 qb, q5; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q6; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q7; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q8; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q9; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q10; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q11; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q12; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q13; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q14; \ + veor qo, qb, q15; + +#define do_aes_one128re(ed, mcimc, qo, qb, keysched, rekeysched) \ + vldm rekeysched, {q8-q9}; \ + do_aes_one128(ed, mcimc, qo, qb); + +#define do_aes_one192(ed, mcimc, qo, qb, keysched, rekeysched) \ + vldm rekeysched!, {q8}; \ + aes##ed.8 qb, q5; \ + aes##mcimc.8 qb, qb; \ + vldm rekeysched, {q9}; \ + aes##ed.8 qb, q6; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q7; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q8; \ + aes##mcimc.8 qb, qb; \ + vldmia keysched!, {q8}; \ + aes##ed.8 qb, q9; \ + aes##mcimc.8 qb, qb; \ + sub rekeysched, #(1*16); \ + aes##ed.8 qb, q10; \ + 
aes##mcimc.8 qb, qb; \ + vldm keysched, {q9}; \ + aes##ed.8 qb, q11; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q12; \ + aes##mcimc.8 qb, qb; \ + sub keysched, #16; \ + aes##ed.8 qb, q13; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q14; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q15; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q8; \ + veor qo, qb, q9; \ + +#define do_aes_one256(ed, mcimc, qo, qb, keysched, rekeysched) \ + vldmia rekeysched!, {q8}; \ + aes##ed.8 qb, q5; \ + aes##mcimc.8 qb, qb; \ + vldmia rekeysched!, {q9}; \ + aes##ed.8 qb, q6; \ + aes##mcimc.8 qb, qb; \ + vldmia rekeysched!, {q10}; \ + aes##ed.8 qb, q7; \ + aes##mcimc.8 qb, qb; \ + vldm rekeysched, {q11}; \ + aes##ed.8 qb, q8; \ + aes##mcimc.8 qb, qb; \ + vldmia keysched!, {q8}; \ + aes##ed.8 qb, q9; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q10; \ + aes##mcimc.8 qb, qb; \ + vldmia keysched!, {q9}; \ + aes##ed.8 qb, q11; \ + aes##mcimc.8 qb, qb; \ + sub rekeysched, #(3*16); \ + aes##ed.8 qb, q12; \ + aes##mcimc.8 qb, qb; \ + vldmia keysched!, {q10}; \ + aes##ed.8 qb, q13; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q14; \ + aes##mcimc.8 qb, qb; \ + vldm keysched, {q11}; \ + aes##ed.8 qb, q15; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q8; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q9; \ + aes##mcimc.8 qb, qb; \ + aes##ed.8 qb, q10; \ + veor qo, qb, q11; \ + sub keysched, #(3*16); \ + +#define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \ + aes##ed.8 b0, key; \ + aes##mcimc.8 b0, b0; \ + aes##ed.8 b1, key; \ + aes##mcimc.8 b1, b1; \ + aes##ed.8 b2, key; \ + aes##mcimc.8 b2, b2; \ + aes##ed.8 b3, key; \ + aes##mcimc.8 b3, b3; + +#define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, 
q11); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \ + aes##ed.8 b0, q14; \ + veor b0, b0, q15; \ + aes##ed.8 b1, q14; \ + veor b1, b1, q15; \ + aes##ed.8 b2, q14; \ + veor b2, b2, q15; \ + aes##ed.8 b3, q14; \ + veor b3, b3, q15; + +#define do_aes_4_128re(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \ + vldm rekeysched, {q8-q9}; \ + do_aes_4_128(ed, mcimc, b0, b1, b2, b3); + +#define do_aes_4_192(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \ + vldm rekeysched!, {q8}; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \ + vldm rekeysched, {q9}; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \ + vldmia keysched!, {q8}; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \ + sub rekeysched, #(1*16); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \ + vldm keysched, {q9}; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \ + sub keysched, #16; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q14); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q15); \ + aes##ed.8 b0, q8; \ + veor b0, b0, q9; \ + aes##ed.8 b1, q8; \ + veor b1, b1, q9; \ + aes##ed.8 b2, q8; \ + veor b2, b2, q9; \ + aes##ed.8 b3, q8; \ + veor b3, b3, q9; + +#define do_aes_4_256(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \ + vldmia rekeysched!, {q8}; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \ + vldmia rekeysched!, {q9}; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \ + vldmia rekeysched!, {q10}; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \ + vldm rekeysched, {q11}; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \ + vldmia keysched!, {q8}; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \ + vldmia keysched!, {q9}; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \ + sub rekeysched, #(3*16); \ + 
aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \ + vldmia keysched!, {q10}; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q14); \ + vldm keysched, {q11}; \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q15); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \ + sub keysched, #(3*16); \ + aes##ed.8 b0, q10; \ + veor b0, b0, q11; \ + aes##ed.8 b1, q10; \ + veor b1, b1, q11; \ + aes##ed.8 b2, q10; \ + veor b2, b2, q11; \ + aes##ed.8 b3, q10; \ + veor b3, b3, q11; + + +/* Other functional macros */ + +#define CLEAR_REG(reg) veor reg, reg; + + +/* + * unsigned int _gcry_aes_enc_armv8_ce(void *keysched, byte *dst, + * const byte *src, + * unsigned int nrounds); + */ +.align 3 +.globl _gcry_aes_enc_armv8_ce +.type _gcry_aes_enc_armv8_ce,%function; +_gcry_aes_enc_armv8_ce: + /* input: + * r0: keysched + * r1: dst + * r2: src + * r3: nrounds + */ + + vldmia r0!, {q1-q3} /* load 3 round keys */ + + cmp r3, #12 /* flags are consumed by the bhi/beq pair below */ + + vld1.8 {q0}, [r2] + + bhi .Lenc1_256 + beq .Lenc1_192 + +.Lenc1_128: + +.Lenc1_tail: + vldmia r0, {q8-q15} /* load 8 round keys */ + + aese.8 q0, q1 + aesmc.8 q0, q0 + CLEAR_REG(q1) + + aese.8 q0, q2 + aesmc.8 q0, q0 + CLEAR_REG(q2) + + aese.8 q0, q3 + aesmc.8 q0, q0 + CLEAR_REG(q3) + + aese.8 q0, q8 + aesmc.8 q0, q0 + CLEAR_REG(q8) + + aese.8 q0, q9 + aesmc.8 q0, q0 + CLEAR_REG(q9) + + aese.8 q0, q10 + aesmc.8 q0, q0 + CLEAR_REG(q10) + + aese.8 q0, q11 + aesmc.8 q0, q0 + CLEAR_REG(q11) + + aese.8 q0, q12 + aesmc.8 q0, q0 + CLEAR_REG(q12) + + aese.8 q0, q13 + aesmc.8 q0, q0 + CLEAR_REG(q13) + + aese.8 q0, q14 + veor q0, q15 + CLEAR_REG(q14) + CLEAR_REG(q15) + + vst1.8 {q0}, [r1] + CLEAR_REG(q0) + + mov r0, #0 /* return 0 */ + bx lr + +.Lenc1_192: + aese.8 q0, q1 + aesmc.8 q0, q0 + vmov q1, q3 /* keep the third round key in q1; q2-q3 are reloaded below */ + + aese.8 q0, q2 + aesmc.8 q0, q0 + vldm r0!, {q2-q3} /* load 2 round keys */ + + b .Lenc1_tail + +.Lenc1_256: + vldm r0!, {q15} /* load 1 round key */ + aese.8 q0, q1 + aesmc.8 q0, q0 + + aese.8 q0, q2 +
aesmc.8 q0, q0 + + aese.8 q0, q3 + aesmc.8 q0, q0 + vldm r0!, {q1-q3} /* load 3 round keys */ + + aese.8 q0, q15 + aesmc.8 q0, q0 + + b .Lenc1_tail +.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce; + + +/* + * unsigned int _gcry_aes_dec_armv8_ce(void *keysched, byte *dst, + * const byte *src, + * unsigned int nrounds); + */ +.align 3 +.globl _gcry_aes_dec_armv8_ce +.type _gcry_aes_dec_armv8_ce,%function; +_gcry_aes_dec_armv8_ce: + /* input: + * r0: keysched + * r1: dst + * r2: src + * r3: nrounds + */ + + vldmia r0!, {q1-q3} /* load 3 round keys */ + + cmp r3, #12 /* flags are consumed by the bhi/beq pair below */ + + vld1.8 {q0}, [r2] + + bhi .Ldec1_256 + beq .Ldec1_192 + +.Ldec1_128: + +.Ldec1_tail: + vldmia r0, {q8-q15} /* load 8 round keys */ + + aesd.8 q0, q1 + aesimc.8 q0, q0 + CLEAR_REG(q1) + + aesd.8 q0, q2 + aesimc.8 q0, q0 + CLEAR_REG(q2) + + aesd.8 q0, q3 + aesimc.8 q0, q0 + CLEAR_REG(q3) + + aesd.8 q0, q8 + aesimc.8 q0, q0 + CLEAR_REG(q8) + + aesd.8 q0, q9 + aesimc.8 q0, q0 + CLEAR_REG(q9) + + aesd.8 q0, q10 + aesimc.8 q0, q0 + CLEAR_REG(q10) + + aesd.8 q0, q11 + aesimc.8 q0, q0 + CLEAR_REG(q11) + + aesd.8 q0, q12 + aesimc.8 q0, q0 + CLEAR_REG(q12) + + aesd.8 q0, q13 + aesimc.8 q0, q0 + CLEAR_REG(q13) + + aesd.8 q0, q14 + veor q0, q15 + CLEAR_REG(q14) + CLEAR_REG(q15) + + vst1.8 {q0}, [r1] + CLEAR_REG(q0) + + mov r0, #0 /* return 0 */ + bx lr + +.Ldec1_192: + aesd.8 q0, q1 + aesimc.8 q0, q0 + vmov q1, q3 /* keep the third round key in q1; q2-q3 are reloaded below */ + + aesd.8 q0, q2 + aesimc.8 q0, q0 + vldm r0!, {q2-q3} /* load 2 round keys */ + + b .Ldec1_tail + +.Ldec1_256: + vldm r0!, {q15} /* load 1 round key */ + aesd.8 q0, q1 + aesimc.8 q0, q0 + + aesd.8 q0, q2 + aesimc.8 q0, q0 + + aesd.8 q0, q3 + aesimc.8 q0, q0 + vldm r0!, {q1-q3} /* load 3 round keys */ + + aesd.8 q0, q15 + aesimc.8 q0, q0 + + b .Ldec1_tail +.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce; + + +/* + * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, size_t nblocks, + * int cbc_mac, unsigned int nrounds);
+ */ + +.align 3 +.globl _gcry_aes_cbc_enc_armv8_ce +.type _gcry_aes_cbc_enc_armv8_ce,%function; +_gcry_aes_cbc_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: cbc_mac => r5 + * %st+8: nrounds => r6 + */ + + push {r4-r6,lr} /* 4*4 = 16b */ + ldr r4, [sp, #(16+0)] + ldr r5, [sp, #(16+4)] + cmp r4, #0 + ldr r6, [sp, #(16+8)] + beq .Lcbc_enc_skip + cmp r5, #0 + vpush {q4-q7} + moveq r5, #16 /* cbc_mac == 0: outbuf advances by 16 bytes per stored block */ + movne r5, #0 /* cbc_mac != 0: every block is stored to the same outbuf address */ + + cmp r6, #12 + vld1.8 {q1}, [r3] /* load IV */ + + aes_preload_keys(r0, lr); + + beq .Lcbc_enc_loop192 + bhi .Lcbc_enc_loop256 + +#define CBC_ENC(bits, ...) \ + .Lcbc_enc_loop##bits: \ + vld1.8 {q0}, [r2]!; /* load plaintext */ \ + veor q1, q0, q1; \ + subs r4, r4, #1; \ + \ + do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ + \ + vst1.8 {q1}, [r1], r5; /* store ciphertext */ \ + \ + bne .Lcbc_enc_loop##bits; \ + b .Lcbc_enc_done; + + CBC_ENC(128) + CBC_ENC(192, r0, lr) + CBC_ENC(256, r0, lr) + +#undef CBC_ENC + +.Lcbc_enc_done: + vst1.8 {q1}, [r3] /* store IV */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + vpop {q4-q7} + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lcbc_enc_skip: + pop {r4-r6,pc} +.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce; + + +/* + * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, size_t nblocks, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_cbc_dec_armv8_ce +.type _gcry_aes_cbc_dec_armv8_ce,%function; +_gcry_aes_cbc_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: nrounds => r5 + */ + + push {r4-r6,lr} /* 4*4 = 16b */ + ldr r4, [sp, #(16+0)] + ldr r5, [sp, #(16+4)] + cmp r4, #0 + beq .Lcbc_dec_skip + vpush {q4-q7} + + cmp r5, #12 + vld1.8 {q0}, [r3] /* load IV */ + + aes_preload_keys(r0, r6); + + beq
.Lcbc_dec_entry_192 + bhi .Lcbc_dec_entry_256 + +#define CBC_DEC(bits, ...) \ + .Lcbc_dec_entry_##bits: \ + cmp r4, #4; \ + blo .Lcbc_dec_loop_##bits; \ + \ + .Lcbc_dec_loop4_##bits: \ + \ + vld1.8 {q1-q2}, [r2]!; /* load ciphertext */ \ + sub r4, r4, #4; \ + vld1.8 {q3-q4}, [r2]; /* load ciphertext */ \ + cmp r4, #4; \ + sub r2, #32; /* rewind r2 to the first of the four blocks; they are re-read as IVs below */ \ + \ + do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + veor q1, q1, q0; \ + vld1.8 {q0}, [r2]!; /* load next IV */ \ + veor q2, q2, q0; \ + vld1.8 {q0}, [r2]!; /* load next IV */ \ + vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ + veor q3, q3, q0; \ + vld1.8 {q0}, [r2]!; /* load next IV */ \ + veor q4, q4, q0; \ + vld1.8 {q0}, [r2]!; /* load next IV */ \ + vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ + \ + bhs .Lcbc_dec_loop4_##bits; \ + cmp r4, #0; \ + beq .Lcbc_dec_done; \ + \ + .Lcbc_dec_loop_##bits: \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + subs r4, r4, #1; \ + vmov q2, q1; /* keep ciphertext: it becomes the next IV */ \ + \ + do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__); \ + \ + veor q1, q1, q0; \ + vmov q0, q2; \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + \ + bne .Lcbc_dec_loop_##bits; \ + b .Lcbc_dec_done; + + CBC_DEC(128) + CBC_DEC(192, r0, r6) + CBC_DEC(256, r0, r6) + +#undef CBC_DEC + +.Lcbc_dec_done: + vst1.8 {q0}, [r3] /* store IV */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + vpop {q4-q7} + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lcbc_dec_skip: + pop {r4-r6,pc} +.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce; + + +/* + * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, size_t nblocks, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_cfb_enc_armv8_ce +.type _gcry_aes_cfb_enc_armv8_ce,%function; +_gcry_aes_cfb_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: nrounds =>
r5 + */ + + push {r4-r6,lr} /* 4*4 = 16b */ + ldr r4, [sp, #(16+0)] + ldr r5, [sp, #(16+4)] + cmp r4, #0 + beq .Lcfb_enc_skip + vpush {q4-q7} + + cmp r5, #12 + vld1.8 {q0}, [r3] /* load IV */ + + aes_preload_keys(r0, r6); + + beq .Lcfb_enc_entry_192 + bhi .Lcfb_enc_entry_256 + +#define CFB_ENC(bits, ...) \ + .Lcfb_enc_entry_##bits: \ + .Lcfb_enc_loop_##bits: \ + vld1.8 {q1}, [r2]!; /* load plaintext */ \ + subs r4, r4, #1; \ + \ + do_aes_one##bits(e, mc, q0, q0, ##__VA_ARGS__); \ + \ + veor q0, q1, q0; \ + vst1.8 {q0}, [r1]!; /* store ciphertext */ \ + \ + bne .Lcfb_enc_loop_##bits; \ + b .Lcfb_enc_done; + + CFB_ENC(128) + CFB_ENC(192, r0, r6) + CFB_ENC(256, r0, r6) + +#undef CFB_ENC + +.Lcfb_enc_done: + vst1.8 {q0}, [r3] /* store IV */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + vpop {q4-q7} + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lcfb_enc_skip: + pop {r4-r6,pc} +.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce; + + +/* + * void _gcry_aes_cfb_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, size_t nblocks, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_cfb_dec_armv8_ce +.type _gcry_aes_cfb_dec_armv8_ce,%function; +_gcry_aes_cfb_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: nrounds => r5 + */ + + push {r4-r6,lr} /* 4*4 = 16b */ + ldr r4, [sp, #(16+0)] + ldr r5, [sp, #(16+4)] + cmp r4, #0 + beq .Lcfb_dec_skip + vpush {q4-q7} + + cmp r5, #12 + vld1.8 {q0}, [r3] /* load IV */ + + aes_preload_keys(r0, r6); + + beq .Lcfb_dec_entry_192 + bhi .Lcfb_dec_entry_256 + +#define CFB_DEC(bits, ...)
\ + .Lcfb_dec_entry_##bits: \ + cmp r4, #4; \ + blo .Lcfb_dec_loop_##bits; \ + \ + .Lcfb_dec_loop4_##bits: \ + \ + vld1.8 {q2-q3}, [r2]!; /* load ciphertext */ \ + vmov q1, q0; \ + sub r4, r4, #4; \ + vld1.8 {q4}, [r2]; /* load ciphertext */ \ + sub r2, #32; /* rewind r2 to the first of the four ciphertext blocks */ \ + cmp r4, #4; \ + \ + do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vld1.8 {q0}, [r2]!; /* load ciphertext */ \ + veor q1, q1, q0; \ + vld1.8 {q0}, [r2]!; /* load ciphertext */ \ + veor q2, q2, q0; \ + vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ + vld1.8 {q0}, [r2]!; /* load ciphertext */ \ + veor q3, q3, q0; \ + vld1.8 {q0}, [r2]!; /* load next IV / ciphertext */ \ + veor q4, q4, q0; \ + vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ + \ + bhs .Lcfb_dec_loop4_##bits; \ + cmp r4, #0; \ + beq .Lcfb_dec_done; \ + \ + .Lcfb_dec_loop_##bits: \ + \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + \ + subs r4, r4, #1; \ + \ + do_aes_one##bits(e, mc, q0, q0, ##__VA_ARGS__); \ + \ + veor q2, q1, q0; \ + vmov q0, q1; \ + vst1.8 {q2}, [r1]!; /* store plaintext */ \ + \ + bne .Lcfb_dec_loop_##bits; \ + b .Lcfb_dec_done; + + CFB_DEC(128) + CFB_DEC(192, r0, r6) + CFB_DEC(256, r0, r6) + +#undef CFB_DEC + +.Lcfb_dec_done: + vst1.8 {q0}, [r3] /* store IV */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + vpop {q4-q7} + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lcfb_dec_skip: + pop {r4-r6,pc} +.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce; + + +/* + * void _gcry_aes_ctr_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, size_t nblocks, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ctr_enc_armv8_ce +.type _gcry_aes_ctr_enc_armv8_ce,%function; +_gcry_aes_ctr_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: nrounds => r5 + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ +
ldr r4, [sp, #(104+0)] + ldr r5, [sp, #(104+4)] + cmp r4, #0 + beq .Lctr_enc_skip + + cmp r5, #12 + ldm r3, {r7-r10} + vld1.8 {q0}, [r3] /* load IV */ + rev r7, r7 + rev r8, r8 + rev r9, r9 + rev r10, r10 + + aes_preload_keys(r0, r6); + + beq .Lctr_enc_entry_192 + bhi .Lctr_enc_entry_256 + +#define CTR_ENC(bits, ...) \ + .Lctr_enc_entry_##bits: \ + cmp r4, #4; \ + blo .Lctr_enc_loop_##bits; \ + \ + .Lctr_enc_loop4_##bits: \ + cmp r10, #0xfffffffc; \ + sub r4, r4, #4; \ + blo .Lctr_enc_loop4_##bits##_nocarry; \ + cmp r9, #0xffffffff; \ + bne .Lctr_enc_loop4_##bits##_nocarry; \ + \ + adds r10, #1; \ + vmov q1, q0; \ + blcs .Lctr_overflow_one; \ + rev r11, r10; \ + vmov.32 d1[1], r11; \ + \ + adds r10, #1; \ + vmov q2, q0; \ + blcs .Lctr_overflow_one; \ + rev r11, r10; \ + vmov.32 d1[1], r11; \ + \ + adds r10, #1; \ + vmov q3, q0; \ + blcs .Lctr_overflow_one; \ + rev r11, r10; \ + vmov.32 d1[1], r11; \ + \ + adds r10, #1; \ + vmov q4, q0; \ + blcs .Lctr_overflow_one; \ + rev r11, r10; \ + vmov.32 d1[1], r11; \ + \ + b .Lctr_enc_loop4_##bits##_store_ctr; \ + \ + .Lctr_enc_loop4_##bits##_nocarry: \ + \ + veor q2, q2; \ + vrev64.8 q1, q0; \ + vceq.u32 d5, d5; \ + vadd.u64 q3, q2, q2; \ + vadd.u64 q4, q3, q2; \ + vadd.u64 q0, q3, q3; \ + vsub.u64 q2, q1, q2; \ + vsub.u64 q3, q1, q3; \ + vsub.u64 q4, q1, q4; \ + vsub.u64 q0, q1, q0; \ + vrev64.8 q1, q1; \ + vrev64.8 q2, q2; \ + vrev64.8 q3, q3; \ + vrev64.8 q0, q0; \ + vrev64.8 q4, q4; \ + add r10, #4; \ + \ + .Lctr_enc_loop4_##bits##_store_ctr: \ + \ + vst1.8 {q0}, [r3]; \ + cmp r4, #4; \ + vld1.8 {q0}, [r2]!; /* load ciphertext */ \ + \ + do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + veor q1, q1, q0; \ + vld1.8 {q0}, [r2]!; /* load ciphertext */ \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + veor q2, q2, q0; \ + veor q3, q3, q1; \ + vld1.8 {q0}, [r2]!; /* load ciphertext */ \ + vst1.8 {q2}, [r1]!; /* store plaintext */ \ + veor q4, q4, q0; \ + vld1.8 {q0}, 
[r3]; /* reload IV */ \ + vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ + \ + bhs .Lctr_enc_loop4_##bits; \ + cmp r4, #0; \ + beq .Lctr_enc_done; \ + \ + .Lctr_enc_loop_##bits: \ + \ + adds r10, #1; \ + vmov q1, q0; \ + blcs .Lctr_overflow_one; \ + rev r11, r10; \ + subs r4, r4, #1; \ + vld1.8 {q2}, [r2]!; /* load ciphertext */ \ + vmov.32 d1[1], r11; \ + \ + do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ + \ + veor q1, q2, q1; \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + \ + bne .Lctr_enc_loop_##bits; \ + b .Lctr_enc_done; + + CTR_ENC(128) + CTR_ENC(192, r0, r6) + CTR_ENC(256, r0, r6) + +#undef CTR_ENC + +.Lctr_enc_done: + vst1.8 {q0}, [r3] /* store IV */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lctr_enc_skip: + pop {r4-r12,lr} + vpop {q4-q7} + bx lr + +.Lctr_overflow_one: + adcs r9, #0 + adcs r8, #0 + adc r7, #0 + rev r11, r9 + rev r12, r8 + vmov.32 d1[0], r11 + rev r11, r7 + vmov.32 d0[1], r12 + vmov.32 d0[0], r11 + bx lr +.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce; + + +/* + * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *offset, + * unsigned char *checksum, + * unsigned char *L_table, + * size_t nblocks, + * unsigned int nrounds, + * unsigned int blkn); + */ + +.align 3 +.globl _gcry_aes_ocb_enc_armv8_ce +.type _gcry_aes_ocb_enc_armv8_ce,%function; +_gcry_aes_ocb_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: offset + * %st+0: checksum => r4 + * %st+4: Ls => r5 + * %st+8: nblocks => r6 (0 < nblocks <= 32) + * %st+12: nrounds => r7 + * %st+16: blkn => lr + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ + ldr r7, [sp, #(104+12)] + ldr r4, [sp, #(104+0)] + ldr r5, [sp, #(104+4)] + ldr r6, [sp, #(104+8)] + ldr lr, [sp, #(104+16)] + + cmp r7, #12 + vld1.8 
{q0}, [r3] /* load offset */ + + aes_preload_keys(r0, r12); + + beq .Locb_enc_entry_192 + bhi .Locb_enc_entry_256 + +#define OCB_ENC(bits, ...) \ + .Locb_enc_entry_##bits: \ + cmp r6, #4; \ + add lr, #1; \ + blo .Locb_enc_loop_##bits; \ + \ + .Locb_enc_loop4_##bits: \ + \ + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ + /* Checksum_i = Checksum_{i-1} xor P_i */ \ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ + \ + add r9, lr, #1; \ + add r10, lr, #2; \ + add r11, lr, #3; \ + rbit r8, lr; \ + add lr, lr, #4; \ + rbit r9, r9; \ + rbit r10, r10; \ + rbit r11, r11; \ + clz r8, r8; /* ntz(i+0) */ \ + clz r9, r9; /* ntz(i+1) */ \ + clz r10, r10; /* ntz(i+2) */ \ + clz r11, r11; /* ntz(i+3) */ \ + add r8, r5, r8, lsl #4; \ + add r9, r5, r9, lsl #4; \ + add r10, r5, r10, lsl #4; \ + add r11, r5, r11, lsl #4; \ + \ + sub r6, #4; \ + \ + vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \ + vld1.8 {q1-q2}, [r2]!; /* load P_i+<0-1> */ \ + vld1.8 {q8}, [r4]; /* load Checksum_{i-1} */ \ + veor q0, q0, q9; /* Offset_i+0 */ \ + vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \ + veor q8, q8, q1; /* Checksum_i+0 */ \ + veor q1, q1, q0; /* P_i+0 xor Offset_i+0 */\ + vld1.8 {q3-q4}, [r2]!; /* load P_i+<2-3> */ \ + vst1.8 {q0}, [r1]!; /* store Offset_i+0 */\ + veor q0, q0, q9; /* Offset_i+1 */ \ + vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \ + veor q8, q8, q2; /* Checksum_i+1 */ \ + veor q2, q2, q0; /* P_i+1 xor Offset_i+1 */\ + vst1.8 {q0}, [r1]!; /* store Offset_i+1 */\ + veor q0, q0, q9; /* Offset_i+2 */ \ + vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \ + veor q8, q8, q3; /* Checksum_i+2 */ \ + veor q3, q3, q0; /* P_i+2 xor Offset_i+2 */\ + vst1.8 {q0}, [r1]!; /* store Offset_i+2 */\ + veor q0, q0, q9; /* Offset_i+3 */ \ + veor q8, q8, q4; /* Checksum_i+3 */ \ + veor q4, q4, q0; /* P_i+3 xor Offset_i+3 */\ + vst1.8 {q0}, [r1]; /* store Offset_i+3 */\ + sub r1, #(3*16); \ + vst1.8 {q8}, [r4]; /* store Checksum_i+3 */\ + \ + cmp r6, #4; \ + \ + do_aes_4_##bits(e, mc, q1, q2, q3, 
q4, ##__VA_ARGS__); \ + \ + mov r8, r1; \ + vld1.8 {q8-q9}, [r1]!; \ + veor q1, q1, q8; \ + veor q2, q2, q9; \ + vld1.8 {q8-q9}, [r1]!; \ + vst1.8 {q1-q2}, [r8]!; \ + veor q3, q3, q8; \ + veor q4, q4, q9; \ + vst1.8 {q3-q4}, [r8]; \ + \ + bhs .Locb_enc_loop4_##bits; \ + cmp r6, #0; \ + beq .Locb_enc_done; \ + \ + .Locb_enc_loop_##bits: \ + \ + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ + /* Checksum_i = Checksum_{i-1} xor P_i */ \ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ + \ + rbit r8, lr; \ + add lr, #1; \ + clz r8, r8; /* ntz(i) */ \ + add r8, r5, r8, lsl #4; \ + \ + vld1.8 {q1}, [r2]!; /* load plaintext */ \ + vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \ + vld1.8 {q3}, [r4]; /* load checksum */ \ + subs r6, #1; \ + veor q0, q0, q2; \ + veor q3, q3, q1; \ + veor q1, q1, q0; \ + vst1.8 {q3}, [r4]; /* store checksum */ \ + \ + do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ + \ + veor q1, q1, q0; \ + vst1.8 {q1}, [r1]!; /* store ciphertext */ \ + \ + bne .Locb_enc_loop_##bits; \ + b .Locb_enc_done; + + OCB_ENC(128re, r0, r12) + OCB_ENC(192, r0, r12) + OCB_ENC(256, r0, r12) + +#undef OCB_ENC + +.Locb_enc_done: + vst1.8 {q0}, [r3] /* store offset */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + + pop {r4-r12,lr} + vpop {q4-q7} + bx lr +.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce; + + +/* + * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *offset, + * unsigned char *checksum, + * unsigned char *L_table, + * size_t nblocks, + * unsigned int nrounds, + * unsigned int blkn); + */ + +.align 3 +.globl _gcry_aes_ocb_dec_armv8_ce +.type _gcry_aes_ocb_dec_armv8_ce,%function; +_gcry_aes_ocb_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: offset + * %st+0: checksum => r4 + * %st+4: Ls => 
r5 + * %st+8: nblocks => r6 (0 < nblocks <= 32) + * %st+12: nrounds => r7 + * %st+16: blkn => lr + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ + ldr r7, [sp, #(104+12)] + ldr r4, [sp, #(104+0)] + ldr r5, [sp, #(104+4)] + ldr r6, [sp, #(104+8)] + ldr lr, [sp, #(104+16)] + + cmp r7, #12 + vld1.8 {q0}, [r3] /* load offset */ + + aes_preload_keys(r0, r12); + + beq .Locb_dec_entry_192 + bhi .Locb_dec_entry_256 + +#define OCB_DEC(bits, ...) \ + .Locb_dec_entry_##bits: \ + cmp r6, #4; \ + add lr, #1; \ + blo .Locb_dec_loop_##bits; \ + \ + .Locb_dec_loop4_##bits: \ + \ + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ + /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ + /* Checksum_i = Checksum_{i-1} xor P_i */ \ + \ + add r9, lr, #1; \ + add r10, lr, #2; \ + add r11, lr, #3; \ + rbit r8, lr; \ + add lr, lr, #4; \ + rbit r9, r9; \ + rbit r10, r10; \ + rbit r11, r11; \ + clz r8, r8; /* ntz(i+0) */ \ + clz r9, r9; /* ntz(i+1) */ \ + clz r10, r10; /* ntz(i+2) */ \ + clz r11, r11; /* ntz(i+3) */ \ + add r8, r5, r8, lsl #4; \ + add r9, r5, r9, lsl #4; \ + add r10, r5, r10, lsl #4; \ + add r11, r5, r11, lsl #4; \ + \ + sub r6, #4; \ + \ + vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \ + vld1.8 {q1-q2}, [r2]!; /* load P_i+<0-1> */ \ + veor q0, q0, q9; /* Offset_i+0 */ \ + vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \ + veor q1, q1, q0; /* P_i+0 xor Offset_i+0 */\ + vld1.8 {q3-q4}, [r2]!; /* load P_i+<2-3> */ \ + vst1.8 {q0}, [r1]!; /* store Offset_i+0 */\ + veor q0, q0, q9; /* Offset_i+1 */ \ + vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \ + veor q2, q2, q0; /* P_i+1 xor Offset_i+1 */\ + vst1.8 {q0}, [r1]!; /* store Offset_i+1 */\ + veor q0, q0, q9; /* Offset_i+2 */ \ + vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \ + veor q3, q3, q0; /* P_i+2 xor Offset_i+2 */\ + vst1.8 {q0}, [r1]!; /* store Offset_i+2 */\ + veor q0, q0, q9; /* Offset_i+3 */ \ + veor q4, q4, q0; /* P_i+3 xor Offset_i+3 */\ + vst1.8 {q0}, [r1]; /* store Offset_i+3 */\ + sub r1, #(3*16); \ 
+ \ + cmp r6, #4; \ + \ + do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + mov r8, r1; \ + vld1.8 {q8-q9}, [r1]!; \ + veor q1, q1, q8; \ + veor q2, q2, q9; \ + vld1.8 {q8-q9}, [r1]!; \ + vst1.8 {q1-q2}, [r8]!; \ + veor q1, q1, q2; \ + vld1.8 {q2}, [r4]; /* load Checksum_{i-1} */ \ + veor q3, q3, q8; \ + veor q1, q1, q3; \ + veor q4, q4, q9; \ + veor q1, q1, q4; \ + vst1.8 {q3-q4}, [r8]; \ + veor q2, q2, q1; \ + vst1.8 {q2}, [r4]; /* store Checksum_i+3 */ \ + \ + bhs .Locb_dec_loop4_##bits; \ + cmp r6, #0; \ + beq .Locb_dec_done; \ + \ + .Locb_dec_loop_##bits: \ + \ + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ + /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ + /* Checksum_i = Checksum_{i-1} xor P_i */ \ + \ + rbit r8, lr; \ + add lr, #1; \ + clz r8, r8; /* ntz(i) */ \ + add r8, r5, r8, lsl #4; \ + \ + vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + subs r6, #1; \ + veor q0, q0, q2; \ + veor q1, q1, q0; \ + \ + do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__) \ + \ + vld1.8 {q2}, [r4]; /* load checksum */ \ + veor q1, q1, q0; \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + veor q2, q2, q1; \ + vst1.8 {q2}, [r4]; /* store checksum */ \ + \ + bne .Locb_dec_loop_##bits; \ + b .Locb_dec_done; + + OCB_DEC(128re, r0, r12) + OCB_DEC(192, r0, r12) + OCB_DEC(256, r0, r12) + +#undef OCB_DEC + +.Locb_dec_done: + vst1.8 {q0}, [r3] /* store offset */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + + pop {r4-r12,lr} + vpop {q4-q7} + bx lr +.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce; + + +/* + * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, + * const unsigned char *abuf, + * unsigned char *offset, + * unsigned char *checksum, + * unsigned char *L_table, + * size_t nblocks, + * unsigned int nrounds, + * unsigned int blkn); + */ + +.align 3 +.globl 
_gcry_aes_ocb_auth_armv8_ce +.type _gcry_aes_ocb_auth_armv8_ce,%function; +_gcry_aes_ocb_auth_armv8_ce: + /* Processes nblocks 16-byte blocks from abuf: each block is xored with + * the running offset, encrypted, and xored into the checksum at [r3]. + * The final offset is written back to [r2]. + * + * input: + * r0: keysched + * r1: abuf + * r2: offset + * r3: checksum + * %st+0: Ls => r5 + * %st+4: nblocks => r6 (0 < nblocks <= 32) + * %st+8: nrounds => r7 + * %st+12: blkn => lr + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ + ldr r7, [sp, #(104+8)] + ldr r5, [sp, #(104+0)] + ldr r6, [sp, #(104+4)] + ldr lr, [sp, #(104+12)] + + cmp r7, #12 /* nrounds: lo = 128-bit, eq = 192-bit, hi = 256-bit key */ + vld1.8 {q0}, [r2] /* load offset */ + + aes_preload_keys(r0, r12); /* conditional loads use the flags of the cmp above */ + + beq .Locb_auth_entry_192 + bhi .Locb_auth_entry_256 /* otherwise fall through to the 128re entry */ + +#define OCB_AUTH(bits, ...) \ + .Locb_auth_entry_##bits: \ + cmp r6, #4; \ + add lr, #1; /* i = blkn + 1 */ \ + blo .Locb_auth_loop_##bits; /* fewer than 4 blocks: single-block loop */ \ + \ + .Locb_auth_loop4_##bits: \ + \ + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \ + \ + add r9, lr, #1; \ + add r10, lr, #2; \ + add r11, lr, #3; \ + rbit r8, lr; /* rbit then clz counts trailing zeros */ \ + add lr, lr, #4; \ + rbit r9, r9; \ + rbit r10, r10; \ + rbit r11, r11; \ + clz r8, r8; /* ntz(i+0) */ \ + clz r9, r9; /* ntz(i+1) */ \ + clz r10, r10; /* ntz(i+2) */ \ + clz r11, r11; /* ntz(i+3) */ \ + add r8, r5, r8, lsl #4; /* r8 = Ls + 16 * ntz(i+0) */ \ + add r9, r5, r9, lsl #4; \ + add r10, r5, r10, lsl #4; \ + add r11, r5, r11, lsl #4; \ + \ + sub r6, #4; \ + \ + vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \ + vld1.8 {q1-q2}, [r1]!; /* load A_i+<0-1> */ \ + veor q0, q0, q9; /* Offset_i+0 */ \ + vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \ + veor q1, q1, q0; /* A_i+0 xor Offset_i+0 */\ + vld1.8 {q3-q4}, [r1]!; /* load A_i+<2-3> */ \ + veor q0, q0, q9; /* Offset_i+1 */ \ + vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \ + veor q2, q2, q0; /* A_i+1 xor Offset_i+1 */\ + veor q0, q0, q9; /* Offset_i+2 */ \ + vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \ + veor q3, q3, q0; /* A_i+2 xor Offset_i+2 */\ + veor q0, q0, q9; /* Offset_i+3 */ \ + veor q4, q4, q0; /* A_i+3 xor Offset_i+3 */\ + \ + cmp r6, #4; /* flags for the bhs after the AES rounds */ \ + \ + do_aes_4_##bits(e, mc, q1, q2, q3, q4,
##__VA_ARGS__); \ + \ + veor q1, q1, q2; \ + veor q3, q3, q4; \ + vld1.8 {q2}, [r3]; /* load checksum */ \ + veor q1, q1, q3; /* q1 = xor of the four encrypted blocks */ \ + veor q2, q2, q1; \ + vst1.8 {q2}, [r3]; /* store checksum */ \ + \ + bhs .Locb_auth_loop4_##bits; /* relies on do_aes_4 leaving the cmp r6, #4 flags intact */ \ + cmp r6, #0; \ + beq .Locb_auth_done; \ + \ + .Locb_auth_loop_##bits: \ + \ + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \ + \ + rbit r8, lr; \ + add lr, #1; \ + clz r8, r8; /* ntz(i) */ \ + add r8, r5, r8, lsl #4; \ + \ + vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \ + vld1.8 {q1}, [r1]!; /* load aadtext */ \ + subs r6, #1; /* flags consumed by the bne below */ \ + veor q0, q0, q2; \ + vld1.8 {q2}, [r3]; /* load checksum */ \ + veor q1, q1, q0; \ + \ + do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__) \ + \ + veor q2, q2, q1; \ + vst1.8 {q2}, [r3]; /* store checksum */ \ + \ + bne .Locb_auth_loop_##bits; \ + b .Locb_auth_done; + + OCB_AUTH(128re, r0, r12) + OCB_AUTH(192, r0, r12) + OCB_AUTH(256, r0, r12) + +#undef OCB_AUTH + +.Locb_auth_done: + vst1.8 {q0}, [r2] /* store offset */ + + /* CLEAR_REG is defined elsewhere in the file; presumably wipes the register */ + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + + pop {r4-r12,lr} + vpop {q4-q7} + bx lr +.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce; + + + +/* + * void _gcry_aes_xts_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, size_t nblocks, + * unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_enc_armv8_ce +.type _gcry_aes_xts_enc_armv8_ce,%function; +_gcry_aes_xts_enc_armv8_ce: + /* Encrypts nblocks 16-byte blocks from inbuf to outbuf. The tweak is + * loaded from iv and the updated tweak is stored back at .Lxts_enc_done. + * + * input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: nrounds => r5 + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ + ldr r4, [sp, #(104+0)] + ldr r5, [sp, #(104+4)] + cmp r4, #0 /* nblocks == 0: skip everything */ + beq .Lxts_enc_skip + + cmp r5, #12 /* nrounds: lo = 128-bit, eq = 192-bit, hi = 256-bit key */ + + vld1.8 {q0}, [r3] /* load tweak */ + mov r7, #0x87; /* xored into the tweak when doubling shifts out its top bit */ + + aes_preload_keys(r0, r6); /* conditional loads use the flags of cmp r5, #12 */ + + beq .Lxts_enc_entry_192 +
bhi .Lxts_enc_entry_256 /* otherwise fall through to the 128re entry */ + +#define CTR_XTS(bits, ...) \ + .Lxts_enc_entry_##bits: \ + cmp r4, #4; \ + blo .Lxts_enc_loop_##bits; /* fewer than 4 blocks: single-block loop */ \ + \ + .Lxts_enc_loop4_##bits: \ + sub r4, r4, #4; \ + veor q9, q9, q9; \ + \ + vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ + veor q1, q1, q0; \ + cmp r4, #4; /* flags for the bhs after the AES rounds */ \ + vmov.u32 d18[0], r7; /* q9 = { r7, 0, 0, 0 } */ \ + vst1.8 {q0}, [r1]!; /* store tweak0 to temp (outbuf) */ \ + \ + vshr.s64 d16, d1, #63; /* d16 = all-ones if bit 63 of d1 is set */ \ + vshr.u64 d17, d0, #63; /* d17 = bit 63 of d0 (carry into d1) */ \ + vadd.u64 q0, q0, q0; /* shift each half left by one bit */ \ + vand d16, d16, d18; /* d16 = r7 constant or zero */ \ + veor q0, q0, q8; /* q0 = tweak1 */ \ + \ + vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ + veor q2, q2, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; /* q0 = tweak2 */ \ + \ + veor q3, q3, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; /* q0 = tweak3 */ \ + \ + veor q4, q4, q0; \ + vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \ + sub r1, r1, #48; /* r1 back to the tweak0 slot */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; /* q0 = tweak for the next block */ \ + \ + do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \ + veor q1, q1, q8; \ + veor q2, q2, q9; \ + vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \ + sub r1, r1, #32; /* r1 back to the first of the four blocks */ \ + veor q3, q3, q8; \ + veor q4, q4, q9; \ + vst1.8 {q1-q2}, [r1]!; /* store ciphertext */ \ + vst1.8 {q3-q4}, [r1]!; /* store ciphertext */ \ + \ + bhs .Lxts_enc_loop4_##bits; /* relies on do_aes_4 leaving the cmp r4, #4 flags intact */ \ + cmp r4, #0; \ + beq .Lxts_enc_done; \ + \ + .Lxts_enc_loop_##bits: \ + \ + vld1.8 {q1}, [r2]!; /* load plaintext */ \ + \ + veor q9, q9, q9; \ + veor q1, q1, q0; \ + vmov.u32 d18[0], r7; /* q9 = { r7, 0, 0, 0 } */ \ + vmov q2, q0; /* keep this block's tweak */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; /* q0 = tweak for the next block */ \ + subs r4, r4, #1; /* flags consumed by the bne below */ \ + \ + do_aes_one##bits(e, mc, q1, q1,
##__VA_ARGS__); \ + \ + veor q1, q1, q2; /* xor with this block's tweak */ \ + vst1.8 {q1}, [r1]!; /* store ciphertext */ \ + \ + bne .Lxts_enc_loop_##bits; \ + b .Lxts_enc_done; + + CTR_XTS(128re, r0, r6) + CTR_XTS(192, r0, r6) + CTR_XTS(256, r0, r6) + +#undef CTR_XTS + +.Lxts_enc_done: + vst1.8 {q0}, [r3] /* store tweak */ + + /* CLEAR_REG is defined elsewhere in the file; presumably wipes the register */ + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lxts_enc_skip: /* reached directly when nblocks == 0 */ + pop {r4-r12,lr} + vpop {q4-q7} + bx lr +.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce; + + +/* + * void _gcry_aes_xts_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, size_t nblocks, + * unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_xts_dec_armv8_ce +.type _gcry_aes_xts_dec_armv8_ce,%function; +_gcry_aes_xts_dec_armv8_ce: + /* Decrypts nblocks 16-byte blocks from inbuf to outbuf. The tweak is + * loaded from iv and the updated tweak is stored back at .Lxts_dec_done. + * + * input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: nrounds => r5 + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ + ldr r4, [sp, #(104+0)] + ldr r5, [sp, #(104+4)] + cmp r4, #0 /* nblocks == 0: skip everything */ + beq .Lxts_dec_skip + + cmp r5, #12 /* nrounds: lo = 128-bit, eq = 192-bit, hi = 256-bit key */ + + vld1.8 {q0}, [r3] /* load tweak */ + mov r7, #0x87; /* xored into the tweak when doubling shifts out its top bit */ + + aes_preload_keys(r0, r6); /* conditional loads use the flags of cmp r5, #12 */ + + beq .Lxts_dec_entry_192 + bhi .Lxts_dec_entry_256 /* otherwise fall through to the 128re entry */ + +#define CTR_XTS(bits, ...)
\ + .Lxts_dec_entry_##bits: \ + cmp r4, #4; \ + blo .Lxts_dec_loop_##bits; /* fewer than 4 blocks: single-block loop */ \ + \ + .Lxts_dec_loop4_##bits: \ + sub r4, r4, #4; \ + veor q9, q9, q9; \ + \ + vld1.8 {q1-q2}, [r2]!; /* load ciphertext */ \ + veor q1, q1, q0; \ + cmp r4, #4; /* flags for the bhs after the AES rounds */ \ + vmov.u32 d18[0], r7; /* q9 = { r7, 0, 0, 0 } */ \ + vst1.8 {q0}, [r1]!; /* store tweak0 to temp (outbuf) */ \ + \ + vshr.s64 d16, d1, #63; /* d16 = all-ones if bit 63 of d1 is set */ \ + vshr.u64 d17, d0, #63; /* d17 = bit 63 of d0 (carry into d1) */ \ + vadd.u64 q0, q0, q0; /* shift each half left by one bit */ \ + vand d16, d16, d18; /* d16 = r7 constant or zero */ \ + veor q0, q0, q8; /* q0 = tweak1 */ \ + \ + vld1.8 {q3-q4}, [r2]!; /* load ciphertext */ \ + veor q2, q2, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; /* q0 = tweak2 */ \ + \ + veor q3, q3, q0; \ + vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; /* q0 = tweak3 */ \ + \ + veor q4, q4, q0; \ + vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \ + sub r1, r1, #48; /* r1 back to the tweak0 slot */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; /* q0 = tweak for the next block */ \ + \ + do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \ + veor q1, q1, q8; \ + veor q2, q2, q9; \ + vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \ + sub r1, r1, #32; /* r1 back to the first of the four blocks */ \ + veor q3, q3, q8; \ + veor q4, q4, q9; \ + vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \ + vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ + \ + bhs .Lxts_dec_loop4_##bits; /* relies on do_aes_4 leaving the cmp r4, #4 flags intact */ \ + cmp r4, #0; \ + beq .Lxts_dec_done; \ + \ + .Lxts_dec_loop_##bits: \ + \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + \ + veor q9, q9, q9; \ + veor q1, q1, q0; \ + vmov.u32 d18[0], r7; /* q9 = { r7, 0, 0, 0 } */ \ + vmov q2, q0; /* keep this block's tweak */ \ + \ + vshr.s64 d16, d1, #63; \ + vshr.u64 d17, d0, #63; \ + vadd.u64 q0, q0, q0; \ + vand d16, d16, d18; \ + veor q0, q0, q8; /* q0 = tweak for the next block */ \ + subs r4, r4, #1; /* flags consumed by the bne after the AES rounds */ \ + \ + do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__); \ + \ + veor q1, q1, q2; /* xor with this block's tweak */ \ + vst1.8 {q1}, [r1]!;
/* store plaintext */ \ + \ + bne .Lxts_dec_loop_##bits; \ + b .Lxts_dec_done; + + CTR_XTS(128re, r0, r6) + CTR_XTS(192, r0, r6) + CTR_XTS(256, r0, r6) + +#undef CTR_XTS + +.Lxts_dec_done: + vst1.8 {q0}, [r3] /* store tweak */ + + /* CLEAR_REG is defined elsewhere in the file; presumably wipes the register */ + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lxts_dec_skip: /* reached directly when nblocks == 0 */ + pop {r4-r12,lr} + vpop {q4-q7} + bx lr +.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce; + + +/* + * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b); + * + * Takes in4b in r0 and returns the result in r0. + */ +.align 3 +.globl _gcry_aes_sbox4_armv8_ce +.type _gcry_aes_sbox4_armv8_ce,%function; +_gcry_aes_sbox4_armv8_ce: + /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in + * Cryptology — CT-RSA 2015" for details. + */ + vmov.i8 q0, #0x52 /* fill q0 with 0x52, the byte the AES S-box maps to 0x00 */ + vmov.i8 q1, #0 /* all-zero round key */ + vmov s0, r0 /* low word of q0 = in4b */ + aese.8 q0, q1 /* AESE with zero key: SubBytes + ShiftRows */ + veor d0, d1 /* d0 = d0 xor d1 */ + vpadd.i32 d0, d0, d1 /* s0 = sum of the two words of d0 */ + vmov r0, s0 /* return value */ + CLEAR_REG(q0) + bx lr +.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce; + + +/* + * void _gcry_aes_invmixcol_armv8_ce(void *dst, const void *src); + * + * Reads 16 bytes at src (r1), applies aesimc, writes 16 bytes to dst (r0). + */ +.align 3 +.globl _gcry_aes_invmixcol_armv8_ce +.type _gcry_aes_invmixcol_armv8_ce,%function; +_gcry_aes_invmixcol_armv8_ce: + vld1.8 {q0}, [r1] /* q0 = 16 bytes at src */ + aesimc.8 q0, q0 /* AES InvMixColumns */ + vst1.8 {q0}, [r0] /* store 16 bytes to dst */ + CLEAR_REG(q0) + bx lr +.size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce; + +#endif |