Diffstat (limited to 'comm/third_party/libgcrypt/cipher/rijndael-aarch64.S')
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-aarch64.S  514
1 file changed, 514 insertions, 0 deletions
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-aarch64.S b/comm/third_party/libgcrypt/cipher/rijndael-aarch64.S
new file mode 100644
index 0000000000..e77dd4e0b8
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-aarch64.S
@@ -0,0 +1,514 @@
+/* rijndael-aarch64.S - ARMv8/AArch64 assembly implementation of AES cipher
+ *
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__)
+#ifdef HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS
+
+.text
+
+/* register macros */
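+/*
+ * Register usage:
+ *   CTX                - pointer to the expanded key schedule.
+ *   RDST/RSRC          - destination/source block pointers.
+ *   NROUNDS            - number of rounds (10, 12 or 14).
+ *   RTAB               - lookup table pointer.
+ *   RMASK              - byte-extraction mask; for the inner rounds it is
+ *                        0xff << 2, i.e. pre-scaled by 4 so the extracted
+ *                        byte indexes the 32-bit table entries directly.
+ *   RA..RD / RNA..RND  - current and next-round state words.
+ *   RT0..RT2           - scratch registers for table lookups.
+ *   xRT0..xRT2, xw8..xw15 - 64-bit aliases, needed where a load uses the
+ *                        value as an Xn index register.
+ */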
+#define CTX x0
+#define RDST x1
+#define RSRC x2
+#define NROUNDS w3
+#define RTAB x4
+#define RMASK w5
+
+#define RA w8
+#define RB w9
+#define RC w10
+#define RD w11
+
+#define RNA w12
+#define RNB w13
+#define RNC w14
+#define RND w15
+
+#define RT0 w6
+#define RT1 w7
+#define RT2 w16
+#define xRT0 x6
+#define xRT1 x7
+#define xRT2 x16
+
+#define xw8 x8
+#define xw9 x9
+#define xw10 x10
+#define xw11 x11
+
+#define xw12 x12
+#define xw13 x13
+#define xw14 x14
+#define xw15 x15
+
+/***********************************************************************
+ * ARMv8/AArch64 assembly implementation of the AES cipher
+ ***********************************************************************/
+#define preload_first_key(round, ra) \
+ ldr ra, [CTX, #(((round) * 16) + 0 * 4)];
+
+#define dummy(round, ra) /* nothing */
+
+#define addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ ldp rna, rnb, [CTX]; \
+ ldp rnc, rnd, [CTX, #8]; \
+ eor ra, ra, rna; \
+ eor rb, rb, rnb; \
+ eor rc, rc, rnc; \
+ preload_key(1, rna); \
+ eor rd, rd, rnd;
+
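+/* One inner encryption round.  Each byte of every state word selects one
+ * 32-bit entry of the lookup table (the index is already scaled by 4, see
+ * RMASK above) and is xored into a different next-round word, with the
+ * column rotation applied via the 'ror' shifted operand on the eor.  The
+ * next round key is loaded from CTX interleaved with the lookups.  */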
+#define do_encround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
+ \
+ and RT0, RMASK, ra, lsl#2; \
+ ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
+ and RT1, RMASK, ra, lsr#(8 - 2); \
+ ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
+ and RT2, RMASK, ra, lsr#(16 - 2); \
+ ldr RT0, [RTAB, xRT0]; \
+ and ra, RMASK, ra, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rna, rna, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rd, lsl#2; \
+ ldr ra, [RTAB, x##ra]; \
+ \
+ eor rnd, rnd, RT1, ror #24; \
+ and RT1, RMASK, rd, lsr#(8 - 2); \
+ eor rnc, rnc, RT2, ror #16; \
+ and RT2, RMASK, rd, lsr#(16 - 2); \
+ eor rnb, rnb, ra, ror #8; \
+ ldr RT0, [RTAB, xRT0]; \
+ and rd, RMASK, rd, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rnd, rnd, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rc, lsl#2; \
+ ldr rd, [RTAB, x##rd]; \
+ \
+ eor rnc, rnc, RT1, ror #24; \
+ and RT1, RMASK, rc, lsr#(8 - 2); \
+ eor rnb, rnb, RT2, ror #16; \
+ and RT2, RMASK, rc, lsr#(16 - 2); \
+ eor rna, rna, rd, ror #8; \
+ ldr RT0, [RTAB, xRT0]; \
+ and rc, RMASK, rc, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rnc, rnc, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rb, lsl#2; \
+ ldr rc, [RTAB, x##rc]; \
+ \
+ eor rnb, rnb, RT1, ror #24; \
+ and RT1, RMASK, rb, lsr#(8 - 2); \
+ eor rna, rna, RT2, ror #16; \
+ and RT2, RMASK, rb, lsr#(16 - 2); \
+ eor rnd, rnd, rc, ror #8; \
+ ldr RT0, [RTAB, xRT0]; \
+ and rb, RMASK, rb, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rnb, rnb, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ eor rna, rna, RT1, ror #24; \
+ ldr rb, [RTAB, x##rb]; \
+ \
+ eor rnd, rnd, RT2, ror #16; \
+ preload_key((next_r) + 1, ra); \
+ eor rnc, rnc, rb, ror #8;
+
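+/* Final encryption round: MixColumns is skipped, so only single table bytes
+ * are needed.  The caller has advanced RTAB by one byte and the indices stay
+ * scaled by 4, so each ldrb reads one byte out of every 32-bit entry
+ * (presumably the plain S-box byte of the combined table).  The bytes are
+ * merged with orr using the same rotations as above.  */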
+#define do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ and RT0, RMASK, ra, lsl#2; \
+ and RT1, RMASK, ra, lsr#(8 - 2); \
+ and RT2, RMASK, ra, lsr#(16 - 2); \
+ ldrb rna, [RTAB, xRT0]; \
+ and ra, RMASK, ra, lsr#(24 - 2); \
+ ldrb rnd, [RTAB, xRT1]; \
+ and RT0, RMASK, rd, lsl#2; \
+ ldrb rnc, [RTAB, xRT2]; \
+ ror rnd, rnd, #24; \
+ ldrb rnb, [RTAB, x##ra]; \
+ and RT1, RMASK, rd, lsr#(8 - 2); \
+ ror rnc, rnc, #16; \
+ and RT2, RMASK, rd, lsr#(16 - 2); \
+ ror rnb, rnb, #8; \
+ ldrb RT0, [RTAB, xRT0]; \
+ and rd, RMASK, rd, lsr#(24 - 2); \
+ ldrb RT1, [RTAB, xRT1]; \
+ \
+ orr rnd, rnd, RT0; \
+ ldrb RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rc, lsl#2; \
+ ldrb rd, [RTAB, x##rd]; \
+ orr rnc, rnc, RT1, ror #24; \
+ and RT1, RMASK, rc, lsr#(8 - 2); \
+ orr rnb, rnb, RT2, ror #16; \
+ and RT2, RMASK, rc, lsr#(16 - 2); \
+ orr rna, rna, rd, ror #8; \
+ ldrb RT0, [RTAB, xRT0]; \
+ and rc, RMASK, rc, lsr#(24 - 2); \
+ ldrb RT1, [RTAB, xRT1]; \
+ \
+ orr rnc, rnc, RT0; \
+ ldrb RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rb, lsl#2; \
+ ldrb rc, [RTAB, x##rc]; \
+ orr rnb, rnb, RT1, ror #24; \
+ and RT1, RMASK, rb, lsr#(8 - 2); \
+ orr rna, rna, RT2, ror #16; \
+ ldrb RT0, [RTAB, xRT0]; \
+ and RT2, RMASK, rb, lsr#(16 - 2); \
+ ldrb RT1, [RTAB, xRT1]; \
+ orr rnd, rnd, rc, ror #8; \
+ ldrb RT2, [RTAB, xRT2]; \
+ and rb, RMASK, rb, lsr#(24 - 2); \
+ ldrb rb, [RTAB, x##rb]; \
+ \
+ orr rnb, rnb, RT0; \
+ orr rna, rna, RT1, ror #24; \
+ orr rnd, rnd, RT2, ror #16; \
+ orr rnc, rnc, rb, ror #8;
+
+#define firstencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key); \
+ do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key);
+
+#define encround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key);
+
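+/* Last round wrapper: point CTX at the final round key, bias RTAB by one
+ * byte for the byte-wide lookups, then do the final AddRoundKey.  */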
+#define lastencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ add CTX, CTX, #(((round) + 1) * 16); \
+ add RTAB, RTAB, #1; \
+ do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \
+ addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy);
+
+.globl _gcry_aes_arm_encrypt_block
+ELF(.type _gcry_aes_arm_encrypt_block,%function;)
+
+_gcry_aes_arm_encrypt_block:
+ /* input:
+ * %x0: keysched, CTX
+ * %x1: dst
+ * %x2: src
+	 *	%w3: number of rounds (10, 12 or 14)
+ * %x4: encryption table
+ */
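+	/* Roughly the C-level signature this entry point corresponds to
+	 * (illustrative only; parameter names and exact types are assumptions,
+	 * the authoritative declaration lives in the C sources):
+	 *
+	 *   unsigned int _gcry_aes_arm_encrypt_block(const void *keysched,
+	 *                                            unsigned char *dst,
+	 *                                            const unsigned char *src,
+	 *                                            int nrounds,
+	 *                                            const void *enc_table);
+	 *
+	 * x0 is set to 0 before returning.
+	 */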
+ CFI_STARTPROC();
+
+ /* read input block */
+
+ /* aligned load */
+ ldp RA, RB, [RSRC];
+ ldp RC, RD, [RSRC, #8];
+#ifndef __AARCH64EL__
+ rev RA, RA;
+ rev RB, RB;
+ rev RC, RC;
+ rev RD, RD;
+#endif
+
+ mov RMASK, #(0xff<<2);
+
+ firstencround(0, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+ encround(1, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(2, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(3, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(4, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(5, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+
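+	/* 10 rounds (AES-128) falls through; 12 and 14 rounds (AES-192/256)
+	 * are handled at .Lenc_not_128 below.  */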
+ cmp NROUNDS, #12;
+ bge .Lenc_not_128;
+
+ encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+ lastencround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+.Lenc_done:
+
+ /* store output block */
+
+ /* aligned store */
+#ifndef __AARCH64EL__
+ rev RA, RA;
+ rev RB, RB;
+ rev RC, RC;
+ rev RD, RD;
+#endif
+ /* write output block */
+ stp RA, RB, [RDST];
+ stp RC, RD, [RDST, #8];
+
+ mov x0, #(0);
+ ret;
+
+.ltorg
+.Lenc_not_128:
+ beq .Lenc_192
+
+ encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(12, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+ lastencround(13, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+ b .Lenc_done;
+
+.ltorg
+.Lenc_192:
+ encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+ lastencround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+ b .Lenc_done;
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_arm_encrypt_block,.-_gcry_aes_arm_encrypt_block;)
+
+#define addroundkey_dec(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ ldr rna, [CTX, #(((round) * 16) + 0 * 4)]; \
+ ldr rnb, [CTX, #(((round) * 16) + 1 * 4)]; \
+ eor ra, ra, rna; \
+ ldr rnc, [CTX, #(((round) * 16) + 2 * 4)]; \
+ eor rb, rb, rnb; \
+ ldr rnd, [CTX, #(((round) * 16) + 3 * 4)]; \
+ eor rc, rc, rnc; \
+ preload_first_key((round) - 1, rna); \
+ eor rd, rd, rnd;
+
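+/* One inner decryption round: same lookup scheme as do_encround, but the key
+ * schedule is walked downwards (next_r decreases) and the state words are
+ * consumed in the order ra, rb, rc, rd rather than ra, rd, rc, rb, matching
+ * the inverse ShiftRows direction.  */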
+#define do_decround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
+ \
+ and RT0, RMASK, ra, lsl#2; \
+ ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
+ and RT1, RMASK, ra, lsr#(8 - 2); \
+ ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
+ and RT2, RMASK, ra, lsr#(16 - 2); \
+ ldr RT0, [RTAB, xRT0]; \
+ and ra, RMASK, ra, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rna, rna, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rb, lsl#2; \
+ ldr ra, [RTAB, x##ra]; \
+ \
+ eor rnb, rnb, RT1, ror #24; \
+ and RT1, RMASK, rb, lsr#(8 - 2); \
+ eor rnc, rnc, RT2, ror #16; \
+ and RT2, RMASK, rb, lsr#(16 - 2); \
+ eor rnd, rnd, ra, ror #8; \
+ ldr RT0, [RTAB, xRT0]; \
+ and rb, RMASK, rb, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rnb, rnb, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rc, lsl#2; \
+ ldr rb, [RTAB, x##rb]; \
+ \
+ eor rnc, rnc, RT1, ror #24; \
+ and RT1, RMASK, rc, lsr#(8 - 2); \
+ eor rnd, rnd, RT2, ror #16; \
+ and RT2, RMASK, rc, lsr#(16 - 2); \
+ eor rna, rna, rb, ror #8; \
+ ldr RT0, [RTAB, xRT0]; \
+ and rc, RMASK, rc, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rnc, rnc, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rd, lsl#2; \
+ ldr rc, [RTAB, x##rc]; \
+ \
+ eor rnd, rnd, RT1, ror #24; \
+ and RT1, RMASK, rd, lsr#(8 - 2); \
+ eor rna, rna, RT2, ror #16; \
+ and RT2, RMASK, rd, lsr#(16 - 2); \
+ eor rnb, rnb, rc, ror #8; \
+ ldr RT0, [RTAB, xRT0]; \
+ and rd, RMASK, rd, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rnd, rnd, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ eor rna, rna, RT1, ror #24; \
+ ldr rd, [RTAB, x##rd]; \
+ \
+ eor rnb, rnb, RT2, ror #16; \
+ preload_key((next_r) - 1, ra); \
+ eor rnc, rnc, rd, ror #8;
+
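+/* Final decryption round: by now the caller has reset RMASK to plain 0xff
+ * (indices are no longer pre-scaled) and advanced RTAB by 4*256 bytes past
+ * the word-sized table, so the ldrb lookups read from the byte table stored
+ * there, presumably the inverse S-box.  */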
+#define do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ and RT0, RMASK, ra; \
+ and RT1, RMASK, ra, lsr#8; \
+ and RT2, RMASK, ra, lsr#16; \
+ ldrb rna, [RTAB, xRT0]; \
+ lsr ra, ra, #24; \
+ ldrb rnb, [RTAB, xRT1]; \
+ and RT0, RMASK, rb; \
+ ldrb rnc, [RTAB, xRT2]; \
+ ror rnb, rnb, #24; \
+ ldrb rnd, [RTAB, x##ra]; \
+ and RT1, RMASK, rb, lsr#8; \
+ ror rnc, rnc, #16; \
+ and RT2, RMASK, rb, lsr#16; \
+ ror rnd, rnd, #8; \
+ ldrb RT0, [RTAB, xRT0]; \
+ lsr rb, rb, #24; \
+ ldrb RT1, [RTAB, xRT1]; \
+ \
+ orr rnb, rnb, RT0; \
+ ldrb RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rc; \
+ ldrb rb, [RTAB, x##rb]; \
+ orr rnc, rnc, RT1, ror #24; \
+ and RT1, RMASK, rc, lsr#8; \
+ orr rnd, rnd, RT2, ror #16; \
+ and RT2, RMASK, rc, lsr#16; \
+ orr rna, rna, rb, ror #8; \
+ ldrb RT0, [RTAB, xRT0]; \
+ lsr rc, rc, #24; \
+ ldrb RT1, [RTAB, xRT1]; \
+ \
+ orr rnc, rnc, RT0; \
+ ldrb RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rd; \
+ ldrb rc, [RTAB, x##rc]; \
+ orr rnd, rnd, RT1, ror #24; \
+ and RT1, RMASK, rd, lsr#8; \
+ orr rna, rna, RT2, ror #16; \
+ ldrb RT0, [RTAB, xRT0]; \
+ and RT2, RMASK, rd, lsr#16; \
+ ldrb RT1, [RTAB, xRT1]; \
+ orr rnb, rnb, rc, ror #8; \
+ ldrb RT2, [RTAB, xRT2]; \
+ lsr rd, rd, #24; \
+ ldrb rd, [RTAB, x##rd]; \
+ \
+ orr rnd, rnd, RT0; \
+ orr rna, rna, RT1, ror #24; \
+ orr rnb, rnb, RT2, ror #16; \
+ orr rnc, rnc, rd, ror #8;
+
+#define firstdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ addroundkey_dec(((round) + 1), ra, rb, rc, rd, rna, rnb, rnc, rnd); \
+ do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key);
+
+#define decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key);
+
+#define set_last_round_rmask(_, __) \
+ mov RMASK, #0xff;
+
+#define lastdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ add RTAB, RTAB, #(4 * 256); \
+ do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \
+ addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy);
+
+.globl _gcry_aes_arm_decrypt_block
+ELF(.type _gcry_aes_arm_decrypt_block,%function;)
+
+_gcry_aes_arm_decrypt_block:
+ /* input:
+ * %x0: keysched, CTX
+ * %x1: dst
+ * %x2: src
+	 *	%w3: number of rounds (10, 12 or 14)
+ * %x4: decryption table
+ */
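+	/* The calling convention mirrors _gcry_aes_arm_encrypt_block above,
+	 * with %x4 pointing at the decryption table instead; x0 is likewise
+	 * set to 0 on return.  */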
+ CFI_STARTPROC();
+
+ /* read input block */
+
+ /* aligned load */
+ ldp RA, RB, [RSRC];
+ ldp RC, RD, [RSRC, #8];
+#ifndef __AARCH64EL__
+ rev RA, RA;
+ rev RB, RB;
+ rev RC, RC;
+ rev RD, RD;
+#endif
+
+ mov RMASK, #(0xff << 2);
+
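+	/* 10 rounds (AES-128) falls through and starts at round key 9; the
+	 * 12- and 14-round cases start higher up the key schedule at
+	 * .Ldec_192/.Ldec_256 and join the common tail at .Ldec_tail.  */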
+ cmp NROUNDS, #12;
+ bge .Ldec_256;
+
+ firstdecround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+.Ldec_tail:
+ decround(8, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(7, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ decround(6, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(5, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ decround(4, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(3, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ decround(2, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(1, RA, RB, RC, RD, RNA, RNB, RNC, RND, set_last_round_rmask);
+ lastdecround(0, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+ /* store output block */
+
+ /* aligned store */
+#ifndef __AARCH64EL__
+ rev RA, RA;
+ rev RB, RB;
+ rev RC, RC;
+ rev RD, RD;
+#endif
+ /* write output block */
+ stp RA, RB, [RDST];
+ stp RC, RD, [RDST, #8];
+
+ mov x0, #(0);
+ ret;
+
+.ltorg
+.Ldec_256:
+ beq .Ldec_192;
+
+ firstdecround(13, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+ decround(12, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+
+ b .Ldec_tail;
+
+.ltorg
+.Ldec_192:
+ firstdecround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+ decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+
+ b .Ldec_tail;
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_arm_decrypt_block,.-_gcry_aes_arm_decrypt_block;)
+
+#endif /*HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS*/
+#endif /*__AARCH64EL__ */