summaryrefslogtreecommitdiffstats
path: root/comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch32-ce.S
diff options
context:
space:
mode:
Diffstat (limited to 'comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch32-ce.S')
-rw-r--r--comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch32-ce.S433
1 files changed, 433 insertions, 0 deletions
diff --git a/comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch32-ce.S b/comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch32-ce.S
new file mode 100644
index 0000000000..1de66a1626
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch32-ce.S
@@ -0,0 +1,433 @@
+/* cipher-gcm-armv8-aarch32-ce.S - ARM/CE accelerated GHASH
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
+
+.syntax unified
+.arch armv8-a
+.fpu crypto-neon-fp-armv8
+.arm
+
+.text
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+/* Constants */
+
+.align 4
+gcry_gcm_reduction_constant:
+.Lrconst64:
+ .quad 0xc200000000000000
+
+
+/* Register macros */
+
+#define rhash q0
+#define rhash_l d0
+#define rhash_h d1
+
+#define rh1 q1
+#define rh1_l d2
+#define rh1_h d3
+
+#define rbuf q2
+#define rbuf_l d4
+#define rbuf_h d5
+
+#define rbuf1 q3
+#define rbuf1_l d6
+#define rbuf1_h d7
+
+#define rbuf2 q4
+#define rbuf2_l d8
+#define rbuf2_h d9
+
+#define rbuf3 q5
+#define rbuf3_l d10
+#define rbuf3_h d11
+
+#define rh2 q6
+#define rh2_l d12
+#define rh2_h d13
+
+#define rh3 q7
+#define rh3_l d14
+#define rh3_h d15
+
+#define rh4 q8
+#define rh4_l d16
+#define rh4_h d17
+
+#define rr2 q9
+#define rr2_l d18
+#define rr2_h d19
+
+#define rr3 q10
+#define rr3_l d20
+#define rr3_h d21
+
+#define rr0 q11
+#define rr0_l d22
+#define rr0_h d23
+
+#define rr1 q12
+#define rr1_l d24
+#define rr1_h d25
+
+#define rt0 q13
+#define rt0_l d26
+#define rt0_h d27
+
+#define rt1 q14
+#define rt1_l d28
+#define rt1_h d29
+
+#define rrconst q15
+#define rrconst_l d30
+#define rrconst_h d31
+
+/* GHASH macros */
+
+/* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
+ * Cryptology — CT-RSA 2015" for details.
+ */
+
+/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1)
+ * Note: 'r1' may be 'a' or 'b', 'r0' must not be either 'a' or 'b'.
+ */
+#define PMUL_128x128(r0, r1, a, b, t, interleave_op) \
+ veor t##_h, b##_l, b##_h; \
+ veor t##_l, a##_l, a##_h; \
+ vmull.p64 r0, a##_l, b##_l; \
+ vmull.p64 r1, a##_h, b##_h; \
+ vmull.p64 t, t##_h, t##_l; \
+ interleave_op; \
+ veor t, r0; \
+ veor t, r1; \
+ veor r0##_h, t##_l; \
+ veor r1##_l, t##_h;
+
+/* Input: 'aA' and 'bA', Output: 'r0A:r1A' (low 128-bits in r0A, high in r1A)
+ * Note: 'r1A' may be 'aA' or 'bA', 'r0A' must not be either 'aA' or 'bA'.
+ * Input: 'aB' and 'bB', Output: 'r0B:r1B' (low 128-bits in r0B, high in r1B)
+ * Note: 'r1B' may be 'aB' or 'bB', 'r0B' must not be either 'aB' or 'bB'.
+ */
+#define PMUL_128x128_2(r0A, r1A, aA, bA, r0B, r1B, aB, bB, tA, tB, interleave_op) \
+ veor tA##_h, bA##_l, bA##_h; \
+ veor tA##_l, aA##_l, aA##_h; \
+ veor tB##_h, bB##_l, bB##_h; \
+ veor tB##_l, aB##_l, aB##_h; \
+ vmull.p64 r0A, aA##_l, bA##_l; \
+ vmull.p64 r1A, aA##_h, bA##_h; \
+ vmull.p64 tA, tA##_h, tA##_l; \
+ vmull.p64 r0B, aB##_l, bB##_l; \
+ vmull.p64 r1B, aB##_h, bB##_h; \
+ vmull.p64 tB, tB##_h, tB##_l; \
+ interleave_op; \
+ veor tA, r0A; \
+ veor tA, r1A; \
+ veor tB, r0B; \
+ veor tB, r1B; \
+ veor r0A##_h, tA##_l; \
+ veor r1A##_l, tA##_h; \
+ veor r0B##_h, tB##_l; \
+ veor r1B##_l, tB##_h; \
+
+/* Input: 'r0:r1', Output: 'a' */
+#define REDUCTION(a, r0, r1, rconst, t, interleave_op) \
+ vmull.p64 t, r0##_l, rconst; \
+ veor r0##_h, t##_l; \
+ veor r1##_l, t##_h; \
+ interleave_op; \
+ vmull.p64 t, r0##_h, rconst; \
+ veor r1, t; \
+ veor a, r0, r1;
+
+#define _(...) __VA_ARGS__
+#define __ _()
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) veor reg, reg;
+
+
+/*
+ * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
+ * const byte *buf, size_t nblocks,
+ * void *gcm_table);
+ */
+.align 3
+.globl _gcry_ghash_armv8_ce_pmull
+.type _gcry_ghash_armv8_ce_pmull,%function;
+_gcry_ghash_armv8_ce_pmull:
+ /* input:
+ * r0: gcm_key
+ * r1: result/hash
+ * r2: buf
+ * r3: nblocks
+ * %st+0: gcm_table
+ */
+ push {r4-r6, lr}
+
+ cmp r3, #0
+ beq .Ldo_nothing
+
+ GET_DATA_POINTER(r4, .Lrconst64, lr)
+
+ vld1.64 {rhash}, [r1]
+ vld1.64 {rh1}, [r0]
+
+ vrev64.8 rhash, rhash /* byte-swap */
+ vld1.64 {rrconst_h}, [r4]
+ vext.8 rhash, rhash, rhash, #8
+
+ cmp r3, #4
+ blo .Less_than_4
+
+ /* Bulk processing of 4 blocks per loop iteration. */
+
+ ldr r5, [sp, #(4*4)];
+ add r6, r5, #32
+
+ vpush {q4-q7}
+
+ vld1.64 {rh2-rh3}, [r5]
+ vld1.64 {rh4}, [r6]
+
+ vld1.64 {rbuf-rbuf1}, [r2]!
+ sub r3, r3, #4
+ vld1.64 {rbuf2-rbuf3}, [r2]!
+
+ cmp r3, #4
+ vrev64.8 rbuf, rbuf /* byte-swap */
+ vrev64.8 rbuf1, rbuf1 /* byte-swap */
+ vrev64.8 rbuf2, rbuf2 /* byte-swap */
+ vrev64.8 rbuf3, rbuf3 /* byte-swap */
+
+ vext.8 rbuf, rbuf, rbuf, #8
+ vext.8 rbuf1, rbuf1, rbuf1, #8
+ vext.8 rbuf2, rbuf2, rbuf2, #8
+ vext.8 rbuf3, rbuf3, rbuf3, #8
+ veor rhash, rhash, rbuf /* in0 ^ hash */
+
+ blo .Lend_4
+
+.Loop_4:
+ /* (in0 ^ hash) * H⁴ => rr2:rr3 */
+ /* (in1) * H³ => rr0:rr1 */
+ PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)
+
+ vld1.64 {rbuf-rbuf1}, [r2]!
+ sub r3, r3, #4
+ veor rr0, rr0, rr2
+ veor rr1, rr1, rr3
+
+ /* (in2) * H² => rr2:rr3 */
+ /* (in3) * H¹ => rhash:rbuf3 */
+ PMUL_128x128_2(rr2, rr3, rbuf2, rh2, rhash, rbuf3, rbuf3, rh1, rt0, rt1,
+ _(vrev64.8 rbuf, rbuf))
+
+ vld1.64 {rbuf2}, [r2]!
+
+ vrev64.8 rbuf1, rbuf1
+ veor rr0, rr0, rr2
+ veor rr1, rr1, rr3
+
+ cmp r3, #4
+ vext.8 rbuf, rbuf, rbuf, #8
+ vext.8 rbuf1, rbuf1, rbuf1, #8
+
+ veor rr0, rr0, rhash
+ veor rr1, rr1, rbuf3
+
+ vld1.64 {rbuf3}, [r2]!
+
+ REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
+ _(vrev64.8 rbuf2, rbuf2;
+ vrev64.8 rbuf3, rbuf3))
+
+ vext.8 rbuf2, rbuf2, rbuf2, #8
+ vext.8 rbuf3, rbuf3, rbuf3, #8
+ veor rhash, rhash, rbuf /* in0 ^ hash */
+
+ bhs .Loop_4
+
+.Lend_4:
+ /* (in0 ^ hash) * H⁴ => rr2:rr3 */
+ /* (in1) * H³ => rr0:rr1 */
+ PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)
+
+ /* (in2) * H² => rhash:rbuf */
+ /* (in3) * H¹ => rbuf1:rbuf2 */
+ PMUL_128x128_2(rhash, rbuf, rbuf2, rh2, rbuf1, rbuf2, rbuf3, rh1, rt0, rt1,
+ _(veor rr0, rr0, rr2;
+ veor rr1, rr1, rr3))
+
+ veor rr0, rr0, rhash
+ veor rr1, rr1, rbuf
+
+ veor rr0, rr0, rbuf1
+ veor rr1, rr1, rbuf2
+
+ REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
+ _(CLEAR_REG(rr2);
+ CLEAR_REG(rr3);
+ CLEAR_REG(rbuf1);
+ CLEAR_REG(rbuf2);
+ CLEAR_REG(rbuf3);
+ CLEAR_REG(rh2);
+ CLEAR_REG(rh3);
+ CLEAR_REG(rh4)))
+
+ vpop {q4-q7}
+
+ cmp r3, #0
+ beq .Ldone
+
+.Less_than_4:
+ /* Handle remaining blocks. */
+
+ vld1.64 {rbuf}, [r2]!
+ subs r3, r3, #1
+
+ vrev64.8 rbuf, rbuf /* byte-swap */
+ vext.8 rbuf, rbuf, rbuf, #8
+
+ veor rhash, rhash, rbuf
+
+ beq .Lend
+
+.Loop:
+ vld1.64 {rbuf}, [r2]!
+ subs r3, r3, #1
+ PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(vrev64.8 rbuf, rbuf))
+ REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(vext.8 rbuf, rbuf, rbuf, #8))
+ veor rhash, rhash, rbuf
+
+ bne .Loop
+
+.Lend:
+ PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(CLEAR_REG(rbuf)))
+ REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(CLEAR_REG(rh1)))
+
+.Ldone:
+ CLEAR_REG(rr1)
+ vrev64.8 rhash, rhash /* byte-swap */
+ CLEAR_REG(rt0)
+ CLEAR_REG(rr0)
+ vext.8 rhash, rhash, rhash, #8
+ CLEAR_REG(rt1)
+ vst1.64 {rhash}, [r1]
+ CLEAR_REG(rhash)
+
+.Ldo_nothing:
+ mov r0, #0
+ pop {r4-r6, pc}
+.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;
+
+
+/*
+ * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
+ */
+.align 3
+.globl _gcry_ghash_setup_armv8_ce_pmull
+.type _gcry_ghash_setup_armv8_ce_pmull,%function;
+_gcry_ghash_setup_armv8_ce_pmull:
+ /* input:
+ * r0: gcm_key
+ * r1: gcm_table
+ */
+
+ vpush {q4-q7}
+
+ GET_DATA_POINTER(r2, .Lrconst64, r3)
+
+ vld1.64 {rrconst_h}, [r2]
+
+#define GCM_LSH_1(r_out, ia, ib, const_d, oa, ob, ma) \
+ /* H <<< 1 */ \
+ vshr.s64 ma, ib, #63; \
+ vshr.u64 oa, ib, #63; \
+ vshr.u64 ob, ia, #63; \
+ vand ma, const_d; \
+ vshl.u64 ib, ib, #1; \
+ vshl.u64 ia, ia, #1; \
+ vorr ob, ib; \
+ vorr oa, ia; \
+ veor ob, ma; \
+ vst1.64 {oa, ob}, [r_out]
+
+ vld1.64 {rhash}, [r0]
+ vrev64.8 rhash, rhash /* byte-swap */
+ vext.8 rhash, rhash, rhash, #8
+
+ vmov rbuf1, rhash
+ GCM_LSH_1(r0, rhash_l, rhash_h, rrconst_h, rh1_l, rh1_h, rt1_l) /* H<<<1 */
+
+ /* H² */
+ PMUL_128x128(rr0, rr1, rbuf1, rh1, rt0, __)
+ REDUCTION(rh2, rr0, rr1, rrconst_h, rt0, __)
+ vmov rhash, rh2
+ GCM_LSH_1(r1, rh2_l, rh2_h, rrconst_h, rbuf1_l, rbuf1_h, rt1_l) /* H²<<<1 */
+ add r1, r1, #16
+
+ /* H³ */
+ PMUL_128x128(rr0, rr1, rhash, rh1, rt1, __)
+ REDUCTION(rh3, rr0, rr1, rrconst_h, rt1, __)
+
+ /* H⁴ */
+ PMUL_128x128(rr0, rr1, rhash, rbuf1, rt0, __)
+ REDUCTION(rh4, rr0, rr1, rrconst_h, rt0, __)
+
+ GCM_LSH_1(r1, rh3_l, rh3_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H³<<<1 */
+ add r1, r1, #16
+ GCM_LSH_1(r1, rh4_l, rh4_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H⁴<<<1 */
+
+ CLEAR_REG(rt0)
+ CLEAR_REG(rt1)
+ CLEAR_REG(rr1)
+ CLEAR_REG(rr0)
+ CLEAR_REG(rh1)
+ CLEAR_REG(rh2)
+ CLEAR_REG(rh3)
+ CLEAR_REG(rh4)
+ CLEAR_REG(rhash)
+ CLEAR_REG(rbuf1)
+ CLEAR_REG(rrconst)
+ vpop {q4-q7}
+ bx lr
+.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;
+
+#endif