| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 01:02:30 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 01:02:30 +0000 |
| commit | 76cb841cb886eef6b3bee341a2266c76578724ad (patch) | |
| tree | f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /arch/arm64/crypto/ghash-ce-core.S | |
| parent | Initial commit. (diff) | |
Adding upstream version 4.19.249.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/arm64/crypto/ghash-ce-core.S')
-rw-r--r-- | arch/arm64/crypto/ghash-ce-core.S | 578 |
1 file changed, 578 insertions, 0 deletions
```diff
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
new file mode 100644
index 000000000..1b319b716
--- /dev/null
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -0,0 +1,578 @@
+/*
+ * Accelerated GHASH implementation with ARMv8 PMULL instructions.
+ *
+ * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	SHASH	.req	v0
+	SHASH2	.req	v1
+	T1	.req	v2
+	T2	.req	v3
+	MASK	.req	v4
+	XL	.req	v5
+	XM	.req	v6
+	XH	.req	v7
+	IN1	.req	v7
+
+	k00_16	.req	v8
+	k32_48	.req	v9
+
+	t3	.req	v10
+	t4	.req	v11
+	t5	.req	v12
+	t6	.req	v13
+	t7	.req	v14
+	t8	.req	v15
+	t9	.req	v16
+
+	perm1	.req	v17
+	perm2	.req	v18
+	perm3	.req	v19
+
+	sh1	.req	v20
+	sh2	.req	v21
+	sh3	.req	v22
+	sh4	.req	v23
+
+	ss1	.req	v24
+	ss2	.req	v25
+	ss3	.req	v26
+	ss4	.req	v27
+
+	XL2	.req	v8
+	XM2	.req	v9
+	XH2	.req	v10
+	XL3	.req	v11
+	XM3	.req	v12
+	XH3	.req	v13
+	TT3	.req	v14
+	TT4	.req	v15
+	HH	.req	v16
+	HH3	.req	v17
+	HH4	.req	v18
+	HH34	.req	v19
+
+	.text
+	.arch	armv8-a+crypto
+
+	.macro	__pmull_p64, rd, rn, rm
+	pmull	\rd\().1q, \rn\().1d, \rm\().1d
+	.endm
+
+	.macro	__pmull2_p64, rd, rn, rm
+	pmull2	\rd\().1q, \rn\().2d, \rm\().2d
+	.endm
+
+	.macro	__pmull_p8, rq, ad, bd
+	ext	t3.8b, \ad\().8b, \ad\().8b, #1		// A1
+	ext	t5.8b, \ad\().8b, \ad\().8b, #2		// A2
+	ext	t7.8b, \ad\().8b, \ad\().8b, #3		// A3
+
+	__pmull_p8_\bd	\rq, \ad
+	.endm
+
+	.macro	__pmull2_p8, rq, ad, bd
+	tbl	t3.16b, {\ad\().16b}, perm1.16b		// A1
+	tbl	t5.16b, {\ad\().16b}, perm2.16b		// A2
+	tbl	t7.16b, {\ad\().16b}, perm3.16b		// A3
+
+	__pmull2_p8_\bd	\rq, \ad
+	.endm
+
+	.macro	__pmull_p8_SHASH, rq, ad
+	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
+	.endm
+
+	.macro	__pmull_p8_SHASH2, rq, ad
+	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
+	.endm
+
+	.macro	__pmull2_p8_SHASH, rq, ad
+	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
+	.endm
+
+	.macro	__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
+	pmull\t	t3.8h, t3.\nb, \bd			// F = A1*B
+	pmull\t	t4.8h, \ad, \b1\().\nb			// E = A*B1
+	pmull\t	t5.8h, t5.\nb, \bd			// H = A2*B
+	pmull\t	t6.8h, \ad, \b2\().\nb			// G = A*B2
+	pmull\t	t7.8h, t7.\nb, \bd			// J = A3*B
+	pmull\t	t8.8h, \ad, \b3\().\nb			// I = A*B3
+	pmull\t	t9.8h, \ad, \b4\().\nb			// K = A*B4
+	pmull\t	\rq\().8h, \ad, \bd			// D = A*B
+
+	eor	t3.16b, t3.16b, t4.16b			// L = E + F
+	eor	t5.16b, t5.16b, t6.16b			// M = G + H
+	eor	t7.16b, t7.16b, t8.16b			// N = I + J
+
+	uzp1	t4.2d, t3.2d, t5.2d
+	uzp2	t3.2d, t3.2d, t5.2d
+	uzp1	t6.2d, t7.2d, t9.2d
+	uzp2	t7.2d, t7.2d, t9.2d
+
+	// t3 = (L) (P0 + P1) << 8
+	// t5 = (M) (P2 + P3) << 16
+	eor	t4.16b, t4.16b, t3.16b
+	and	t3.16b, t3.16b, k32_48.16b
+
+	// t7 = (N) (P4 + P5) << 24
+	// t9 = (K) (P6 + P7) << 32
+	eor	t6.16b, t6.16b, t7.16b
+	and	t7.16b, t7.16b, k00_16.16b
+
+	eor	t4.16b, t4.16b, t3.16b
+	eor	t6.16b, t6.16b, t7.16b
+
+	zip2	t5.2d, t4.2d, t3.2d
+	zip1	t3.2d, t4.2d, t3.2d
+	zip2	t9.2d, t6.2d, t7.2d
+	zip1	t7.2d, t6.2d, t7.2d
+
+	ext	t3.16b, t3.16b, t3.16b, #15
+	ext	t5.16b, t5.16b, t5.16b, #14
+	ext	t7.16b, t7.16b, t7.16b, #13
+	ext	t9.16b, t9.16b, t9.16b, #12
+
+	eor	t3.16b, t3.16b, t5.16b
+	eor	t7.16b, t7.16b, t9.16b
+	eor	\rq\().16b, \rq\().16b, t3.16b
+	eor	\rq\().16b, \rq\().16b, t7.16b
+	.endm
+
+	.macro	__pmull_pre_p64
+	add	x8, x3, #16
+	ld1	{HH.2d-HH4.2d}, [x8]
+
+	trn1	SHASH2.2d, SHASH.2d, HH.2d
+	trn2	T1.2d, SHASH.2d, HH.2d
+	eor	SHASH2.16b, SHASH2.16b, T1.16b
+
+	trn1	HH34.2d, HH3.2d, HH4.2d
+	trn2	T1.2d, HH3.2d, HH4.2d
+	eor	HH34.16b, HH34.16b, T1.16b
+
+	movi	MASK.16b, #0xe1
+	shl	MASK.2d, MASK.2d, #57
+	.endm
+
+	.macro	__pmull_pre_p8
+	ext	SHASH2.16b, SHASH.16b, SHASH.16b, #8
+	eor	SHASH2.16b, SHASH2.16b, SHASH.16b
+
+	// k00_16 := 0x0000000000000000_000000000000ffff
+	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
+	movi	k32_48.2d, #0xffffffff
+	mov	k32_48.h[2], k32_48.h[0]
+	ushr	k00_16.2d, k32_48.2d, #32
+
+	// prepare the permutation vectors
+	mov_q	x5, 0x080f0e0d0c0b0a09
+	movi	T1.8b, #8
+	dup	perm1.2d, x5
+	eor	perm1.16b, perm1.16b, T1.16b
+	ushr	perm2.2d, perm1.2d, #8
+	ushr	perm3.2d, perm1.2d, #16
+	ushr	T1.2d, perm1.2d, #24
+	sli	perm2.2d, perm1.2d, #56
+	sli	perm3.2d, perm1.2d, #48
+	sli	T1.2d, perm1.2d, #40
+
+	// precompute loop invariants
+	tbl	sh1.16b, {SHASH.16b}, perm1.16b
+	tbl	sh2.16b, {SHASH.16b}, perm2.16b
+	tbl	sh3.16b, {SHASH.16b}, perm3.16b
+	tbl	sh4.16b, {SHASH.16b}, T1.16b
+	ext	ss1.8b, SHASH2.8b, SHASH2.8b, #1
+	ext	ss2.8b, SHASH2.8b, SHASH2.8b, #2
+	ext	ss3.8b, SHASH2.8b, SHASH2.8b, #3
+	ext	ss4.8b, SHASH2.8b, SHASH2.8b, #4
+	.endm
+
+	//
+	// PMULL (64x64->128) based reduction for CPUs that can do
+	// it in a single instruction.
+	//
+	.macro	__pmull_reduce_p64
+	pmull	T2.1q, XL.1d, MASK.1d
+	eor	XM.16b, XM.16b, T1.16b
+
+	mov	XH.d[0], XM.d[1]
+	mov	XM.d[1], XL.d[0]
+
+	eor	XL.16b, XM.16b, T2.16b
+	ext	T2.16b, XL.16b, XL.16b, #8
+	pmull	XL.1q, XL.1d, MASK.1d
+	.endm
+
+	//
+	// Alternative reduction for CPUs that lack support for the
+	// 64x64->128 PMULL instruction
+	//
+	.macro	__pmull_reduce_p8
+	eor	XM.16b, XM.16b, T1.16b
+
+	mov	XL.d[1], XM.d[0]
+	mov	XH.d[0], XM.d[1]
+
+	shl	T1.2d, XL.2d, #57
+	shl	T2.2d, XL.2d, #62
+	eor	T2.16b, T2.16b, T1.16b
+	shl	T1.2d, XL.2d, #63
+	eor	T2.16b, T2.16b, T1.16b
+	ext	T1.16b, XL.16b, XH.16b, #8
+	eor	T2.16b, T2.16b, T1.16b
+
+	mov	XL.d[1], T2.d[0]
+	mov	XH.d[0], T2.d[1]
+
+	ushr	T2.2d, XL.2d, #1
+	eor	XH.16b, XH.16b, XL.16b
+	eor	XL.16b, XL.16b, T2.16b
+	ushr	T2.2d, T2.2d, #6
+	ushr	XL.2d, XL.2d, #1
+	.endm
+
+	.macro	__pmull_ghash, pn
+	ld1	{SHASH.2d}, [x3]
+	ld1	{XL.2d}, [x1]
+
+	__pmull_pre_\pn
+
+	/* do the head block first, if supplied */
+	cbz	x4, 0f
+	ld1	{T1.2d}, [x4]
+	mov	x4, xzr
+	b	3f
+
+0:	.ifc	\pn, p64
+	tbnz	w0, #0, 2f		// skip until #blocks is a
+	tbnz	w0, #1, 2f		// round multiple of 4
+
+1:	ld1	{XM3.16b-TT4.16b}, [x2], #64
+
+	sub	w0, w0, #4
+
+	rev64	T1.16b, XM3.16b
+	rev64	T2.16b, XH3.16b
+	rev64	TT4.16b, TT4.16b
+	rev64	TT3.16b, TT3.16b
+
+	ext	IN1.16b, TT4.16b, TT4.16b, #8
+	ext	XL3.16b, TT3.16b, TT3.16b, #8
+
+	eor	TT4.16b, TT4.16b, IN1.16b
+	pmull2	XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
+	pmull	XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
+	pmull	XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
+
+	eor	TT3.16b, TT3.16b, XL3.16b
+	pmull2	XH3.1q, HH.2d, XL3.2d		// a1 * b1
+	pmull	XL3.1q, HH.1d, XL3.1d		// a0 * b0
+	pmull2	XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
+
+	ext	IN1.16b, T2.16b, T2.16b, #8
+	eor	XL2.16b, XL2.16b, XL3.16b
+	eor	XH2.16b, XH2.16b, XH3.16b
+	eor	XM2.16b, XM2.16b, XM3.16b
+
+	eor	T2.16b, T2.16b, IN1.16b
+	pmull2	XH3.1q, HH3.2d, IN1.2d		// a1 * b1
+	pmull	XL3.1q, HH3.1d, IN1.1d		// a0 * b0
+	pmull	XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
+
+	eor	XL2.16b, XL2.16b, XL3.16b
+	eor	XH2.16b, XH2.16b, XH3.16b
+	eor	XM2.16b, XM2.16b, XM3.16b
+
+	ext	IN1.16b, T1.16b, T1.16b, #8
+	ext	TT3.16b, XL.16b, XL.16b, #8
+	eor	XL.16b, XL.16b, IN1.16b
+	eor	T1.16b, T1.16b, TT3.16b
+
+	pmull2	XH.1q, HH4.2d, XL.2d		// a1 * b1
+	eor	T1.16b, T1.16b, XL.16b
+	pmull	XL.1q, HH4.1d, XL.1d		// a0 * b0
+	pmull2	XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
+
+	eor	XL.16b, XL.16b, XL2.16b
+	eor	XH.16b, XH.16b, XH2.16b
+	eor	XM.16b, XM.16b, XM2.16b
+
+	eor	T2.16b, XL.16b, XH.16b
+	ext	T1.16b, XL.16b, XH.16b, #8
+	eor	XM.16b, XM.16b, T2.16b
+
+	__pmull_reduce_p64
+
+	eor	T2.16b, T2.16b, XH.16b
+	eor	XL.16b, XL.16b, T2.16b
+
+	cbz	w0, 5f
+	b	1b
+	.endif
+
+2:	ld1	{T1.2d}, [x2], #16
+	sub	w0, w0, #1
+
+3:	/* multiply XL by SHASH in GF(2^128) */
+CPU_LE(	rev64	T1.16b, T1.16b	)
+
+	ext	T2.16b, XL.16b, XL.16b, #8
+	ext	IN1.16b, T1.16b, T1.16b, #8
+	eor	T1.16b, T1.16b, T2.16b
+	eor	XL.16b, XL.16b, IN1.16b
+
+	__pmull2_\pn	XH, XL, SHASH		// a1 * b1
+	eor	T1.16b, T1.16b, XL.16b
+	__pmull_\pn	XL, XL, SHASH		// a0 * b0
+	__pmull_\pn	XM, T1, SHASH2		// (a1 + a0)(b1 + b0)
+
+4:	eor	T2.16b, XL.16b, XH.16b
+	ext	T1.16b, XL.16b, XH.16b, #8
+	eor	XM.16b, XM.16b, T2.16b
+
+	__pmull_reduce_\pn
+
+	eor	T2.16b, T2.16b, XH.16b
+	eor	XL.16b, XL.16b, T2.16b
+
+	cbnz	w0, 0b
+
+5:	st1	{XL.2d}, [x1]
+	ret
+	.endm
+
+	/*
+	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
+	 *                         struct ghash_key const *k, const char *head)
+	 */
+ENTRY(pmull_ghash_update_p64)
+	__pmull_ghash	p64
+ENDPROC(pmull_ghash_update_p64)
+
+ENTRY(pmull_ghash_update_p8)
+	__pmull_ghash	p8
+ENDPROC(pmull_ghash_update_p8)
+
+	KS0	.req	v12
+	KS1	.req	v13
+	INP0	.req	v14
+	INP1	.req	v15
+
+	.macro	load_round_keys, rounds, rk
+	cmp	\rounds, #12
+	blo	2222f		/* 128 bits */
+	beq	1111f		/* 192 bits */
+	ld1	{v17.4s-v18.4s}, [\rk], #32
+1111:	ld1	{v19.4s-v20.4s}, [\rk], #32
+2222:	ld1	{v21.4s-v24.4s}, [\rk], #64
+	ld1	{v25.4s-v28.4s}, [\rk], #64
+	ld1	{v29.4s-v31.4s}, [\rk]
+	.endm
+
+	.macro	enc_round, state, key
+	aese	\state\().16b, \key\().16b
+	aesmc	\state\().16b, \state\().16b
+	.endm
+
+	.macro	enc_block, state, rounds
+	cmp	\rounds, #12
+	b.lo	2222f		/* 128 bits */
+	b.eq	1111f		/* 192 bits */
+	enc_round	\state, v17
+	enc_round	\state, v18
+1111:	enc_round	\state, v19
+	enc_round	\state, v20
+2222:	.irp	key, v21, v22, v23, v24, v25, v26, v27, v28, v29
+	enc_round	\state, \key
+	.endr
+	aese	\state\().16b, v30.16b
+	eor	\state\().16b, \state\().16b, v31.16b
+	.endm
+
+	.macro	pmull_gcm_do_crypt, enc
+	ld1	{SHASH.2d}, [x4], #16
+	ld1	{HH.2d}, [x4]
+	ld1	{XL.2d}, [x1]
+	ldr	x8, [x5, #8]			// load lower counter
+
+	movi	MASK.16b, #0xe1
+	trn1	SHASH2.2d, SHASH.2d, HH.2d
+	trn2	T1.2d, SHASH.2d, HH.2d
+CPU_LE(	rev	x8, x8	)
+	shl	MASK.2d, MASK.2d, #57
+	eor	SHASH2.16b, SHASH2.16b, T1.16b
+
+	.if	\enc == 1
+	ldr	x10, [sp]
+	ld1	{KS0.16b-KS1.16b}, [x10]
+	.endif
+
+	cbnz	x6, 4f
+
+0:	ld1	{INP0.16b-INP1.16b}, [x3], #32
+
+	rev	x9, x8
+	add	x11, x8, #1
+	add	x8, x8, #2
+
+	.if	\enc == 1
+	eor	INP0.16b, INP0.16b, KS0.16b	// encrypt input
+	eor	INP1.16b, INP1.16b, KS1.16b
+	.endif
+
+	ld1	{KS0.8b}, [x5]			// load upper counter
+	rev	x11, x11
+	sub	w0, w0, #2
+	mov	KS1.8b, KS0.8b
+	ins	KS0.d[1], x9			// set lower counter
+	ins	KS1.d[1], x11
+
+	rev64	T1.16b, INP1.16b
+
+	cmp	w7, #12
+	b.ge	2f				// AES-192/256?
+
+1:	enc_round	KS0, v21
+	ext	IN1.16b, T1.16b, T1.16b, #8
+
+	enc_round	KS1, v21
+	pmull2	XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
+
+	enc_round	KS0, v22
+	eor	T1.16b, T1.16b, IN1.16b
+
+	enc_round	KS1, v22
+	pmull	XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
+
+	enc_round	KS0, v23
+	pmull	XM2.1q, SHASH2.1d, T1.1d	// (a1 + a0)(b1 + b0)
+
+	enc_round	KS1, v23
+	rev64	T1.16b, INP0.16b
+	ext	T2.16b, XL.16b, XL.16b, #8
+
+	enc_round	KS0, v24
+	ext	IN1.16b, T1.16b, T1.16b, #8
+	eor	T1.16b, T1.16b, T2.16b
+
+	enc_round	KS1, v24
+	eor	XL.16b, XL.16b, IN1.16b
+
+	enc_round	KS0, v25
+	eor	T1.16b, T1.16b, XL.16b
+
+	enc_round	KS1, v25
+	pmull2	XH.1q, HH.2d, XL.2d		// a1 * b1
+
+	enc_round	KS0, v26
+	pmull	XL.1q, HH.1d, XL.1d		// a0 * b0
+
+	enc_round	KS1, v26
+	pmull2	XM.1q, SHASH2.2d, T1.2d		// (a1 + a0)(b1 + b0)
+
+	enc_round	KS0, v27
+	eor	XL.16b, XL.16b, XL2.16b
+	eor	XH.16b, XH.16b, XH2.16b
+
+	enc_round	KS1, v27
+	eor	XM.16b, XM.16b, XM2.16b
+	ext	T1.16b, XL.16b, XH.16b, #8
+
+	enc_round	KS0, v28
+	eor	T2.16b, XL.16b, XH.16b
+	eor	XM.16b, XM.16b, T1.16b
+
+	enc_round	KS1, v28
+	eor	XM.16b, XM.16b, T2.16b
+
+	enc_round	KS0, v29
+	pmull	T2.1q, XL.1d, MASK.1d
+
+	enc_round	KS1, v29
+	mov	XH.d[0], XM.d[1]
+	mov	XM.d[1], XL.d[0]
+
+	aese	KS0.16b, v30.16b
+	eor	XL.16b, XM.16b, T2.16b
+
+	aese	KS1.16b, v30.16b
+	ext	T2.16b, XL.16b, XL.16b, #8
+
+	eor	KS0.16b, KS0.16b, v31.16b
+	pmull	XL.1q, XL.1d, MASK.1d
+	eor	T2.16b, T2.16b, XH.16b
+
+	eor	KS1.16b, KS1.16b, v31.16b
+	eor	XL.16b, XL.16b, T2.16b
+
+	.if	\enc == 0
+	eor	INP0.16b, INP0.16b, KS0.16b
+	eor	INP1.16b, INP1.16b, KS1.16b
+	.endif
+
+	st1	{INP0.16b-INP1.16b}, [x2], #32
+
+	cbnz	w0, 0b
+
+CPU_LE(	rev	x8, x8	)
+	st1	{XL.2d}, [x1]
+	str	x8, [x5, #8]			// store lower counter
+
+	.if	\enc == 1
+	st1	{KS0.16b-KS1.16b}, [x10]
+	.endif
+
+	ret
+
+2:	b.eq	3f				// AES-192?
+	enc_round	KS0, v17
+	enc_round	KS1, v17
+	enc_round	KS0, v18
+	enc_round	KS1, v18
+3:	enc_round	KS0, v19
+	enc_round	KS1, v19
+	enc_round	KS0, v20
+	enc_round	KS1, v20
+	b	1b
+
+4:	load_round_keys	w7, x6
+	b	0b
+	.endm
+
+	/*
+	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
+	 *                        struct ghash_key const *k, u8 ctr[],
+	 *                        int rounds, u8 ks[])
+	 */
+ENTRY(pmull_gcm_encrypt)
+	pmull_gcm_do_crypt	1
+ENDPROC(pmull_gcm_encrypt)
+
+	/*
+	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
+	 *                        struct ghash_key const *k, u8 ctr[],
+	 *                        int rounds)
+	 */
+ENTRY(pmull_gcm_decrypt)
+	pmull_gcm_do_crypt	0
+ENDPROC(pmull_gcm_decrypt)
+
+	/*
+	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
+	 */
+ENTRY(pmull_gcm_encrypt_block)
+	cbz	x2, 0f
+	load_round_keys	w3, x2
+0:	ld1	{v0.16b}, [x1]
+	enc_block	v0, w3
+	st1	{v0.16b}, [x0]
+	ret
+ENDPROC(pmull_gcm_encrypt_block)
```
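The `// a1 * b1`, `// a0 * b0` and `// (a1 + a0)(b1 + b0)` comments in the diff refer to a Karatsuba-style decomposition: a 128x128-bit carry-less multiply is assembled from three 64x64-bit PMULLs instead of four, because over GF(2) the cross terms satisfy a1*b0 + a0*b1 = (a1 + a0)(b1 + b0) + a1*b1 + a0*b0. The C program below is not part of the kernel source; it is a minimal, portable model (with a bitwise stand-in for PMULL) that demonstrates the identity the assembly relies on, before the `__pmull_reduce_*` macros fold the 256-bit product back modulo the GHASH polynomial.

```c
#include <stdint.h>
#include <stdio.h>

/*
 * Bitwise 64x64 -> 128 carry-less multiply; a portable stand-in for one
 * PMULL instruction. *hi:*lo is the 128-bit product over GF(2)[x].
 */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0, l = 0;

	for (int i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;
			h ^= i ? a >> (64 - i) : 0;
		}
	}
	*hi = h;
	*lo = l;
}

/* 128x128 -> 256 carry-less multiply using three clmul64 calls (Karatsuba). */
static void clmul128_karatsuba(const uint64_t a[2], const uint64_t b[2],
			       uint64_t p[4])
{
	uint64_t xl_h, xl_l, xh_h, xh_l, xm_h, xm_l;

	clmul64(a[0], b[0], &xl_h, &xl_l);		 /* XL = a0 * b0 */
	clmul64(a[1], b[1], &xh_h, &xh_l);		 /* XH = a1 * b1 */
	clmul64(a[0] ^ a[1], b[0] ^ b[1], &xm_h, &xm_l); /* XM = (a1+a0)(b1+b0) */

	xm_l ^= xl_l ^ xh_l;	/* middle term = XM + XL + XH = a1*b0 + a0*b1 */
	xm_h ^= xl_h ^ xh_h;

	p[0] = xl_l;		/* result = XL + (middle << 64) + (XH << 128) */
	p[1] = xl_h ^ xm_l;
	p[2] = xh_l ^ xm_h;
	p[3] = xh_h;
}

int main(void)
{
	uint64_t a[2] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
	uint64_t b[2] = { 0x0f1e2d3c4b5a6978ULL, 0x1122334455667788ULL };
	uint64_t k[4], s[4] = { 0, 0, 0, 0 };
	uint64_t hi, lo;
	int i, ok = 1;

	clmul128_karatsuba(a, b, k);

	/* schoolbook version with four partial products, for comparison */
	clmul64(a[0], b[0], &hi, &lo); s[0] ^= lo; s[1] ^= hi;
	clmul64(a[1], b[0], &hi, &lo); s[1] ^= lo; s[2] ^= hi;
	clmul64(a[0], b[1], &hi, &lo); s[1] ^= lo; s[2] ^= hi;
	clmul64(a[1], b[1], &hi, &lo); s[2] ^= lo; s[3] ^= hi;

	for (i = 0; i < 4; i++)
		ok &= (k[i] == s[i]);
	printf("Karatsuba %s schoolbook\n", ok ? "matches" : "differs from");
	return 0;
}
```

The `eor T2, XL, XH` / `eor XM, XM, T2` pairs in the assembly perform exactly this cross-term correction; the four-blocks-per-iteration path does the same thing with H, H^2, H^3 and H^4 so that four message blocks can be folded in before a single reduction.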
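For reference, the prototype quoted in the comment above maps onto the register use at the top of `__pmull_ghash`: x0 = number of 16-byte blocks, x1 = the 128-bit GHASH accumulator `dg[2]`, x2 = the message, x3 = the hash key, x4 = an optional partial "head" block. The sketch below shows how a caller might drive `pmull_ghash_update_p64`; the `struct ghash_key` layout (H at offset 0, H^2..H^4 from offset 16) is an assumption inferred from the `ld1` instructions in `__pmull_pre_p64`, not a copy of the kernel's glue code, and a real in-kernel caller would additionally bracket the call with kernel_neon_begin()/kernel_neon_end().

```c
#include <stdint.h>
#include <stddef.h>

typedef uint64_t u64;
typedef uint8_t u8;

/* Assumed layout, inferred from the loads at [x3] and [x3 + 16] above. */
struct ghash_key {
	u64 h[2];	/* H    (SHASH) */
	u64 h2[2];	/* H^2  (HH)    */
	u64 h3[2];	/* H^3  (HH3)   */
	u64 h4[2];	/* H^4  (HH4)   */
};

/* Provided by ghash-ce-core.S; links only against the assembled object. */
void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
			    struct ghash_key const *k, const char *head);

/*
 * Hypothetical caller: fold 'len' bytes (assumed to be a multiple of 16)
 * into the running GHASH state dg[2] using the PMULL fast path.
 */
static void ghash_fold(u64 dg[2], const u8 *src, size_t len,
		       const struct ghash_key *key)
{
	pmull_ghash_update_p64((int)(len / 16), dg, (const char *)src,
			       key, NULL);
}
```

The `pmull_ghash_update_p8` entry point has the same signature; glue code would typically select one of the two at probe time, depending on whether the CPU implements the full 64x64->128 form of PMULL or only the 8-bit polynomial variant emulated by `__pmull_p8`.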