Diffstat (limited to 'src/crypto/aes/gcm_arm64.s')
-rw-r--r--   src/crypto/aes/gcm_arm64.s   1021
1 file changed, 1021 insertions, 0 deletions
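The new file exports five routines whose Go prototypes are given in the // func comments above each TEXT symbol. A minimal sketch of the Go-side declarations that would back them, assuming the same //go:noescape stub style as the existing amd64 path in crypto/aes (the signatures are copied from the // func comments in the file itself):

// Go-side stubs for the assembly routines below (sketch).
// productTable holds the precomputed powers of the hash key H together with
// their Karatsuba half-sums; ks is the expanded AES key schedule, whose
// length tells the assembly how many rounds to run.

//go:noescape
func gcmAesInit(productTable *[256]byte, ks []uint32)

//go:noescape
func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)

//go:noescape
func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)

//go:noescape
func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)

//go:noescape
func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)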
diff --git a/src/crypto/aes/gcm_arm64.s b/src/crypto/aes/gcm_arm64.s new file mode 100644 index 0000000..61c868c --- /dev/null +++ b/src/crypto/aes/gcm_arm64.s @@ -0,0 +1,1021 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +#define B0 V0 +#define B1 V1 +#define B2 V2 +#define B3 V3 +#define B4 V4 +#define B5 V5 +#define B6 V6 +#define B7 V7 + +#define ACC0 V8 +#define ACC1 V9 +#define ACCM V10 + +#define T0 V11 +#define T1 V12 +#define T2 V13 +#define T3 V14 + +#define POLY V15 +#define ZERO V16 +#define INC V17 +#define CTR V18 + +#define K0 V19 +#define K1 V20 +#define K2 V21 +#define K3 V22 +#define K4 V23 +#define K5 V24 +#define K6 V25 +#define K7 V26 +#define K8 V27 +#define K9 V28 +#define K10 V29 +#define K11 V30 +#define KLAST V31 + +#define reduce() \ + VEOR ACC0.B16, ACCM.B16, ACCM.B16 \ + VEOR ACC1.B16, ACCM.B16, ACCM.B16 \ + VEXT $8, ZERO.B16, ACCM.B16, T0.B16 \ + VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \ + VEOR ACCM.B16, ACC0.B16, ACC0.B16 \ + VEOR T0.B16, ACC1.B16, ACC1.B16 \ + VPMULL POLY.D1, ACC0.D1, T0.Q1 \ + VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \ + VEOR T0.B16, ACC0.B16, ACC0.B16 \ + VPMULL POLY.D1, ACC0.D1, T0.Q1 \ + VEOR T0.B16, ACC1.B16, ACC1.B16 \ + VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \ + VEOR ACC1.B16, ACC0.B16, ACC0.B16 \ + +// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64) +TEXT ·gcmAesFinish(SB),NOSPLIT,$0 +#define pTbl R0 +#define tMsk R1 +#define tPtr R2 +#define plen R3 +#define dlen R4 + + MOVD $0xC2, R1 + LSL $56, R1 + MOVD $1, R0 + VMOV R1, POLY.D[0] + VMOV R0, POLY.D[1] + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + + MOVD productTable+0(FP), pTbl + MOVD tagMask+8(FP), tMsk + MOVD T+16(FP), tPtr + MOVD pLen+24(FP), plen + MOVD dLen+32(FP), dlen + + VLD1 (tPtr), [ACC0.B16] + VLD1 (tMsk), [B1.B16] + + LSL $3, plen + LSL $3, dlen + + VMOV dlen, B0.D[0] + VMOV plen, B0.D[1] + + ADD $14*16, pTbl + VLD1.P (pTbl), [T1.B16, T2.B16] + + VEOR ACC0.B16, B0.B16, B0.B16 + + VEXT $8, B0.B16, B0.B16, T0.B16 + VEOR B0.B16, T0.B16, T0.B16 + VPMULL B0.D1, T1.D1, ACC1.Q1 + VPMULL2 B0.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + + reduce() + + VREV64 ACC0.B16, ACC0.B16 + VEOR B1.B16, ACC0.B16, ACC0.B16 + + VST1 [ACC0.B16], (tPtr) + RET +#undef pTbl +#undef tMsk +#undef tPtr +#undef plen +#undef dlen + +// func gcmAesInit(productTable *[256]byte, ks []uint32) +TEXT ·gcmAesInit(SB),NOSPLIT,$0 +#define pTbl R0 +#define KS R1 +#define NR R2 +#define I R3 + MOVD productTable+0(FP), pTbl + MOVD ks_base+8(FP), KS + MOVD ks_len+16(FP), NR + + MOVD $0xC2, I + LSL $56, I + VMOV I, POLY.D[0] + MOVD $1, I + VMOV I, POLY.D[1] + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + + // Encrypt block 0 with the AES key to generate the hash key H + VLD1.P 64(KS), [T0.B16, T1.B16, T2.B16, T3.B16] + VEOR B0.B16, B0.B16, B0.B16 + AESE T0.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE T1.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE T2.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE T3.B16, B0.B16 + AESMC B0.B16, B0.B16 + VLD1.P 64(KS), [T0.B16, T1.B16, T2.B16, T3.B16] + AESE T0.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE T1.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE T2.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE T3.B16, B0.B16 + AESMC B0.B16, B0.B16 + TBZ $4, NR, initEncFinish + VLD1.P 32(KS), [T0.B16, T1.B16] + AESE T0.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE T1.B16, B0.B16 + AESMC B0.B16, B0.B16 + TBZ $3, NR, initEncFinish + VLD1.P 32(KS), [T0.B16, 
T1.B16] + AESE T0.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE T1.B16, B0.B16 + AESMC B0.B16, B0.B16 +initEncFinish: + VLD1 (KS), [T0.B16, T1.B16, T2.B16] + AESE T0.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE T1.B16, B0.B16 + VEOR T2.B16, B0.B16, B0.B16 + + VREV64 B0.B16, B0.B16 + + // Multiply by 2 modulo P + VMOV B0.D[0], I + ASR $63, I + VMOV I, T1.D[0] + VMOV I, T1.D[1] + VAND POLY.B16, T1.B16, T1.B16 + VUSHR $63, B0.D2, T2.D2 + VEXT $8, ZERO.B16, T2.B16, T2.B16 + VSHL $1, B0.D2, B0.D2 + VEOR T1.B16, B0.B16, B0.B16 + VEOR T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available + + // Karatsuba pre-computation + VEXT $8, B0.B16, B0.B16, B1.B16 + VEOR B0.B16, B1.B16, B1.B16 + + ADD $14*16, pTbl + VST1 [B0.B16, B1.B16], (pTbl) + SUB $2*16, pTbl + + VMOV B0.B16, B2.B16 + VMOV B1.B16, B3.B16 + + MOVD $7, I + +initLoop: + // Compute powers of H + SUBS $1, I + + VPMULL B0.D1, B2.D1, T1.Q1 + VPMULL2 B0.D2, B2.D2, T0.Q1 + VPMULL B1.D1, B3.D1, T2.Q1 + VEOR T0.B16, T2.B16, T2.B16 + VEOR T1.B16, T2.B16, T2.B16 + VEXT $8, ZERO.B16, T2.B16, T3.B16 + VEXT $8, T2.B16, ZERO.B16, T2.B16 + VEOR T2.B16, T0.B16, T0.B16 + VEOR T3.B16, T1.B16, T1.B16 + VPMULL POLY.D1, T0.D1, T2.Q1 + VEXT $8, T0.B16, T0.B16, T0.B16 + VEOR T2.B16, T0.B16, T0.B16 + VPMULL POLY.D1, T0.D1, T2.Q1 + VEXT $8, T0.B16, T0.B16, T0.B16 + VEOR T2.B16, T0.B16, T0.B16 + VEOR T1.B16, T0.B16, B2.B16 + VMOV B2.B16, B3.B16 + VEXT $8, B2.B16, B2.B16, B2.B16 + VEOR B2.B16, B3.B16, B3.B16 + + VST1 [B2.B16, B3.B16], (pTbl) + SUB $2*16, pTbl + + BNE initLoop + RET +#undef I +#undef NR +#undef KS +#undef pTbl + +// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte) +TEXT ·gcmAesData(SB),NOSPLIT,$0 +#define pTbl R0 +#define aut R1 +#define tPtr R2 +#define autLen R3 +#define H0 R4 +#define pTblSave R5 + +#define mulRound(X) \ + VLD1.P 32(pTbl), [T1.B16, T2.B16] \ + VREV64 X.B16, X.B16 \ + VEXT $8, X.B16, X.B16, T0.B16 \ + VEOR X.B16, T0.B16, T0.B16 \ + VPMULL X.D1, T1.D1, T3.Q1 \ + VEOR T3.B16, ACC1.B16, ACC1.B16 \ + VPMULL2 X.D2, T1.D2, T3.Q1 \ + VEOR T3.B16, ACC0.B16, ACC0.B16 \ + VPMULL T0.D1, T2.D1, T3.Q1 \ + VEOR T3.B16, ACCM.B16, ACCM.B16 + + MOVD productTable+0(FP), pTbl + MOVD data_base+8(FP), aut + MOVD data_len+16(FP), autLen + MOVD T+32(FP), tPtr + + VEOR ACC0.B16, ACC0.B16, ACC0.B16 + CBZ autLen, dataBail + + MOVD $0xC2, H0 + LSL $56, H0 + VMOV H0, POLY.D[0] + MOVD $1, H0 + VMOV H0, POLY.D[1] + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + MOVD pTbl, pTblSave + + CMP $13, autLen + BEQ dataTLS + CMP $128, autLen + BLT startSinglesLoop + B octetsLoop + +dataTLS: + ADD $14*16, pTbl + VLD1.P (pTbl), [T1.B16, T2.B16] + VEOR B0.B16, B0.B16, B0.B16 + + MOVD (aut), H0 + VMOV H0, B0.D[0] + MOVW 8(aut), H0 + VMOV H0, B0.S[2] + MOVB 12(aut), H0 + VMOV H0, B0.B[12] + + MOVD $0, autLen + B dataMul + +octetsLoop: + CMP $128, autLen + BLT startSinglesLoop + SUB $128, autLen + + VLD1.P 32(aut), [B0.B16, B1.B16] + + VLD1.P 32(pTbl), [T1.B16, T2.B16] + VREV64 B0.B16, B0.B16 + VEOR ACC0.B16, B0.B16, B0.B16 + VEXT $8, B0.B16, B0.B16, T0.B16 + VEOR B0.B16, T0.B16, T0.B16 + VPMULL B0.D1, T1.D1, ACC1.Q1 + VPMULL2 B0.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + + mulRound(B1) + VLD1.P 32(aut), [B2.B16, B3.B16] + mulRound(B2) + mulRound(B3) + VLD1.P 32(aut), [B4.B16, B5.B16] + mulRound(B4) + mulRound(B5) + VLD1.P 32(aut), [B6.B16, B7.B16] + mulRound(B6) + mulRound(B7) + + MOVD pTblSave, pTbl + reduce() + B octetsLoop + +startSinglesLoop: + + ADD $14*16, pTbl + VLD1.P (pTbl), [T1.B16, T2.B16] + +singlesLoop: + + CMP $16, autLen + BLT dataEnd + SUB 
$16, autLen + + VLD1.P 16(aut), [B0.B16] +dataMul: + VREV64 B0.B16, B0.B16 + VEOR ACC0.B16, B0.B16, B0.B16 + + VEXT $8, B0.B16, B0.B16, T0.B16 + VEOR B0.B16, T0.B16, T0.B16 + VPMULL B0.D1, T1.D1, ACC1.Q1 + VPMULL2 B0.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + + reduce() + + B singlesLoop + +dataEnd: + + CBZ autLen, dataBail + VEOR B0.B16, B0.B16, B0.B16 + ADD autLen, aut + +dataLoadLoop: + MOVB.W -1(aut), H0 + VEXT $15, B0.B16, ZERO.B16, B0.B16 + VMOV H0, B0.B[0] + SUBS $1, autLen + BNE dataLoadLoop + B dataMul + +dataBail: + VST1 [ACC0.B16], (tPtr) + RET + +#undef pTbl +#undef aut +#undef tPtr +#undef autLen +#undef H0 +#undef pTblSave + +// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32) +TEXT ·gcmAesEnc(SB),NOSPLIT,$0 +#define pTbl R0 +#define dstPtr R1 +#define ctrPtr R2 +#define srcPtr R3 +#define ks R4 +#define tPtr R5 +#define srcPtrLen R6 +#define aluCTR R7 +#define aluTMP R8 +#define aluK R9 +#define NR R10 +#define H0 R11 +#define H1 R12 +#define curK R13 +#define pTblSave R14 + +#define aesrndx8(K) \ + AESE K.B16, B0.B16 \ + AESMC B0.B16, B0.B16 \ + AESE K.B16, B1.B16 \ + AESMC B1.B16, B1.B16 \ + AESE K.B16, B2.B16 \ + AESMC B2.B16, B2.B16 \ + AESE K.B16, B3.B16 \ + AESMC B3.B16, B3.B16 \ + AESE K.B16, B4.B16 \ + AESMC B4.B16, B4.B16 \ + AESE K.B16, B5.B16 \ + AESMC B5.B16, B5.B16 \ + AESE K.B16, B6.B16 \ + AESMC B6.B16, B6.B16 \ + AESE K.B16, B7.B16 \ + AESMC B7.B16, B7.B16 + +#define aesrndlastx8(K) \ + AESE K.B16, B0.B16 \ + AESE K.B16, B1.B16 \ + AESE K.B16, B2.B16 \ + AESE K.B16, B3.B16 \ + AESE K.B16, B4.B16 \ + AESE K.B16, B5.B16 \ + AESE K.B16, B6.B16 \ + AESE K.B16, B7.B16 + + MOVD productTable+0(FP), pTbl + MOVD dst+8(FP), dstPtr + MOVD src_base+32(FP), srcPtr + MOVD src_len+40(FP), srcPtrLen + MOVD ctr+56(FP), ctrPtr + MOVD T+64(FP), tPtr + MOVD ks_base+72(FP), ks + MOVD ks_len+80(FP), NR + + MOVD $0xC2, H1 + LSL $56, H1 + MOVD $1, H0 + VMOV H1, POLY.D[0] + VMOV H0, POLY.D[1] + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + // Compute NR from len(ks) + MOVD pTbl, pTblSave + // Current tag, after AAD + VLD1 (tPtr), [ACC0.B16] + VEOR ACC1.B16, ACC1.B16, ACC1.B16 + VEOR ACCM.B16, ACCM.B16, ACCM.B16 + // Prepare initial counter, and the increment vector + VLD1 (ctrPtr), [CTR.B16] + VEOR INC.B16, INC.B16, INC.B16 + MOVD $1, H0 + VMOV H0, INC.S[3] + VREV32 CTR.B16, CTR.B16 + VADD CTR.S4, INC.S4, CTR.S4 + // Skip to <8 blocks loop + CMP $128, srcPtrLen + + MOVD ks, H0 + // For AES-128 round keys are stored in: K0 .. K10, KLAST + VLD1.P 64(H0), [K0.B16, K1.B16, K2.B16, K3.B16] + VLD1.P 64(H0), [K4.B16, K5.B16, K6.B16, K7.B16] + VLD1.P 48(H0), [K8.B16, K9.B16, K10.B16] + VMOV K10.B16, KLAST.B16 + + BLT startSingles + // There are at least 8 blocks to encrypt + TBZ $4, NR, octetsLoop + + // For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST + VMOV K8.B16, K10.B16 + VMOV K9.B16, K11.B16 + VMOV KLAST.B16, K8.B16 + VLD1.P 16(H0), [K9.B16] + VLD1.P 16(H0), [KLAST.B16] + TBZ $3, NR, octetsLoop + // For AES-256 round keys occupy: K0 .. 
K7, K10, K11, mem, mem, K8, K9, KLAST + VMOV KLAST.B16, K8.B16 + VLD1.P 16(H0), [K9.B16] + VLD1.P 16(H0), [KLAST.B16] + ADD $10*16, ks, H0 + MOVD H0, curK + +octetsLoop: + SUB $128, srcPtrLen + + VMOV CTR.B16, B0.B16 + VADD B0.S4, INC.S4, B1.S4 + VREV32 B0.B16, B0.B16 + VADD B1.S4, INC.S4, B2.S4 + VREV32 B1.B16, B1.B16 + VADD B2.S4, INC.S4, B3.S4 + VREV32 B2.B16, B2.B16 + VADD B3.S4, INC.S4, B4.S4 + VREV32 B3.B16, B3.B16 + VADD B4.S4, INC.S4, B5.S4 + VREV32 B4.B16, B4.B16 + VADD B5.S4, INC.S4, B6.S4 + VREV32 B5.B16, B5.B16 + VADD B6.S4, INC.S4, B7.S4 + VREV32 B6.B16, B6.B16 + VADD B7.S4, INC.S4, CTR.S4 + VREV32 B7.B16, B7.B16 + + aesrndx8(K0) + aesrndx8(K1) + aesrndx8(K2) + aesrndx8(K3) + aesrndx8(K4) + aesrndx8(K5) + aesrndx8(K6) + aesrndx8(K7) + TBZ $4, NR, octetsFinish + aesrndx8(K10) + aesrndx8(K11) + TBZ $3, NR, octetsFinish + VLD1.P 32(curK), [T1.B16, T2.B16] + aesrndx8(T1) + aesrndx8(T2) + MOVD H0, curK +octetsFinish: + aesrndx8(K8) + aesrndlastx8(K9) + + VEOR KLAST.B16, B0.B16, B0.B16 + VEOR KLAST.B16, B1.B16, B1.B16 + VEOR KLAST.B16, B2.B16, B2.B16 + VEOR KLAST.B16, B3.B16, B3.B16 + VEOR KLAST.B16, B4.B16, B4.B16 + VEOR KLAST.B16, B5.B16, B5.B16 + VEOR KLAST.B16, B6.B16, B6.B16 + VEOR KLAST.B16, B7.B16, B7.B16 + + VLD1.P 32(srcPtr), [T1.B16, T2.B16] + VEOR B0.B16, T1.B16, B0.B16 + VEOR B1.B16, T2.B16, B1.B16 + VST1.P [B0.B16, B1.B16], 32(dstPtr) + VLD1.P 32(srcPtr), [T1.B16, T2.B16] + VEOR B2.B16, T1.B16, B2.B16 + VEOR B3.B16, T2.B16, B3.B16 + VST1.P [B2.B16, B3.B16], 32(dstPtr) + VLD1.P 32(srcPtr), [T1.B16, T2.B16] + VEOR B4.B16, T1.B16, B4.B16 + VEOR B5.B16, T2.B16, B5.B16 + VST1.P [B4.B16, B5.B16], 32(dstPtr) + VLD1.P 32(srcPtr), [T1.B16, T2.B16] + VEOR B6.B16, T1.B16, B6.B16 + VEOR B7.B16, T2.B16, B7.B16 + VST1.P [B6.B16, B7.B16], 32(dstPtr) + + VLD1.P 32(pTbl), [T1.B16, T2.B16] + VREV64 B0.B16, B0.B16 + VEOR ACC0.B16, B0.B16, B0.B16 + VEXT $8, B0.B16, B0.B16, T0.B16 + VEOR B0.B16, T0.B16, T0.B16 + VPMULL B0.D1, T1.D1, ACC1.Q1 + VPMULL2 B0.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + + mulRound(B1) + mulRound(B2) + mulRound(B3) + mulRound(B4) + mulRound(B5) + mulRound(B6) + mulRound(B7) + MOVD pTblSave, pTbl + reduce() + + CMP $128, srcPtrLen + BGE octetsLoop + +startSingles: + CBZ srcPtrLen, done + ADD $14*16, pTbl + // Preload H and its Karatsuba precomp + VLD1.P (pTbl), [T1.B16, T2.B16] + // Preload AES round keys + ADD $128, ks + VLD1.P 48(ks), [K8.B16, K9.B16, K10.B16] + VMOV K10.B16, KLAST.B16 + TBZ $4, NR, singlesLoop + VLD1.P 32(ks), [B1.B16, B2.B16] + VMOV B2.B16, KLAST.B16 + TBZ $3, NR, singlesLoop + VLD1.P 32(ks), [B3.B16, B4.B16] + VMOV B4.B16, KLAST.B16 + +singlesLoop: + CMP $16, srcPtrLen + BLT tail + SUB $16, srcPtrLen + + VLD1.P 16(srcPtr), [T0.B16] + VEOR KLAST.B16, T0.B16, T0.B16 + + VREV32 CTR.B16, B0.B16 + VADD CTR.S4, INC.S4, CTR.S4 + + AESE K0.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K1.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K2.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K3.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K4.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K5.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K6.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K7.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K8.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K9.B16, B0.B16 + TBZ $4, NR, singlesLast + AESMC B0.B16, B0.B16 + AESE K10.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE B1.B16, B0.B16 + TBZ $3, NR, singlesLast + AESMC B0.B16, B0.B16 + AESE B2.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE B3.B16, B0.B16 +singlesLast: + VEOR T0.B16, B0.B16, B0.B16 +encReduce: + VST1.P 
[B0.B16], 16(dstPtr) + + VREV64 B0.B16, B0.B16 + VEOR ACC0.B16, B0.B16, B0.B16 + + VEXT $8, B0.B16, B0.B16, T0.B16 + VEOR B0.B16, T0.B16, T0.B16 + VPMULL B0.D1, T1.D1, ACC1.Q1 + VPMULL2 B0.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + + reduce() + + B singlesLoop +tail: + CBZ srcPtrLen, done + + VEOR T0.B16, T0.B16, T0.B16 + VEOR T3.B16, T3.B16, T3.B16 + MOVD $0, H1 + SUB $1, H1 + ADD srcPtrLen, srcPtr + + TBZ $3, srcPtrLen, ld4 + MOVD.W -8(srcPtr), H0 + VMOV H0, T0.D[0] + VMOV H1, T3.D[0] +ld4: + TBZ $2, srcPtrLen, ld2 + MOVW.W -4(srcPtr), H0 + VEXT $12, T0.B16, ZERO.B16, T0.B16 + VEXT $12, T3.B16, ZERO.B16, T3.B16 + VMOV H0, T0.S[0] + VMOV H1, T3.S[0] +ld2: + TBZ $1, srcPtrLen, ld1 + MOVH.W -2(srcPtr), H0 + VEXT $14, T0.B16, ZERO.B16, T0.B16 + VEXT $14, T3.B16, ZERO.B16, T3.B16 + VMOV H0, T0.H[0] + VMOV H1, T3.H[0] +ld1: + TBZ $0, srcPtrLen, ld0 + MOVB.W -1(srcPtr), H0 + VEXT $15, T0.B16, ZERO.B16, T0.B16 + VEXT $15, T3.B16, ZERO.B16, T3.B16 + VMOV H0, T0.B[0] + VMOV H1, T3.B[0] +ld0: + + MOVD ZR, srcPtrLen + VEOR KLAST.B16, T0.B16, T0.B16 + VREV32 CTR.B16, B0.B16 + + AESE K0.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K1.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K2.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K3.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K4.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K5.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K6.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K7.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K8.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K9.B16, B0.B16 + TBZ $4, NR, tailLast + AESMC B0.B16, B0.B16 + AESE K10.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE B1.B16, B0.B16 + TBZ $3, NR, tailLast + AESMC B0.B16, B0.B16 + AESE B2.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE B3.B16, B0.B16 + +tailLast: + VEOR T0.B16, B0.B16, B0.B16 + VAND T3.B16, B0.B16, B0.B16 + B encReduce + +done: + VST1 [ACC0.B16], (tPtr) + RET + +// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32) +TEXT ·gcmAesDec(SB),NOSPLIT,$0 + MOVD productTable+0(FP), pTbl + MOVD dst+8(FP), dstPtr + MOVD src_base+32(FP), srcPtr + MOVD src_len+40(FP), srcPtrLen + MOVD ctr+56(FP), ctrPtr + MOVD T+64(FP), tPtr + MOVD ks_base+72(FP), ks + MOVD ks_len+80(FP), NR + + MOVD $0xC2, H1 + LSL $56, H1 + MOVD $1, H0 + VMOV H1, POLY.D[0] + VMOV H0, POLY.D[1] + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + // Compute NR from len(ks) + MOVD pTbl, pTblSave + // Current tag, after AAD + VLD1 (tPtr), [ACC0.B16] + VEOR ACC1.B16, ACC1.B16, ACC1.B16 + VEOR ACCM.B16, ACCM.B16, ACCM.B16 + // Prepare initial counter, and the increment vector + VLD1 (ctrPtr), [CTR.B16] + VEOR INC.B16, INC.B16, INC.B16 + MOVD $1, H0 + VMOV H0, INC.S[3] + VREV32 CTR.B16, CTR.B16 + VADD CTR.S4, INC.S4, CTR.S4 + + MOVD ks, H0 + // For AES-128 round keys are stored in: K0 .. K10, KLAST + VLD1.P 64(H0), [K0.B16, K1.B16, K2.B16, K3.B16] + VLD1.P 64(H0), [K4.B16, K5.B16, K6.B16, K7.B16] + VLD1.P 48(H0), [K8.B16, K9.B16, K10.B16] + VMOV K10.B16, KLAST.B16 + + // Skip to <8 blocks loop + CMP $128, srcPtrLen + BLT startSingles + // There are at least 8 blocks to encrypt + TBZ $4, NR, octetsLoop + + // For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST + VMOV K8.B16, K10.B16 + VMOV K9.B16, K11.B16 + VMOV KLAST.B16, K8.B16 + VLD1.P 16(H0), [K9.B16] + VLD1.P 16(H0), [KLAST.B16] + TBZ $3, NR, octetsLoop + // For AES-256 round keys occupy: K0 .. 
K7, K10, K11, mem, mem, K8, K9, KLAST + VMOV KLAST.B16, K8.B16 + VLD1.P 16(H0), [K9.B16] + VLD1.P 16(H0), [KLAST.B16] + ADD $10*16, ks, H0 + MOVD H0, curK + +octetsLoop: + SUB $128, srcPtrLen + + VMOV CTR.B16, B0.B16 + VADD B0.S4, INC.S4, B1.S4 + VREV32 B0.B16, B0.B16 + VADD B1.S4, INC.S4, B2.S4 + VREV32 B1.B16, B1.B16 + VADD B2.S4, INC.S4, B3.S4 + VREV32 B2.B16, B2.B16 + VADD B3.S4, INC.S4, B4.S4 + VREV32 B3.B16, B3.B16 + VADD B4.S4, INC.S4, B5.S4 + VREV32 B4.B16, B4.B16 + VADD B5.S4, INC.S4, B6.S4 + VREV32 B5.B16, B5.B16 + VADD B6.S4, INC.S4, B7.S4 + VREV32 B6.B16, B6.B16 + VADD B7.S4, INC.S4, CTR.S4 + VREV32 B7.B16, B7.B16 + + aesrndx8(K0) + aesrndx8(K1) + aesrndx8(K2) + aesrndx8(K3) + aesrndx8(K4) + aesrndx8(K5) + aesrndx8(K6) + aesrndx8(K7) + TBZ $4, NR, octetsFinish + aesrndx8(K10) + aesrndx8(K11) + TBZ $3, NR, octetsFinish + VLD1.P 32(curK), [T1.B16, T2.B16] + aesrndx8(T1) + aesrndx8(T2) + MOVD H0, curK +octetsFinish: + aesrndx8(K8) + aesrndlastx8(K9) + + VEOR KLAST.B16, B0.B16, T1.B16 + VEOR KLAST.B16, B1.B16, T2.B16 + VEOR KLAST.B16, B2.B16, B2.B16 + VEOR KLAST.B16, B3.B16, B3.B16 + VEOR KLAST.B16, B4.B16, B4.B16 + VEOR KLAST.B16, B5.B16, B5.B16 + VEOR KLAST.B16, B6.B16, B6.B16 + VEOR KLAST.B16, B7.B16, B7.B16 + + VLD1.P 32(srcPtr), [B0.B16, B1.B16] + VEOR B0.B16, T1.B16, T1.B16 + VEOR B1.B16, T2.B16, T2.B16 + VST1.P [T1.B16, T2.B16], 32(dstPtr) + + VLD1.P 32(pTbl), [T1.B16, T2.B16] + VREV64 B0.B16, B0.B16 + VEOR ACC0.B16, B0.B16, B0.B16 + VEXT $8, B0.B16, B0.B16, T0.B16 + VEOR B0.B16, T0.B16, T0.B16 + VPMULL B0.D1, T1.D1, ACC1.Q1 + VPMULL2 B0.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + mulRound(B1) + + VLD1.P 32(srcPtr), [B0.B16, B1.B16] + VEOR B2.B16, B0.B16, T1.B16 + VEOR B3.B16, B1.B16, T2.B16 + VST1.P [T1.B16, T2.B16], 32(dstPtr) + mulRound(B0) + mulRound(B1) + + VLD1.P 32(srcPtr), [B0.B16, B1.B16] + VEOR B4.B16, B0.B16, T1.B16 + VEOR B5.B16, B1.B16, T2.B16 + VST1.P [T1.B16, T2.B16], 32(dstPtr) + mulRound(B0) + mulRound(B1) + + VLD1.P 32(srcPtr), [B0.B16, B1.B16] + VEOR B6.B16, B0.B16, T1.B16 + VEOR B7.B16, B1.B16, T2.B16 + VST1.P [T1.B16, T2.B16], 32(dstPtr) + mulRound(B0) + mulRound(B1) + + MOVD pTblSave, pTbl + reduce() + + CMP $128, srcPtrLen + BGE octetsLoop + +startSingles: + CBZ srcPtrLen, done + ADD $14*16, pTbl + // Preload H and its Karatsuba precomp + VLD1.P (pTbl), [T1.B16, T2.B16] + // Preload AES round keys + ADD $128, ks + VLD1.P 48(ks), [K8.B16, K9.B16, K10.B16] + VMOV K10.B16, KLAST.B16 + TBZ $4, NR, singlesLoop + VLD1.P 32(ks), [B1.B16, B2.B16] + VMOV B2.B16, KLAST.B16 + TBZ $3, NR, singlesLoop + VLD1.P 32(ks), [B3.B16, B4.B16] + VMOV B4.B16, KLAST.B16 + +singlesLoop: + CMP $16, srcPtrLen + BLT tail + SUB $16, srcPtrLen + + VLD1.P 16(srcPtr), [T0.B16] + VREV64 T0.B16, B5.B16 + VEOR KLAST.B16, T0.B16, T0.B16 + + VREV32 CTR.B16, B0.B16 + VADD CTR.S4, INC.S4, CTR.S4 + + AESE K0.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K1.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K2.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K3.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K4.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K5.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K6.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K7.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K8.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K9.B16, B0.B16 + TBZ $4, NR, singlesLast + AESMC B0.B16, B0.B16 + AESE K10.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE B1.B16, B0.B16 + TBZ $3, NR, singlesLast + AESMC B0.B16, B0.B16 + AESE B2.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE B3.B16, B0.B16 +singlesLast: + VEOR T0.B16, B0.B16, 
B0.B16 + + VST1.P [B0.B16], 16(dstPtr) + + VEOR ACC0.B16, B5.B16, B5.B16 + VEXT $8, B5.B16, B5.B16, T0.B16 + VEOR B5.B16, T0.B16, T0.B16 + VPMULL B5.D1, T1.D1, ACC1.Q1 + VPMULL2 B5.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + reduce() + + B singlesLoop +tail: + CBZ srcPtrLen, done + + VREV32 CTR.B16, B0.B16 + VADD CTR.S4, INC.S4, CTR.S4 + + AESE K0.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K1.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K2.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K3.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K4.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K5.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K6.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K7.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K8.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE K9.B16, B0.B16 + TBZ $4, NR, tailLast + AESMC B0.B16, B0.B16 + AESE K10.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE B1.B16, B0.B16 + TBZ $3, NR, tailLast + AESMC B0.B16, B0.B16 + AESE B2.B16, B0.B16 + AESMC B0.B16, B0.B16 + AESE B3.B16, B0.B16 +tailLast: + VEOR KLAST.B16, B0.B16, B0.B16 + + // Assuming it is safe to load past dstPtr due to the presence of the tag + VLD1 (srcPtr), [B5.B16] + + VEOR B5.B16, B0.B16, B0.B16 + + VEOR T3.B16, T3.B16, T3.B16 + MOVD $0, H1 + SUB $1, H1 + + TBZ $3, srcPtrLen, ld4 + VMOV B0.D[0], H0 + MOVD.P H0, 8(dstPtr) + VMOV H1, T3.D[0] + VEXT $8, ZERO.B16, B0.B16, B0.B16 +ld4: + TBZ $2, srcPtrLen, ld2 + VMOV B0.S[0], H0 + MOVW.P H0, 4(dstPtr) + VEXT $12, T3.B16, ZERO.B16, T3.B16 + VMOV H1, T3.S[0] + VEXT $4, ZERO.B16, B0.B16, B0.B16 +ld2: + TBZ $1, srcPtrLen, ld1 + VMOV B0.H[0], H0 + MOVH.P H0, 2(dstPtr) + VEXT $14, T3.B16, ZERO.B16, T3.B16 + VMOV H1, T3.H[0] + VEXT $2, ZERO.B16, B0.B16, B0.B16 +ld1: + TBZ $0, srcPtrLen, ld0 + VMOV B0.B[0], H0 + MOVB.P H0, 1(dstPtr) + VEXT $15, T3.B16, ZERO.B16, T3.B16 + VMOV H1, T3.B[0] +ld0: + + VAND T3.B16, B5.B16, B5.B16 + VREV64 B5.B16, B5.B16 + + VEOR ACC0.B16, B5.B16, B5.B16 + VEXT $8, B5.B16, B5.B16, T0.B16 + VEOR B5.B16, T0.B16, T0.B16 + VPMULL B5.D1, T1.D1, ACC1.Q1 + VPMULL2 B5.D2, T1.D2, ACC0.Q1 + VPMULL T0.D1, T2.D1, ACCM.Q1 + reduce() +done: + VST1 [ACC0.B16], (tPtr) + + RET |
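For reference, the field arithmetic that the VPMULL/VPMULL2/reduce() sequences accelerate is multiplication in GF(2^128) with the GHASH polynomial x^128 + x^7 + x^2 + x + 1: gcmAesInit derives the hash key H by encrypting the all-zero block, then fills productTable with H and its higher powers (in the shifted representation produced by the "Multiply by 2 modulo P" step) plus their Karatsuba half-sums. The following is a bit-at-a-time Go sketch of the same multiplication in the standard bit-reflected GHASH representation, not the shifted table form; mulGF128 is an illustrative name, not part of the patch.

package main

import (
	"crypto/aes"
	"encoding/binary"
	"fmt"
)

// mulGF128 multiplies two elements of GF(2^128) in the bit-reflected
// representation used by GHASH (NIST SP 800-38D): bit 0 of the block is the
// coefficient of x^0 and is the most significant bit of x[0]. Reduction is
// modulo x^128 + x^7 + x^2 + x + 1, which in this representation is the
// constant 0xe1 followed by 120 zero bits.
func mulGF128(x, y [2]uint64) [2]uint64 {
	var z [2]uint64
	v := x
	for i := 0; i < 128; i++ {
		// Test bit i of y, numbering bits left to right across the block.
		if (y[i/64]>>(63-uint(i%64)))&1 == 1 {
			z[0] ^= v[0]
			z[1] ^= v[1]
		}
		// v = v * x: shift right by one bit and reduce if a bit falls off.
		carry := v[1] & 1
		v[1] = v[1]>>1 | v[0]<<63
		v[0] >>= 1
		if carry == 1 {
			v[0] ^= 0xe100000000000000
		}
	}
	return z
}

func main() {
	// H = AES_K(0^128), exactly what the AESE/AESMC chain in gcmAesInit computes.
	key := make([]byte, 16)
	c, err := aes.NewCipher(key)
	if err != nil {
		panic(err)
	}
	var h [16]byte
	c.Encrypt(h[:], make([]byte, 16))

	H := [2]uint64{binary.BigEndian.Uint64(h[0:8]), binary.BigEndian.Uint64(h[8:16])}
	// initLoop precomputes successive powers of H; H^2 is the first of them.
	H2 := mulGF128(H, H)
	fmt.Printf("H   = %016x%016x\n", H[0], H[1])
	fmt.Printf("H^2 = %016x%016x\n", H2[0], H2[1])
}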
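gcmAesEnc and gcmAesDec keep the counter block with each 32-bit word byte-reversed (VREV32) so that the GCM counter increment becomes a single VADD with INC, which holds a 1 in its last word. In byte order that is the standard GCM increment of the 32-bit big-endian counter in the final four bytes of the block; a small Go sketch follows, with incCounter as an illustrative name rather than anything defined by the patch.

package main

import (
	"encoding/binary"
	"fmt"
)

// incCounter advances the 32-bit big-endian counter held in the last four
// bytes of a 16-byte GCM counter block, wrapping modulo 2^32. This is the
// effect of the VREV32 / "VADD CTR.S4, INC.S4" pair in the assembly.
func incCounter(ctr *[16]byte) {
	binary.BigEndian.PutUint32(ctr[12:], binary.BigEndian.Uint32(ctr[12:])+1)
}

func main() {
	var ctr [16]byte
	ctr[15] = 0xff
	incCounter(&ctr)
	fmt.Printf("%x\n", ctr[:]) // last four bytes roll over to 00000100
}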