summaryrefslogtreecommitdiffstats
path: root/src/crypto/aes/gcm_arm64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/crypto/aes/gcm_arm64.s')
-rw-r--r--src/crypto/aes/gcm_arm64.s1021
1 files changed, 1021 insertions, 0 deletions
diff --git a/src/crypto/aes/gcm_arm64.s b/src/crypto/aes/gcm_arm64.s
new file mode 100644
index 0000000..c350102
--- /dev/null
+++ b/src/crypto/aes/gcm_arm64.s
@@ -0,0 +1,1021 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+#define B0 V0
+#define B1 V1
+#define B2 V2
+#define B3 V3
+#define B4 V4
+#define B5 V5
+#define B6 V6
+#define B7 V7
+
+#define ACC0 V8
+#define ACC1 V9
+#define ACCM V10
+
+#define T0 V11
+#define T1 V12
+#define T2 V13
+#define T3 V14
+
+#define POLY V15
+#define ZERO V16
+#define INC V17
+#define CTR V18
+
+#define K0 V19
+#define K1 V20
+#define K2 V21
+#define K3 V22
+#define K4 V23
+#define K5 V24
+#define K6 V25
+#define K7 V26
+#define K8 V27
+#define K9 V28
+#define K10 V29
+#define K11 V30
+#define KLAST V31
+
+#define reduce() \
+ VEOR ACC0.B16, ACCM.B16, ACCM.B16 \
+ VEOR ACC1.B16, ACCM.B16, ACCM.B16 \
+ VEXT $8, ZERO.B16, ACCM.B16, T0.B16 \
+ VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \
+ VEOR ACCM.B16, ACC0.B16, ACC0.B16 \
+ VEOR T0.B16, ACC1.B16, ACC1.B16 \
+ VPMULL POLY.D1, ACC0.D1, T0.Q1 \
+ VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \
+ VEOR T0.B16, ACC0.B16, ACC0.B16 \
+ VPMULL POLY.D1, ACC0.D1, T0.Q1 \
+ VEOR T0.B16, ACC1.B16, ACC1.B16 \
+ VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \
+ VEOR ACC1.B16, ACC0.B16, ACC0.B16 \
+
+// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
+TEXT ·gcmAesFinish(SB),NOSPLIT,$0
+#define pTbl R0
+#define tMsk R1
+#define tPtr R2
+#define plen R3
+#define dlen R4
+
+ MOVD $0xC2, R1
+ LSL $56, R1
+ MOVD $1, R0
+ VMOV R1, POLY.D[0]
+ VMOV R0, POLY.D[1]
+ VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+ MOVD productTable+0(FP), pTbl
+ MOVD tagMask+8(FP), tMsk
+ MOVD T+16(FP), tPtr
+ MOVD pLen+24(FP), plen
+ MOVD dLen+32(FP), dlen
+
+ VLD1 (tPtr), [ACC0.B16]
+ VLD1 (tMsk), [B1.B16]
+
+ LSL $3, plen
+ LSL $3, dlen
+
+ VMOV dlen, B0.D[0]
+ VMOV plen, B0.D[1]
+
+ ADD $14*16, pTbl
+ VLD1.P (pTbl), [T1.B16, T2.B16]
+
+ VEOR ACC0.B16, B0.B16, B0.B16
+
+ VEXT $8, B0.B16, B0.B16, T0.B16
+ VEOR B0.B16, T0.B16, T0.B16
+ VPMULL B0.D1, T1.D1, ACC1.Q1
+ VPMULL2 B0.D2, T1.D2, ACC0.Q1
+ VPMULL T0.D1, T2.D1, ACCM.Q1
+
+ reduce()
+
+ VREV64 ACC0.B16, ACC0.B16
+ VEOR B1.B16, ACC0.B16, ACC0.B16
+
+ VST1 [ACC0.B16], (tPtr)
+ RET
+#undef pTbl
+#undef tMsk
+#undef tPtr
+#undef plen
+#undef dlen
+
+// func gcmAesInit(productTable *[256]byte, ks []uint32)
+TEXT ·gcmAesInit(SB),NOSPLIT,$0
+#define pTbl R0
+#define KS R1
+#define NR R2
+#define I R3
+ MOVD productTable+0(FP), pTbl
+ MOVD ks_base+8(FP), KS
+ MOVD ks_len+16(FP), NR
+
+ MOVD $0xC2, I
+ LSL $56, I
+ VMOV I, POLY.D[0]
+ MOVD $1, I
+ VMOV I, POLY.D[1]
+ VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+ // Encrypt block 0 with the AES key to generate the hash key H
+ VLD1.P 64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
+ VEOR B0.B16, B0.B16, B0.B16
+ AESE T0.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE T1.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE T2.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE T3.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ VLD1.P 64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
+ AESE T0.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE T1.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE T2.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE T3.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ TBZ $4, NR, initEncFinish
+ VLD1.P 32(KS), [T0.B16, T1.B16]
+ AESE T0.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE T1.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ TBZ $3, NR, initEncFinish
+ VLD1.P 32(KS), [T0.B16, T1.B16]
+ AESE T0.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE T1.B16, B0.B16
+ AESMC B0.B16, B0.B16
+initEncFinish:
+ VLD1 (KS), [T0.B16, T1.B16, T2.B16]
+ AESE T0.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE T1.B16, B0.B16
+ VEOR T2.B16, B0.B16, B0.B16
+
+ VREV64 B0.B16, B0.B16
+
+ // Multiply by 2 modulo P
+ VMOV B0.D[0], I
+ ASR $63, I
+ VMOV I, T1.D[0]
+ VMOV I, T1.D[1]
+ VAND POLY.B16, T1.B16, T1.B16
+ VUSHR $63, B0.D2, T2.D2
+ VEXT $8, ZERO.B16, T2.B16, T2.B16
+ VSHL $1, B0.D2, B0.D2
+ VEOR T1.B16, B0.B16, B0.B16
+ VEOR T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available
+
+ // Karatsuba pre-computation
+ VEXT $8, B0.B16, B0.B16, B1.B16
+ VEOR B0.B16, B1.B16, B1.B16
+
+ ADD $14*16, pTbl
+ VST1 [B0.B16, B1.B16], (pTbl)
+ SUB $2*16, pTbl
+
+ VMOV B0.B16, B2.B16
+ VMOV B1.B16, B3.B16
+
+ MOVD $7, I
+
+initLoop:
+ // Compute powers of H
+ SUBS $1, I
+
+ VPMULL B0.D1, B2.D1, T1.Q1
+ VPMULL2 B0.D2, B2.D2, T0.Q1
+ VPMULL B1.D1, B3.D1, T2.Q1
+ VEOR T0.B16, T2.B16, T2.B16
+ VEOR T1.B16, T2.B16, T2.B16
+ VEXT $8, ZERO.B16, T2.B16, T3.B16
+ VEXT $8, T2.B16, ZERO.B16, T2.B16
+ VEOR T2.B16, T0.B16, T0.B16
+ VEOR T3.B16, T1.B16, T1.B16
+ VPMULL POLY.D1, T0.D1, T2.Q1
+ VEXT $8, T0.B16, T0.B16, T0.B16
+ VEOR T2.B16, T0.B16, T0.B16
+ VPMULL POLY.D1, T0.D1, T2.Q1
+ VEXT $8, T0.B16, T0.B16, T0.B16
+ VEOR T2.B16, T0.B16, T0.B16
+ VEOR T1.B16, T0.B16, B2.B16
+ VMOV B2.B16, B3.B16
+ VEXT $8, B2.B16, B2.B16, B2.B16
+ VEOR B2.B16, B3.B16, B3.B16
+
+ VST1 [B2.B16, B3.B16], (pTbl)
+ SUB $2*16, pTbl
+
+ BNE initLoop
+ RET
+#undef I
+#undef NR
+#undef KS
+#undef pTbl
+
+// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
+TEXT ·gcmAesData(SB),NOSPLIT,$0
+#define pTbl R0
+#define aut R1
+#define tPtr R2
+#define autLen R3
+#define H0 R4
+#define pTblSave R5
+
+#define mulRound(X) \
+ VLD1.P 32(pTbl), [T1.B16, T2.B16] \
+ VREV64 X.B16, X.B16 \
+ VEXT $8, X.B16, X.B16, T0.B16 \
+ VEOR X.B16, T0.B16, T0.B16 \
+ VPMULL X.D1, T1.D1, T3.Q1 \
+ VEOR T3.B16, ACC1.B16, ACC1.B16 \
+ VPMULL2 X.D2, T1.D2, T3.Q1 \
+ VEOR T3.B16, ACC0.B16, ACC0.B16 \
+ VPMULL T0.D1, T2.D1, T3.Q1 \
+ VEOR T3.B16, ACCM.B16, ACCM.B16
+
+ MOVD productTable+0(FP), pTbl
+ MOVD data_base+8(FP), aut
+ MOVD data_len+16(FP), autLen
+ MOVD T+32(FP), tPtr
+
+ VEOR ACC0.B16, ACC0.B16, ACC0.B16
+ CBZ autLen, dataBail
+
+ MOVD $0xC2, H0
+ LSL $56, H0
+ VMOV H0, POLY.D[0]
+ MOVD $1, H0
+ VMOV H0, POLY.D[1]
+ VEOR ZERO.B16, ZERO.B16, ZERO.B16
+ MOVD pTbl, pTblSave
+
+ CMP $13, autLen
+ BEQ dataTLS
+ CMP $128, autLen
+ BLT startSinglesLoop
+ B octetsLoop
+
+dataTLS:
+ ADD $14*16, pTbl
+ VLD1.P (pTbl), [T1.B16, T2.B16]
+ VEOR B0.B16, B0.B16, B0.B16
+
+ MOVD (aut), H0
+ VMOV H0, B0.D[0]
+ MOVW 8(aut), H0
+ VMOV H0, B0.S[2]
+ MOVB 12(aut), H0
+ VMOV H0, B0.B[12]
+
+ MOVD $0, autLen
+ B dataMul
+
+octetsLoop:
+ CMP $128, autLen
+ BLT startSinglesLoop
+ SUB $128, autLen
+
+ VLD1.P 32(aut), [B0.B16, B1.B16]
+
+ VLD1.P 32(pTbl), [T1.B16, T2.B16]
+ VREV64 B0.B16, B0.B16
+ VEOR ACC0.B16, B0.B16, B0.B16
+ VEXT $8, B0.B16, B0.B16, T0.B16
+ VEOR B0.B16, T0.B16, T0.B16
+ VPMULL B0.D1, T1.D1, ACC1.Q1
+ VPMULL2 B0.D2, T1.D2, ACC0.Q1
+ VPMULL T0.D1, T2.D1, ACCM.Q1
+
+ mulRound(B1)
+ VLD1.P 32(aut), [B2.B16, B3.B16]
+ mulRound(B2)
+ mulRound(B3)
+ VLD1.P 32(aut), [B4.B16, B5.B16]
+ mulRound(B4)
+ mulRound(B5)
+ VLD1.P 32(aut), [B6.B16, B7.B16]
+ mulRound(B6)
+ mulRound(B7)
+
+ MOVD pTblSave, pTbl
+ reduce()
+ B octetsLoop
+
+startSinglesLoop:
+
+ ADD $14*16, pTbl
+ VLD1.P (pTbl), [T1.B16, T2.B16]
+
+singlesLoop:
+
+ CMP $16, autLen
+ BLT dataEnd
+ SUB $16, autLen
+
+ VLD1.P 16(aut), [B0.B16]
+dataMul:
+ VREV64 B0.B16, B0.B16
+ VEOR ACC0.B16, B0.B16, B0.B16
+
+ VEXT $8, B0.B16, B0.B16, T0.B16
+ VEOR B0.B16, T0.B16, T0.B16
+ VPMULL B0.D1, T1.D1, ACC1.Q1
+ VPMULL2 B0.D2, T1.D2, ACC0.Q1
+ VPMULL T0.D1, T2.D1, ACCM.Q1
+
+ reduce()
+
+ B singlesLoop
+
+dataEnd:
+
+ CBZ autLen, dataBail
+ VEOR B0.B16, B0.B16, B0.B16
+ ADD autLen, aut
+
+dataLoadLoop:
+ MOVB.W -1(aut), H0
+ VEXT $15, B0.B16, ZERO.B16, B0.B16
+ VMOV H0, B0.B[0]
+ SUBS $1, autLen
+ BNE dataLoadLoop
+ B dataMul
+
+dataBail:
+ VST1 [ACC0.B16], (tPtr)
+ RET
+
+#undef pTbl
+#undef aut
+#undef tPtr
+#undef autLen
+#undef H0
+#undef pTblSave
+
+// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
+TEXT ·gcmAesEnc(SB),NOSPLIT,$0
+#define pTbl R0
+#define dstPtr R1
+#define ctrPtr R2
+#define srcPtr R3
+#define ks R4
+#define tPtr R5
+#define srcPtrLen R6
+#define aluCTR R7
+#define aluTMP R8
+#define aluK R9
+#define NR R10
+#define H0 R11
+#define H1 R12
+#define curK R13
+#define pTblSave R14
+
+#define aesrndx8(K) \
+ AESE K.B16, B0.B16 \
+ AESMC B0.B16, B0.B16 \
+ AESE K.B16, B1.B16 \
+ AESMC B1.B16, B1.B16 \
+ AESE K.B16, B2.B16 \
+ AESMC B2.B16, B2.B16 \
+ AESE K.B16, B3.B16 \
+ AESMC B3.B16, B3.B16 \
+ AESE K.B16, B4.B16 \
+ AESMC B4.B16, B4.B16 \
+ AESE K.B16, B5.B16 \
+ AESMC B5.B16, B5.B16 \
+ AESE K.B16, B6.B16 \
+ AESMC B6.B16, B6.B16 \
+ AESE K.B16, B7.B16 \
+ AESMC B7.B16, B7.B16
+
+#define aesrndlastx8(K) \
+ AESE K.B16, B0.B16 \
+ AESE K.B16, B1.B16 \
+ AESE K.B16, B2.B16 \
+ AESE K.B16, B3.B16 \
+ AESE K.B16, B4.B16 \
+ AESE K.B16, B5.B16 \
+ AESE K.B16, B6.B16 \
+ AESE K.B16, B7.B16
+
+ MOVD productTable+0(FP), pTbl
+ MOVD dst+8(FP), dstPtr
+ MOVD src_base+32(FP), srcPtr
+ MOVD src_len+40(FP), srcPtrLen
+ MOVD ctr+56(FP), ctrPtr
+ MOVD T+64(FP), tPtr
+ MOVD ks_base+72(FP), ks
+ MOVD ks_len+80(FP), NR
+
+ MOVD $0xC2, H1
+ LSL $56, H1
+ MOVD $1, H0
+ VMOV H1, POLY.D[0]
+ VMOV H0, POLY.D[1]
+ VEOR ZERO.B16, ZERO.B16, ZERO.B16
+ // Compute NR from len(ks)
+ MOVD pTbl, pTblSave
+ // Current tag, after AAD
+ VLD1 (tPtr), [ACC0.B16]
+ VEOR ACC1.B16, ACC1.B16, ACC1.B16
+ VEOR ACCM.B16, ACCM.B16, ACCM.B16
+ // Prepare initial counter, and the increment vector
+ VLD1 (ctrPtr), [CTR.B16]
+ VEOR INC.B16, INC.B16, INC.B16
+ MOVD $1, H0
+ VMOV H0, INC.S[3]
+ VREV32 CTR.B16, CTR.B16
+ VADD CTR.S4, INC.S4, CTR.S4
+ // Skip to <8 blocks loop
+ CMP $128, srcPtrLen
+
+ MOVD ks, H0
+ // For AES-128 round keys are stored in: K0 .. K10, KLAST
+ VLD1.P 64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
+ VLD1.P 64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
+ VLD1.P 48(H0), [K8.B16, K9.B16, K10.B16]
+ VMOV K10.B16, KLAST.B16
+
+ BLT startSingles
+ // There are at least 8 blocks to encrypt
+ TBZ $4, NR, octetsLoop
+
+ // For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
+ VMOV K8.B16, K10.B16
+ VMOV K9.B16, K11.B16
+ VMOV KLAST.B16, K8.B16
+ VLD1.P 16(H0), [K9.B16]
+ VLD1.P 16(H0), [KLAST.B16]
+ TBZ $3, NR, octetsLoop
+ // For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
+ VMOV KLAST.B16, K8.B16
+ VLD1.P 16(H0), [K9.B16]
+ VLD1.P 16(H0), [KLAST.B16]
+ ADD $10*16, ks, H0
+ MOVD H0, curK
+
+octetsLoop:
+ SUB $128, srcPtrLen
+
+ VMOV CTR.B16, B0.B16
+ VADD B0.S4, INC.S4, B1.S4
+ VREV32 B0.B16, B0.B16
+ VADD B1.S4, INC.S4, B2.S4
+ VREV32 B1.B16, B1.B16
+ VADD B2.S4, INC.S4, B3.S4
+ VREV32 B2.B16, B2.B16
+ VADD B3.S4, INC.S4, B4.S4
+ VREV32 B3.B16, B3.B16
+ VADD B4.S4, INC.S4, B5.S4
+ VREV32 B4.B16, B4.B16
+ VADD B5.S4, INC.S4, B6.S4
+ VREV32 B5.B16, B5.B16
+ VADD B6.S4, INC.S4, B7.S4
+ VREV32 B6.B16, B6.B16
+ VADD B7.S4, INC.S4, CTR.S4
+ VREV32 B7.B16, B7.B16
+
+ aesrndx8(K0)
+ aesrndx8(K1)
+ aesrndx8(K2)
+ aesrndx8(K3)
+ aesrndx8(K4)
+ aesrndx8(K5)
+ aesrndx8(K6)
+ aesrndx8(K7)
+ TBZ $4, NR, octetsFinish
+ aesrndx8(K10)
+ aesrndx8(K11)
+ TBZ $3, NR, octetsFinish
+ VLD1.P 32(curK), [T1.B16, T2.B16]
+ aesrndx8(T1)
+ aesrndx8(T2)
+ MOVD H0, curK
+octetsFinish:
+ aesrndx8(K8)
+ aesrndlastx8(K9)
+
+ VEOR KLAST.B16, B0.B16, B0.B16
+ VEOR KLAST.B16, B1.B16, B1.B16
+ VEOR KLAST.B16, B2.B16, B2.B16
+ VEOR KLAST.B16, B3.B16, B3.B16
+ VEOR KLAST.B16, B4.B16, B4.B16
+ VEOR KLAST.B16, B5.B16, B5.B16
+ VEOR KLAST.B16, B6.B16, B6.B16
+ VEOR KLAST.B16, B7.B16, B7.B16
+
+ VLD1.P 32(srcPtr), [T1.B16, T2.B16]
+ VEOR B0.B16, T1.B16, B0.B16
+ VEOR B1.B16, T2.B16, B1.B16
+ VST1.P [B0.B16, B1.B16], 32(dstPtr)
+ VLD1.P 32(srcPtr), [T1.B16, T2.B16]
+ VEOR B2.B16, T1.B16, B2.B16
+ VEOR B3.B16, T2.B16, B3.B16
+ VST1.P [B2.B16, B3.B16], 32(dstPtr)
+ VLD1.P 32(srcPtr), [T1.B16, T2.B16]
+ VEOR B4.B16, T1.B16, B4.B16
+ VEOR B5.B16, T2.B16, B5.B16
+ VST1.P [B4.B16, B5.B16], 32(dstPtr)
+ VLD1.P 32(srcPtr), [T1.B16, T2.B16]
+ VEOR B6.B16, T1.B16, B6.B16
+ VEOR B7.B16, T2.B16, B7.B16
+ VST1.P [B6.B16, B7.B16], 32(dstPtr)
+
+ VLD1.P 32(pTbl), [T1.B16, T2.B16]
+ VREV64 B0.B16, B0.B16
+ VEOR ACC0.B16, B0.B16, B0.B16
+ VEXT $8, B0.B16, B0.B16, T0.B16
+ VEOR B0.B16, T0.B16, T0.B16
+ VPMULL B0.D1, T1.D1, ACC1.Q1
+ VPMULL2 B0.D2, T1.D2, ACC0.Q1
+ VPMULL T0.D1, T2.D1, ACCM.Q1
+
+ mulRound(B1)
+ mulRound(B2)
+ mulRound(B3)
+ mulRound(B4)
+ mulRound(B5)
+ mulRound(B6)
+ mulRound(B7)
+ MOVD pTblSave, pTbl
+ reduce()
+
+ CMP $128, srcPtrLen
+ BGE octetsLoop
+
+startSingles:
+ CBZ srcPtrLen, done
+ ADD $14*16, pTbl
+ // Preload H and its Karatsuba precomp
+ VLD1.P (pTbl), [T1.B16, T2.B16]
+ // Preload AES round keys
+ ADD $128, ks
+ VLD1.P 48(ks), [K8.B16, K9.B16, K10.B16]
+ VMOV K10.B16, KLAST.B16
+ TBZ $4, NR, singlesLoop
+ VLD1.P 32(ks), [B1.B16, B2.B16]
+ VMOV B2.B16, KLAST.B16
+ TBZ $3, NR, singlesLoop
+ VLD1.P 32(ks), [B3.B16, B4.B16]
+ VMOV B4.B16, KLAST.B16
+
+singlesLoop:
+ CMP $16, srcPtrLen
+ BLT tail
+ SUB $16, srcPtrLen
+
+ VLD1.P 16(srcPtr), [T0.B16]
+ VEOR KLAST.B16, T0.B16, T0.B16
+
+ VREV32 CTR.B16, B0.B16
+ VADD CTR.S4, INC.S4, CTR.S4
+
+ AESE K0.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K1.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K2.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K3.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K4.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K5.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K6.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K7.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K8.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K9.B16, B0.B16
+ TBZ $4, NR, singlesLast
+ AESMC B0.B16, B0.B16
+ AESE K10.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE B1.B16, B0.B16
+ TBZ $3, NR, singlesLast
+ AESMC B0.B16, B0.B16
+ AESE B2.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE B3.B16, B0.B16
+singlesLast:
+ VEOR T0.B16, B0.B16, B0.B16
+encReduce:
+ VST1.P [B0.B16], 16(dstPtr)
+
+ VREV64 B0.B16, B0.B16
+ VEOR ACC0.B16, B0.B16, B0.B16
+
+ VEXT $8, B0.B16, B0.B16, T0.B16
+ VEOR B0.B16, T0.B16, T0.B16
+ VPMULL B0.D1, T1.D1, ACC1.Q1
+ VPMULL2 B0.D2, T1.D2, ACC0.Q1
+ VPMULL T0.D1, T2.D1, ACCM.Q1
+
+ reduce()
+
+ B singlesLoop
+tail:
+ CBZ srcPtrLen, done
+
+ VEOR T0.B16, T0.B16, T0.B16
+ VEOR T3.B16, T3.B16, T3.B16
+ MOVD $0, H1
+ SUB $1, H1
+ ADD srcPtrLen, srcPtr
+
+ TBZ $3, srcPtrLen, ld4
+ MOVD.W -8(srcPtr), H0
+ VMOV H0, T0.D[0]
+ VMOV H1, T3.D[0]
+ld4:
+ TBZ $2, srcPtrLen, ld2
+ MOVW.W -4(srcPtr), H0
+ VEXT $12, T0.B16, ZERO.B16, T0.B16
+ VEXT $12, T3.B16, ZERO.B16, T3.B16
+ VMOV H0, T0.S[0]
+ VMOV H1, T3.S[0]
+ld2:
+ TBZ $1, srcPtrLen, ld1
+ MOVH.W -2(srcPtr), H0
+ VEXT $14, T0.B16, ZERO.B16, T0.B16
+ VEXT $14, T3.B16, ZERO.B16, T3.B16
+ VMOV H0, T0.H[0]
+ VMOV H1, T3.H[0]
+ld1:
+ TBZ $0, srcPtrLen, ld0
+ MOVB.W -1(srcPtr), H0
+ VEXT $15, T0.B16, ZERO.B16, T0.B16
+ VEXT $15, T3.B16, ZERO.B16, T3.B16
+ VMOV H0, T0.B[0]
+ VMOV H1, T3.B[0]
+ld0:
+
+ MOVD ZR, srcPtrLen
+ VEOR KLAST.B16, T0.B16, T0.B16
+ VREV32 CTR.B16, B0.B16
+
+ AESE K0.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K1.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K2.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K3.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K4.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K5.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K6.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K7.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K8.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K9.B16, B0.B16
+ TBZ $4, NR, tailLast
+ AESMC B0.B16, B0.B16
+ AESE K10.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE B1.B16, B0.B16
+ TBZ $3, NR, tailLast
+ AESMC B0.B16, B0.B16
+ AESE B2.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE B3.B16, B0.B16
+
+tailLast:
+ VEOR T0.B16, B0.B16, B0.B16
+ VAND T3.B16, B0.B16, B0.B16
+ B encReduce
+
+done:
+ VST1 [ACC0.B16], (tPtr)
+ RET
+
+// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
+TEXT ·gcmAesDec(SB),NOSPLIT,$0
+ MOVD productTable+0(FP), pTbl
+ MOVD dst+8(FP), dstPtr
+ MOVD src_base+32(FP), srcPtr
+ MOVD src_len+40(FP), srcPtrLen
+ MOVD ctr+56(FP), ctrPtr
+ MOVD T+64(FP), tPtr
+ MOVD ks_base+72(FP), ks
+ MOVD ks_len+80(FP), NR
+
+ MOVD $0xC2, H1
+ LSL $56, H1
+ MOVD $1, H0
+ VMOV H1, POLY.D[0]
+ VMOV H0, POLY.D[1]
+ VEOR ZERO.B16, ZERO.B16, ZERO.B16
+ // Compute NR from len(ks)
+ MOVD pTbl, pTblSave
+ // Current tag, after AAD
+ VLD1 (tPtr), [ACC0.B16]
+ VEOR ACC1.B16, ACC1.B16, ACC1.B16
+ VEOR ACCM.B16, ACCM.B16, ACCM.B16
+ // Prepare initial counter, and the increment vector
+ VLD1 (ctrPtr), [CTR.B16]
+ VEOR INC.B16, INC.B16, INC.B16
+ MOVD $1, H0
+ VMOV H0, INC.S[3]
+ VREV32 CTR.B16, CTR.B16
+ VADD CTR.S4, INC.S4, CTR.S4
+
+ MOVD ks, H0
+ // For AES-128 round keys are stored in: K0 .. K10, KLAST
+ VLD1.P 64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
+ VLD1.P 64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
+ VLD1.P 48(H0), [K8.B16, K9.B16, K10.B16]
+ VMOV K10.B16, KLAST.B16
+
+ // Skip to <8 blocks loop
+ CMP $128, srcPtrLen
+ BLT startSingles
+ // There are at least 8 blocks to encrypt
+ TBZ $4, NR, octetsLoop
+
+ // For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
+ VMOV K8.B16, K10.B16
+ VMOV K9.B16, K11.B16
+ VMOV KLAST.B16, K8.B16
+ VLD1.P 16(H0), [K9.B16]
+ VLD1.P 16(H0), [KLAST.B16]
+ TBZ $3, NR, octetsLoop
+ // For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
+ VMOV KLAST.B16, K8.B16
+ VLD1.P 16(H0), [K9.B16]
+ VLD1.P 16(H0), [KLAST.B16]
+ ADD $10*16, ks, H0
+ MOVD H0, curK
+
+octetsLoop:
+ SUB $128, srcPtrLen
+
+ VMOV CTR.B16, B0.B16
+ VADD B0.S4, INC.S4, B1.S4
+ VREV32 B0.B16, B0.B16
+ VADD B1.S4, INC.S4, B2.S4
+ VREV32 B1.B16, B1.B16
+ VADD B2.S4, INC.S4, B3.S4
+ VREV32 B2.B16, B2.B16
+ VADD B3.S4, INC.S4, B4.S4
+ VREV32 B3.B16, B3.B16
+ VADD B4.S4, INC.S4, B5.S4
+ VREV32 B4.B16, B4.B16
+ VADD B5.S4, INC.S4, B6.S4
+ VREV32 B5.B16, B5.B16
+ VADD B6.S4, INC.S4, B7.S4
+ VREV32 B6.B16, B6.B16
+ VADD B7.S4, INC.S4, CTR.S4
+ VREV32 B7.B16, B7.B16
+
+ aesrndx8(K0)
+ aesrndx8(K1)
+ aesrndx8(K2)
+ aesrndx8(K3)
+ aesrndx8(K4)
+ aesrndx8(K5)
+ aesrndx8(K6)
+ aesrndx8(K7)
+ TBZ $4, NR, octetsFinish
+ aesrndx8(K10)
+ aesrndx8(K11)
+ TBZ $3, NR, octetsFinish
+ VLD1.P 32(curK), [T1.B16, T2.B16]
+ aesrndx8(T1)
+ aesrndx8(T2)
+ MOVD H0, curK
+octetsFinish:
+ aesrndx8(K8)
+ aesrndlastx8(K9)
+
+ VEOR KLAST.B16, B0.B16, T1.B16
+ VEOR KLAST.B16, B1.B16, T2.B16
+ VEOR KLAST.B16, B2.B16, B2.B16
+ VEOR KLAST.B16, B3.B16, B3.B16
+ VEOR KLAST.B16, B4.B16, B4.B16
+ VEOR KLAST.B16, B5.B16, B5.B16
+ VEOR KLAST.B16, B6.B16, B6.B16
+ VEOR KLAST.B16, B7.B16, B7.B16
+
+ VLD1.P 32(srcPtr), [B0.B16, B1.B16]
+ VEOR B0.B16, T1.B16, T1.B16
+ VEOR B1.B16, T2.B16, T2.B16
+ VST1.P [T1.B16, T2.B16], 32(dstPtr)
+
+ VLD1.P 32(pTbl), [T1.B16, T2.B16]
+ VREV64 B0.B16, B0.B16
+ VEOR ACC0.B16, B0.B16, B0.B16
+ VEXT $8, B0.B16, B0.B16, T0.B16
+ VEOR B0.B16, T0.B16, T0.B16
+ VPMULL B0.D1, T1.D1, ACC1.Q1
+ VPMULL2 B0.D2, T1.D2, ACC0.Q1
+ VPMULL T0.D1, T2.D1, ACCM.Q1
+ mulRound(B1)
+
+ VLD1.P 32(srcPtr), [B0.B16, B1.B16]
+ VEOR B2.B16, B0.B16, T1.B16
+ VEOR B3.B16, B1.B16, T2.B16
+ VST1.P [T1.B16, T2.B16], 32(dstPtr)
+ mulRound(B0)
+ mulRound(B1)
+
+ VLD1.P 32(srcPtr), [B0.B16, B1.B16]
+ VEOR B4.B16, B0.B16, T1.B16
+ VEOR B5.B16, B1.B16, T2.B16
+ VST1.P [T1.B16, T2.B16], 32(dstPtr)
+ mulRound(B0)
+ mulRound(B1)
+
+ VLD1.P 32(srcPtr), [B0.B16, B1.B16]
+ VEOR B6.B16, B0.B16, T1.B16
+ VEOR B7.B16, B1.B16, T2.B16
+ VST1.P [T1.B16, T2.B16], 32(dstPtr)
+ mulRound(B0)
+ mulRound(B1)
+
+ MOVD pTblSave, pTbl
+ reduce()
+
+ CMP $128, srcPtrLen
+ BGE octetsLoop
+
+startSingles:
+ CBZ srcPtrLen, done
+ ADD $14*16, pTbl
+ // Preload H and its Karatsuba precomp
+ VLD1.P (pTbl), [T1.B16, T2.B16]
+ // Preload AES round keys
+ ADD $128, ks
+ VLD1.P 48(ks), [K8.B16, K9.B16, K10.B16]
+ VMOV K10.B16, KLAST.B16
+ TBZ $4, NR, singlesLoop
+ VLD1.P 32(ks), [B1.B16, B2.B16]
+ VMOV B2.B16, KLAST.B16
+ TBZ $3, NR, singlesLoop
+ VLD1.P 32(ks), [B3.B16, B4.B16]
+ VMOV B4.B16, KLAST.B16
+
+singlesLoop:
+ CMP $16, srcPtrLen
+ BLT tail
+ SUB $16, srcPtrLen
+
+ VLD1.P 16(srcPtr), [T0.B16]
+ VREV64 T0.B16, B5.B16
+ VEOR KLAST.B16, T0.B16, T0.B16
+
+ VREV32 CTR.B16, B0.B16
+ VADD CTR.S4, INC.S4, CTR.S4
+
+ AESE K0.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K1.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K2.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K3.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K4.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K5.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K6.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K7.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K8.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K9.B16, B0.B16
+ TBZ $4, NR, singlesLast
+ AESMC B0.B16, B0.B16
+ AESE K10.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE B1.B16, B0.B16
+ TBZ $3, NR, singlesLast
+ AESMC B0.B16, B0.B16
+ AESE B2.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE B3.B16, B0.B16
+singlesLast:
+ VEOR T0.B16, B0.B16, B0.B16
+
+ VST1.P [B0.B16], 16(dstPtr)
+
+ VEOR ACC0.B16, B5.B16, B5.B16
+ VEXT $8, B5.B16, B5.B16, T0.B16
+ VEOR B5.B16, T0.B16, T0.B16
+ VPMULL B5.D1, T1.D1, ACC1.Q1
+ VPMULL2 B5.D2, T1.D2, ACC0.Q1
+ VPMULL T0.D1, T2.D1, ACCM.Q1
+ reduce()
+
+ B singlesLoop
+tail:
+ CBZ srcPtrLen, done
+
+ VREV32 CTR.B16, B0.B16
+ VADD CTR.S4, INC.S4, CTR.S4
+
+ AESE K0.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K1.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K2.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K3.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K4.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K5.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K6.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K7.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K8.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE K9.B16, B0.B16
+ TBZ $4, NR, tailLast
+ AESMC B0.B16, B0.B16
+ AESE K10.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE B1.B16, B0.B16
+ TBZ $3, NR, tailLast
+ AESMC B0.B16, B0.B16
+ AESE B2.B16, B0.B16
+ AESMC B0.B16, B0.B16
+ AESE B3.B16, B0.B16
+tailLast:
+ VEOR KLAST.B16, B0.B16, B0.B16
+
+ // Assuming it is safe to load past dstPtr due to the presence of the tag
+ VLD1 (srcPtr), [B5.B16]
+
+ VEOR B5.B16, B0.B16, B0.B16
+
+ VEOR T3.B16, T3.B16, T3.B16
+ MOVD $0, H1
+ SUB $1, H1
+
+ TBZ $3, srcPtrLen, ld4
+ VMOV B0.D[0], H0
+ MOVD.P H0, 8(dstPtr)
+ VMOV H1, T3.D[0]
+ VEXT $8, ZERO.B16, B0.B16, B0.B16
+ld4:
+ TBZ $2, srcPtrLen, ld2
+ VMOV B0.S[0], H0
+ MOVW.P H0, 4(dstPtr)
+ VEXT $12, T3.B16, ZERO.B16, T3.B16
+ VMOV H1, T3.S[0]
+ VEXT $4, ZERO.B16, B0.B16, B0.B16
+ld2:
+ TBZ $1, srcPtrLen, ld1
+ VMOV B0.H[0], H0
+ MOVH.P H0, 2(dstPtr)
+ VEXT $14, T3.B16, ZERO.B16, T3.B16
+ VMOV H1, T3.H[0]
+ VEXT $2, ZERO.B16, B0.B16, B0.B16
+ld1:
+ TBZ $0, srcPtrLen, ld0
+ VMOV B0.B[0], H0
+ MOVB.P H0, 1(dstPtr)
+ VEXT $15, T3.B16, ZERO.B16, T3.B16
+ VMOV H1, T3.B[0]
+ld0:
+
+ VAND T3.B16, B5.B16, B5.B16
+ VREV64 B5.B16, B5.B16
+
+ VEOR ACC0.B16, B5.B16, B5.B16
+ VEXT $8, B5.B16, B5.B16, T0.B16
+ VEOR B5.B16, T0.B16, T0.B16
+ VPMULL B5.D1, T1.D1, ACC1.Q1
+ VPMULL2 B5.D2, T1.D2, ACC0.Q1
+ VPMULL T0.D1, T2.D1, ACCM.Q1
+ reduce()
+done:
+ VST1 [ACC0.B16], (tPtr)
+
+ RET