summaryrefslogtreecommitdiffstats
path: root/src/crypto/aes/gcm_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/crypto/aes/gcm_amd64.s')
-rw-r--r--src/crypto/aes/gcm_amd64.s1286
1 files changed, 1286 insertions, 0 deletions
diff --git a/src/crypto/aes/gcm_amd64.s b/src/crypto/aes/gcm_amd64.s
new file mode 100644
index 0000000..e6eedf3
--- /dev/null
+++ b/src/crypto/aes/gcm_amd64.s
@@ -0,0 +1,1286 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI
+// The implementation uses some optimization as described in:
+// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
+// Instruction and its Usage for Computing the GCM Mode rev. 2.02
+// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
+// Hardware
+
+#include "textflag.h"
+
+#define B0 X0
+#define B1 X1
+#define B2 X2
+#define B3 X3
+#define B4 X4
+#define B5 X5
+#define B6 X6
+#define B7 X7
+
+#define ACC0 X8
+#define ACC1 X9
+#define ACCM X10
+
+#define T0 X11
+#define T1 X12
+#define T2 X13
+#define POLY X14
+#define BSWAP X15
+
+DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
+DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
+
+DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
+DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
+
+DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
+DATA andMask<>+0x08(SB)/8, $0x0000000000000000
+DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
+DATA andMask<>+0x18(SB)/8, $0x0000000000000000
+DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
+DATA andMask<>+0x28(SB)/8, $0x0000000000000000
+DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
+DATA andMask<>+0x38(SB)/8, $0x0000000000000000
+DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
+DATA andMask<>+0x48(SB)/8, $0x0000000000000000
+DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
+DATA andMask<>+0x58(SB)/8, $0x0000000000000000
+DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
+DATA andMask<>+0x68(SB)/8, $0x0000000000000000
+DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
+DATA andMask<>+0x78(SB)/8, $0x0000000000000000
+DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
+DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
+DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
+DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
+DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
+DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
+DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
+DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
+DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
+DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
+DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
+DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
+DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
+DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
+
+GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
+GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
+GLOBL andMask<>(SB), (NOPTR+RODATA), $240
+
+// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
+TEXT ·gcmAesFinish(SB),NOSPLIT,$0
+#define pTbl DI
+#define tMsk SI
+#define tPtr DX
+#define plen AX
+#define dlen CX
+
+ MOVQ productTable+0(FP), pTbl
+ MOVQ tagMask+8(FP), tMsk
+ MOVQ T+16(FP), tPtr
+ MOVQ pLen+24(FP), plen
+ MOVQ dLen+32(FP), dlen
+
+ MOVOU (tPtr), ACC0
+ MOVOU (tMsk), T2
+
+ MOVOU bswapMask<>(SB), BSWAP
+ MOVOU gcmPoly<>(SB), POLY
+
+ SHLQ $3, plen
+ SHLQ $3, dlen
+
+ MOVQ plen, B0
+ PINSRQ $1, dlen, B0
+
+ PXOR ACC0, B0
+
+ MOVOU (16*14)(pTbl), ACC0
+ MOVOU (16*15)(pTbl), ACCM
+ MOVOU ACC0, ACC1
+
+ PCLMULQDQ $0x00, B0, ACC0
+ PCLMULQDQ $0x11, B0, ACC1
+ PSHUFD $78, B0, T0
+ PXOR B0, T0
+ PCLMULQDQ $0x00, T0, ACCM
+
+ PXOR ACC0, ACCM
+ PXOR ACC1, ACCM
+ MOVOU ACCM, T0
+ PSRLDQ $8, ACCM
+ PSLLDQ $8, T0
+ PXOR ACCM, ACC1
+ PXOR T0, ACC0
+
+ MOVOU POLY, T0
+ PCLMULQDQ $0x01, ACC0, T0
+ PSHUFD $78, ACC0, ACC0
+ PXOR T0, ACC0
+
+ MOVOU POLY, T0
+ PCLMULQDQ $0x01, ACC0, T0
+ PSHUFD $78, ACC0, ACC0
+ PXOR T0, ACC0
+
+ PXOR ACC1, ACC0
+
+ PSHUFB BSWAP, ACC0
+ PXOR T2, ACC0
+ MOVOU ACC0, (tPtr)
+
+ RET
+#undef pTbl
+#undef tMsk
+#undef tPtr
+#undef plen
+#undef dlen
+
+// func gcmAesInit(productTable *[256]byte, ks []uint32)
+TEXT ·gcmAesInit(SB),NOSPLIT,$0
+#define dst DI
+#define KS SI
+#define NR DX
+
+ MOVQ productTable+0(FP), dst
+ MOVQ ks_base+8(FP), KS
+ MOVQ ks_len+16(FP), NR
+
+ SHRQ $2, NR
+ DECQ NR
+
+ MOVOU bswapMask<>(SB), BSWAP
+ MOVOU gcmPoly<>(SB), POLY
+
+ // Encrypt block 0, with the AES key to generate the hash key H
+ MOVOU (16*0)(KS), B0
+ MOVOU (16*1)(KS), T0
+ AESENC T0, B0
+ MOVOU (16*2)(KS), T0
+ AESENC T0, B0
+ MOVOU (16*3)(KS), T0
+ AESENC T0, B0
+ MOVOU (16*4)(KS), T0
+ AESENC T0, B0
+ MOVOU (16*5)(KS), T0
+ AESENC T0, B0
+ MOVOU (16*6)(KS), T0
+ AESENC T0, B0
+ MOVOU (16*7)(KS), T0
+ AESENC T0, B0
+ MOVOU (16*8)(KS), T0
+ AESENC T0, B0
+ MOVOU (16*9)(KS), T0
+ AESENC T0, B0
+ MOVOU (16*10)(KS), T0
+ CMPQ NR, $12
+ JB initEncLast
+ AESENC T0, B0
+ MOVOU (16*11)(KS), T0
+ AESENC T0, B0
+ MOVOU (16*12)(KS), T0
+ JE initEncLast
+ AESENC T0, B0
+ MOVOU (16*13)(KS), T0
+ AESENC T0, B0
+ MOVOU (16*14)(KS), T0
+initEncLast:
+ AESENCLAST T0, B0
+
+ PSHUFB BSWAP, B0
+ // H * 2
+ PSHUFD $0xff, B0, T0
+ MOVOU B0, T1
+ PSRAL $31, T0
+ PAND POLY, T0
+ PSRLL $31, T1
+ PSLLDQ $4, T1
+ PSLLL $1, B0
+ PXOR T0, B0
+ PXOR T1, B0
+ // Karatsuba pre-computations
+ MOVOU B0, (16*14)(dst)
+ PSHUFD $78, B0, B1
+ PXOR B0, B1
+ MOVOU B1, (16*15)(dst)
+
+ MOVOU B0, B2
+ MOVOU B1, B3
+ // Now prepare powers of H and pre-computations for them
+ MOVQ $7, AX
+
+initLoop:
+ MOVOU B2, T0
+ MOVOU B2, T1
+ MOVOU B3, T2
+ PCLMULQDQ $0x00, B0, T0
+ PCLMULQDQ $0x11, B0, T1
+ PCLMULQDQ $0x00, B1, T2
+
+ PXOR T0, T2
+ PXOR T1, T2
+ MOVOU T2, B4
+ PSLLDQ $8, B4
+ PSRLDQ $8, T2
+ PXOR B4, T0
+ PXOR T2, T1
+
+ MOVOU POLY, B2
+ PCLMULQDQ $0x01, T0, B2
+ PSHUFD $78, T0, T0
+ PXOR B2, T0
+ MOVOU POLY, B2
+ PCLMULQDQ $0x01, T0, B2
+ PSHUFD $78, T0, T0
+ PXOR T0, B2
+ PXOR T1, B2
+
+ MOVOU B2, (16*12)(dst)
+ PSHUFD $78, B2, B3
+ PXOR B2, B3
+ MOVOU B3, (16*13)(dst)
+
+ DECQ AX
+ LEAQ (-16*2)(dst), dst
+ JNE initLoop
+
+ RET
+#undef NR
+#undef KS
+#undef dst
+
+// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
+TEXT ·gcmAesData(SB),NOSPLIT,$0
+#define pTbl DI
+#define aut SI
+#define tPtr CX
+#define autLen DX
+
+#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
+#define mulRoundAAD(X ,i) \
+ MOVOU (16*(i*2))(pTbl), T1;\
+ MOVOU T1, T2;\
+ PCLMULQDQ $0x00, X, T1;\
+ PXOR T1, ACC0;\
+ PCLMULQDQ $0x11, X, T2;\
+ PXOR T2, ACC1;\
+ PSHUFD $78, X, T1;\
+ PXOR T1, X;\
+ MOVOU (16*(i*2+1))(pTbl), T1;\
+ PCLMULQDQ $0x00, X, T1;\
+ PXOR T1, ACCM
+
+ MOVQ productTable+0(FP), pTbl
+ MOVQ data_base+8(FP), aut
+ MOVQ data_len+16(FP), autLen
+ MOVQ T+32(FP), tPtr
+
+ PXOR ACC0, ACC0
+ MOVOU bswapMask<>(SB), BSWAP
+ MOVOU gcmPoly<>(SB), POLY
+
+ TESTQ autLen, autLen
+ JEQ dataBail
+
+ CMPQ autLen, $13 // optimize the TLS case
+ JE dataTLS
+ CMPQ autLen, $128
+ JB startSinglesLoop
+ JMP dataOctaLoop
+
+dataTLS:
+ MOVOU (16*14)(pTbl), T1
+ MOVOU (16*15)(pTbl), T2
+ PXOR B0, B0
+ MOVQ (aut), B0
+ PINSRD $2, 8(aut), B0
+ PINSRB $12, 12(aut), B0
+ XORQ autLen, autLen
+ JMP dataMul
+
+dataOctaLoop:
+ CMPQ autLen, $128
+ JB startSinglesLoop
+ SUBQ $128, autLen
+
+ MOVOU (16*0)(aut), X0
+ MOVOU (16*1)(aut), X1
+ MOVOU (16*2)(aut), X2
+ MOVOU (16*3)(aut), X3
+ MOVOU (16*4)(aut), X4
+ MOVOU (16*5)(aut), X5
+ MOVOU (16*6)(aut), X6
+ MOVOU (16*7)(aut), X7
+ LEAQ (16*8)(aut), aut
+ PSHUFB BSWAP, X0
+ PSHUFB BSWAP, X1
+ PSHUFB BSWAP, X2
+ PSHUFB BSWAP, X3
+ PSHUFB BSWAP, X4
+ PSHUFB BSWAP, X5
+ PSHUFB BSWAP, X6
+ PSHUFB BSWAP, X7
+ PXOR ACC0, X0
+
+ MOVOU (16*0)(pTbl), ACC0
+ MOVOU (16*1)(pTbl), ACCM
+ MOVOU ACC0, ACC1
+ PSHUFD $78, X0, T1
+ PXOR X0, T1
+ PCLMULQDQ $0x00, X0, ACC0
+ PCLMULQDQ $0x11, X0, ACC1
+ PCLMULQDQ $0x00, T1, ACCM
+
+ mulRoundAAD(X1, 1)
+ mulRoundAAD(X2, 2)
+ mulRoundAAD(X3, 3)
+ mulRoundAAD(X4, 4)
+ mulRoundAAD(X5, 5)
+ mulRoundAAD(X6, 6)
+ mulRoundAAD(X7, 7)
+
+ PXOR ACC0, ACCM
+ PXOR ACC1, ACCM
+ MOVOU ACCM, T0
+ PSRLDQ $8, ACCM
+ PSLLDQ $8, T0
+ PXOR ACCM, ACC1
+ PXOR T0, ACC0
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR ACC1, ACC0
+ JMP dataOctaLoop
+
+startSinglesLoop:
+ MOVOU (16*14)(pTbl), T1
+ MOVOU (16*15)(pTbl), T2
+
+dataSinglesLoop:
+
+ CMPQ autLen, $16
+ JB dataEnd
+ SUBQ $16, autLen
+
+ MOVOU (aut), B0
+dataMul:
+ PSHUFB BSWAP, B0
+ PXOR ACC0, B0
+
+ MOVOU T1, ACC0
+ MOVOU T2, ACCM
+ MOVOU T1, ACC1
+
+ PSHUFD $78, B0, T0
+ PXOR B0, T0
+ PCLMULQDQ $0x00, B0, ACC0
+ PCLMULQDQ $0x11, B0, ACC1
+ PCLMULQDQ $0x00, T0, ACCM
+
+ PXOR ACC0, ACCM
+ PXOR ACC1, ACCM
+ MOVOU ACCM, T0
+ PSRLDQ $8, ACCM
+ PSLLDQ $8, T0
+ PXOR ACCM, ACC1
+ PXOR T0, ACC0
+
+ MOVOU POLY, T0
+ PCLMULQDQ $0x01, ACC0, T0
+ PSHUFD $78, ACC0, ACC0
+ PXOR T0, ACC0
+
+ MOVOU POLY, T0
+ PCLMULQDQ $0x01, ACC0, T0
+ PSHUFD $78, ACC0, ACC0
+ PXOR T0, ACC0
+ PXOR ACC1, ACC0
+
+ LEAQ 16(aut), aut
+
+ JMP dataSinglesLoop
+
+dataEnd:
+
+ TESTQ autLen, autLen
+ JEQ dataBail
+
+ PXOR B0, B0
+ LEAQ -1(aut)(autLen*1), aut
+
+dataLoadLoop:
+
+ PSLLDQ $1, B0
+ PINSRB $0, (aut), B0
+
+ LEAQ -1(aut), aut
+ DECQ autLen
+ JNE dataLoadLoop
+
+ JMP dataMul
+
+dataBail:
+ MOVOU ACC0, (tPtr)
+ RET
+#undef pTbl
+#undef aut
+#undef tPtr
+#undef autLen
+
+// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
+TEXT ·gcmAesEnc(SB),0,$256-96
+#define pTbl DI
+#define ctx DX
+#define ctrPtr CX
+#define ptx SI
+#define ks AX
+#define tPtr R8
+#define ptxLen R9
+#define aluCTR R10
+#define aluTMP R11
+#define aluK R12
+#define NR R13
+
+#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
+#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
+#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
+#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
+#define combinedRound(i) \
+ MOVOU (16*i)(ks), T0;\
+ AESENC T0, B0;\
+ AESENC T0, B1;\
+ AESENC T0, B2;\
+ AESENC T0, B3;\
+ MOVOU (16*(i*2))(pTbl), T1;\
+ MOVOU T1, T2;\
+ AESENC T0, B4;\
+ AESENC T0, B5;\
+ AESENC T0, B6;\
+ AESENC T0, B7;\
+ MOVOU (16*i)(SP), T0;\
+ PCLMULQDQ $0x00, T0, T1;\
+ PXOR T1, ACC0;\
+ PSHUFD $78, T0, T1;\
+ PCLMULQDQ $0x11, T0, T2;\
+ PXOR T1, T0;\
+ PXOR T2, ACC1;\
+ MOVOU (16*(i*2+1))(pTbl), T2;\
+ PCLMULQDQ $0x00, T2, T0;\
+ PXOR T0, ACCM
+#define mulRound(i) \
+ MOVOU (16*i)(SP), T0;\
+ MOVOU (16*(i*2))(pTbl), T1;\
+ MOVOU T1, T2;\
+ PCLMULQDQ $0x00, T0, T1;\
+ PXOR T1, ACC0;\
+ PCLMULQDQ $0x11, T0, T2;\
+ PXOR T2, ACC1;\
+ PSHUFD $78, T0, T1;\
+ PXOR T1, T0;\
+ MOVOU (16*(i*2+1))(pTbl), T1;\
+ PCLMULQDQ $0x00, T0, T1;\
+ PXOR T1, ACCM
+
+ MOVQ productTable+0(FP), pTbl
+ MOVQ dst+8(FP), ctx
+ MOVQ src_base+32(FP), ptx
+ MOVQ src_len+40(FP), ptxLen
+ MOVQ ctr+56(FP), ctrPtr
+ MOVQ T+64(FP), tPtr
+ MOVQ ks_base+72(FP), ks
+ MOVQ ks_len+80(FP), NR
+
+ SHRQ $2, NR
+ DECQ NR
+
+ MOVOU bswapMask<>(SB), BSWAP
+ MOVOU gcmPoly<>(SB), POLY
+
+ MOVOU (tPtr), ACC0
+ PXOR ACC1, ACC1
+ PXOR ACCM, ACCM
+ MOVOU (ctrPtr), B0
+ MOVL (3*4)(ctrPtr), aluCTR
+ MOVOU (ks), T0
+ MOVL (3*4)(ks), aluK
+ BSWAPL aluCTR
+ BSWAPL aluK
+
+ PXOR B0, T0
+ MOVOU T0, (8*16 + 0*16)(SP)
+ increment(0)
+
+ CMPQ ptxLen, $128
+ JB gcmAesEncSingles
+ SUBQ $128, ptxLen
+
+ // We have at least 8 blocks to encrypt, prepare the rest of the counters
+ MOVOU T0, (8*16 + 1*16)(SP)
+ increment(1)
+ MOVOU T0, (8*16 + 2*16)(SP)
+ increment(2)
+ MOVOU T0, (8*16 + 3*16)(SP)
+ increment(3)
+ MOVOU T0, (8*16 + 4*16)(SP)
+ increment(4)
+ MOVOU T0, (8*16 + 5*16)(SP)
+ increment(5)
+ MOVOU T0, (8*16 + 6*16)(SP)
+ increment(6)
+ MOVOU T0, (8*16 + 7*16)(SP)
+ increment(7)
+
+ MOVOU (8*16 + 0*16)(SP), B0
+ MOVOU (8*16 + 1*16)(SP), B1
+ MOVOU (8*16 + 2*16)(SP), B2
+ MOVOU (8*16 + 3*16)(SP), B3
+ MOVOU (8*16 + 4*16)(SP), B4
+ MOVOU (8*16 + 5*16)(SP), B5
+ MOVOU (8*16 + 6*16)(SP), B6
+ MOVOU (8*16 + 7*16)(SP), B7
+
+ aesRound(1)
+ increment(0)
+ aesRound(2)
+ increment(1)
+ aesRound(3)
+ increment(2)
+ aesRound(4)
+ increment(3)
+ aesRound(5)
+ increment(4)
+ aesRound(6)
+ increment(5)
+ aesRound(7)
+ increment(6)
+ aesRound(8)
+ increment(7)
+ aesRound(9)
+ MOVOU (16*10)(ks), T0
+ CMPQ NR, $12
+ JB encLast1
+ aesRnd(T0)
+ aesRound(11)
+ MOVOU (16*12)(ks), T0
+ JE encLast1
+ aesRnd(T0)
+ aesRound(13)
+ MOVOU (16*14)(ks), T0
+encLast1:
+ aesRndLast(T0)
+
+ MOVOU (16*0)(ptx), T0
+ PXOR T0, B0
+ MOVOU (16*1)(ptx), T0
+ PXOR T0, B1
+ MOVOU (16*2)(ptx), T0
+ PXOR T0, B2
+ MOVOU (16*3)(ptx), T0
+ PXOR T0, B3
+ MOVOU (16*4)(ptx), T0
+ PXOR T0, B4
+ MOVOU (16*5)(ptx), T0
+ PXOR T0, B5
+ MOVOU (16*6)(ptx), T0
+ PXOR T0, B6
+ MOVOU (16*7)(ptx), T0
+ PXOR T0, B7
+
+ MOVOU B0, (16*0)(ctx)
+ PSHUFB BSWAP, B0
+ PXOR ACC0, B0
+ MOVOU B1, (16*1)(ctx)
+ PSHUFB BSWAP, B1
+ MOVOU B2, (16*2)(ctx)
+ PSHUFB BSWAP, B2
+ MOVOU B3, (16*3)(ctx)
+ PSHUFB BSWAP, B3
+ MOVOU B4, (16*4)(ctx)
+ PSHUFB BSWAP, B4
+ MOVOU B5, (16*5)(ctx)
+ PSHUFB BSWAP, B5
+ MOVOU B6, (16*6)(ctx)
+ PSHUFB BSWAP, B6
+ MOVOU B7, (16*7)(ctx)
+ PSHUFB BSWAP, B7
+
+ MOVOU B0, (16*0)(SP)
+ MOVOU B1, (16*1)(SP)
+ MOVOU B2, (16*2)(SP)
+ MOVOU B3, (16*3)(SP)
+ MOVOU B4, (16*4)(SP)
+ MOVOU B5, (16*5)(SP)
+ MOVOU B6, (16*6)(SP)
+ MOVOU B7, (16*7)(SP)
+
+ LEAQ 128(ptx), ptx
+ LEAQ 128(ctx), ctx
+
+gcmAesEncOctetsLoop:
+
+ CMPQ ptxLen, $128
+ JB gcmAesEncOctetsEnd
+ SUBQ $128, ptxLen
+
+ MOVOU (8*16 + 0*16)(SP), B0
+ MOVOU (8*16 + 1*16)(SP), B1
+ MOVOU (8*16 + 2*16)(SP), B2
+ MOVOU (8*16 + 3*16)(SP), B3
+ MOVOU (8*16 + 4*16)(SP), B4
+ MOVOU (8*16 + 5*16)(SP), B5
+ MOVOU (8*16 + 6*16)(SP), B6
+ MOVOU (8*16 + 7*16)(SP), B7
+
+ MOVOU (16*0)(SP), T0
+ PSHUFD $78, T0, T1
+ PXOR T0, T1
+
+ MOVOU (16*0)(pTbl), ACC0
+ MOVOU (16*1)(pTbl), ACCM
+ MOVOU ACC0, ACC1
+
+ PCLMULQDQ $0x00, T1, ACCM
+ PCLMULQDQ $0x00, T0, ACC0
+ PCLMULQDQ $0x11, T0, ACC1
+
+ combinedRound(1)
+ increment(0)
+ combinedRound(2)
+ increment(1)
+ combinedRound(3)
+ increment(2)
+ combinedRound(4)
+ increment(3)
+ combinedRound(5)
+ increment(4)
+ combinedRound(6)
+ increment(5)
+ combinedRound(7)
+ increment(6)
+
+ aesRound(8)
+ increment(7)
+
+ PXOR ACC0, ACCM
+ PXOR ACC1, ACCM
+ MOVOU ACCM, T0
+ PSRLDQ $8, ACCM
+ PSLLDQ $8, T0
+ PXOR ACCM, ACC1
+ PXOR T0, ACC0
+
+ reduceRound(ACC0)
+ aesRound(9)
+
+ reduceRound(ACC0)
+ PXOR ACC1, ACC0
+
+ MOVOU (16*10)(ks), T0
+ CMPQ NR, $12
+ JB encLast2
+ aesRnd(T0)
+ aesRound(11)
+ MOVOU (16*12)(ks), T0
+ JE encLast2
+ aesRnd(T0)
+ aesRound(13)
+ MOVOU (16*14)(ks), T0
+encLast2:
+ aesRndLast(T0)
+
+ MOVOU (16*0)(ptx), T0
+ PXOR T0, B0
+ MOVOU (16*1)(ptx), T0
+ PXOR T0, B1
+ MOVOU (16*2)(ptx), T0
+ PXOR T0, B2
+ MOVOU (16*3)(ptx), T0
+ PXOR T0, B3
+ MOVOU (16*4)(ptx), T0
+ PXOR T0, B4
+ MOVOU (16*5)(ptx), T0
+ PXOR T0, B5
+ MOVOU (16*6)(ptx), T0
+ PXOR T0, B6
+ MOVOU (16*7)(ptx), T0
+ PXOR T0, B7
+
+ MOVOU B0, (16*0)(ctx)
+ PSHUFB BSWAP, B0
+ PXOR ACC0, B0
+ MOVOU B1, (16*1)(ctx)
+ PSHUFB BSWAP, B1
+ MOVOU B2, (16*2)(ctx)
+ PSHUFB BSWAP, B2
+ MOVOU B3, (16*3)(ctx)
+ PSHUFB BSWAP, B3
+ MOVOU B4, (16*4)(ctx)
+ PSHUFB BSWAP, B4
+ MOVOU B5, (16*5)(ctx)
+ PSHUFB BSWAP, B5
+ MOVOU B6, (16*6)(ctx)
+ PSHUFB BSWAP, B6
+ MOVOU B7, (16*7)(ctx)
+ PSHUFB BSWAP, B7
+
+ MOVOU B0, (16*0)(SP)
+ MOVOU B1, (16*1)(SP)
+ MOVOU B2, (16*2)(SP)
+ MOVOU B3, (16*3)(SP)
+ MOVOU B4, (16*4)(SP)
+ MOVOU B5, (16*5)(SP)
+ MOVOU B6, (16*6)(SP)
+ MOVOU B7, (16*7)(SP)
+
+ LEAQ 128(ptx), ptx
+ LEAQ 128(ctx), ctx
+
+ JMP gcmAesEncOctetsLoop
+
+gcmAesEncOctetsEnd:
+
+ MOVOU (16*0)(SP), T0
+ MOVOU (16*0)(pTbl), ACC0
+ MOVOU (16*1)(pTbl), ACCM
+ MOVOU ACC0, ACC1
+ PSHUFD $78, T0, T1
+ PXOR T0, T1
+ PCLMULQDQ $0x00, T0, ACC0
+ PCLMULQDQ $0x11, T0, ACC1
+ PCLMULQDQ $0x00, T1, ACCM
+
+ mulRound(1)
+ mulRound(2)
+ mulRound(3)
+ mulRound(4)
+ mulRound(5)
+ mulRound(6)
+ mulRound(7)
+
+ PXOR ACC0, ACCM
+ PXOR ACC1, ACCM
+ MOVOU ACCM, T0
+ PSRLDQ $8, ACCM
+ PSLLDQ $8, T0
+ PXOR ACCM, ACC1
+ PXOR T0, ACC0
+
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR ACC1, ACC0
+
+ TESTQ ptxLen, ptxLen
+ JE gcmAesEncDone
+
+ SUBQ $7, aluCTR
+
+gcmAesEncSingles:
+
+ MOVOU (16*1)(ks), B1
+ MOVOU (16*2)(ks), B2
+ MOVOU (16*3)(ks), B3
+ MOVOU (16*4)(ks), B4
+ MOVOU (16*5)(ks), B5
+ MOVOU (16*6)(ks), B6
+ MOVOU (16*7)(ks), B7
+
+ MOVOU (16*14)(pTbl), T2
+
+gcmAesEncSinglesLoop:
+
+ CMPQ ptxLen, $16
+ JB gcmAesEncTail
+ SUBQ $16, ptxLen
+
+ MOVOU (8*16 + 0*16)(SP), B0
+ increment(0)
+
+ AESENC B1, B0
+ AESENC B2, B0
+ AESENC B3, B0
+ AESENC B4, B0
+ AESENC B5, B0
+ AESENC B6, B0
+ AESENC B7, B0
+ MOVOU (16*8)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*9)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*10)(ks), T0
+ CMPQ NR, $12
+ JB encLast3
+ AESENC T0, B0
+ MOVOU (16*11)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*12)(ks), T0
+ JE encLast3
+ AESENC T0, B0
+ MOVOU (16*13)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*14)(ks), T0
+encLast3:
+ AESENCLAST T0, B0
+
+ MOVOU (ptx), T0
+ PXOR T0, B0
+ MOVOU B0, (ctx)
+
+ PSHUFB BSWAP, B0
+ PXOR ACC0, B0
+
+ MOVOU T2, ACC0
+ MOVOU T2, ACC1
+ MOVOU (16*15)(pTbl), ACCM
+
+ PSHUFD $78, B0, T0
+ PXOR B0, T0
+ PCLMULQDQ $0x00, B0, ACC0
+ PCLMULQDQ $0x11, B0, ACC1
+ PCLMULQDQ $0x00, T0, ACCM
+
+ PXOR ACC0, ACCM
+ PXOR ACC1, ACCM
+ MOVOU ACCM, T0
+ PSRLDQ $8, ACCM
+ PSLLDQ $8, T0
+ PXOR ACCM, ACC1
+ PXOR T0, ACC0
+
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR ACC1, ACC0
+
+ LEAQ (16*1)(ptx), ptx
+ LEAQ (16*1)(ctx), ctx
+
+ JMP gcmAesEncSinglesLoop
+
+gcmAesEncTail:
+ TESTQ ptxLen, ptxLen
+ JE gcmAesEncDone
+
+ MOVOU (8*16 + 0*16)(SP), B0
+ AESENC B1, B0
+ AESENC B2, B0
+ AESENC B3, B0
+ AESENC B4, B0
+ AESENC B5, B0
+ AESENC B6, B0
+ AESENC B7, B0
+ MOVOU (16*8)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*9)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*10)(ks), T0
+ CMPQ NR, $12
+ JB encLast4
+ AESENC T0, B0
+ MOVOU (16*11)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*12)(ks), T0
+ JE encLast4
+ AESENC T0, B0
+ MOVOU (16*13)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*14)(ks), T0
+encLast4:
+ AESENCLAST T0, B0
+ MOVOU B0, T0
+
+ LEAQ -1(ptx)(ptxLen*1), ptx
+
+ MOVQ ptxLen, aluTMP
+ SHLQ $4, aluTMP
+
+ LEAQ andMask<>(SB), aluCTR
+ MOVOU -16(aluCTR)(aluTMP*1), T1
+
+ PXOR B0, B0
+ptxLoadLoop:
+ PSLLDQ $1, B0
+ PINSRB $0, (ptx), B0
+ LEAQ -1(ptx), ptx
+ DECQ ptxLen
+ JNE ptxLoadLoop
+
+ PXOR T0, B0
+ PAND T1, B0
+ MOVOU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT
+
+ PSHUFB BSWAP, B0
+ PXOR ACC0, B0
+
+ MOVOU T2, ACC0
+ MOVOU T2, ACC1
+ MOVOU (16*15)(pTbl), ACCM
+
+ PSHUFD $78, B0, T0
+ PXOR B0, T0
+ PCLMULQDQ $0x00, B0, ACC0
+ PCLMULQDQ $0x11, B0, ACC1
+ PCLMULQDQ $0x00, T0, ACCM
+
+ PXOR ACC0, ACCM
+ PXOR ACC1, ACCM
+ MOVOU ACCM, T0
+ PSRLDQ $8, ACCM
+ PSLLDQ $8, T0
+ PXOR ACCM, ACC1
+ PXOR T0, ACC0
+
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR ACC1, ACC0
+
+gcmAesEncDone:
+ MOVOU ACC0, (tPtr)
+ RET
+#undef increment
+
+// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
+TEXT ·gcmAesDec(SB),0,$128-96
+#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
+#define combinedDecRound(i) \
+ MOVOU (16*i)(ks), T0;\
+ AESENC T0, B0;\
+ AESENC T0, B1;\
+ AESENC T0, B2;\
+ AESENC T0, B3;\
+ MOVOU (16*(i*2))(pTbl), T1;\
+ MOVOU T1, T2;\
+ AESENC T0, B4;\
+ AESENC T0, B5;\
+ AESENC T0, B6;\
+ AESENC T0, B7;\
+ MOVOU (16*i)(ctx), T0;\
+ PSHUFB BSWAP, T0;\
+ PCLMULQDQ $0x00, T0, T1;\
+ PXOR T1, ACC0;\
+ PSHUFD $78, T0, T1;\
+ PCLMULQDQ $0x11, T0, T2;\
+ PXOR T1, T0;\
+ PXOR T2, ACC1;\
+ MOVOU (16*(i*2+1))(pTbl), T2;\
+ PCLMULQDQ $0x00, T2, T0;\
+ PXOR T0, ACCM
+
+ MOVQ productTable+0(FP), pTbl
+ MOVQ dst+8(FP), ptx
+ MOVQ src_base+32(FP), ctx
+ MOVQ src_len+40(FP), ptxLen
+ MOVQ ctr+56(FP), ctrPtr
+ MOVQ T+64(FP), tPtr
+ MOVQ ks_base+72(FP), ks
+ MOVQ ks_len+80(FP), NR
+
+ SHRQ $2, NR
+ DECQ NR
+
+ MOVOU bswapMask<>(SB), BSWAP
+ MOVOU gcmPoly<>(SB), POLY
+
+ MOVOU (tPtr), ACC0
+ PXOR ACC1, ACC1
+ PXOR ACCM, ACCM
+ MOVOU (ctrPtr), B0
+ MOVL (3*4)(ctrPtr), aluCTR
+ MOVOU (ks), T0
+ MOVL (3*4)(ks), aluK
+ BSWAPL aluCTR
+ BSWAPL aluK
+
+ PXOR B0, T0
+ MOVOU T0, (0*16)(SP)
+ increment(0)
+
+ CMPQ ptxLen, $128
+ JB gcmAesDecSingles
+
+ MOVOU T0, (1*16)(SP)
+ increment(1)
+ MOVOU T0, (2*16)(SP)
+ increment(2)
+ MOVOU T0, (3*16)(SP)
+ increment(3)
+ MOVOU T0, (4*16)(SP)
+ increment(4)
+ MOVOU T0, (5*16)(SP)
+ increment(5)
+ MOVOU T0, (6*16)(SP)
+ increment(6)
+ MOVOU T0, (7*16)(SP)
+ increment(7)
+
+gcmAesDecOctetsLoop:
+
+ CMPQ ptxLen, $128
+ JB gcmAesDecEndOctets
+ SUBQ $128, ptxLen
+
+ MOVOU (0*16)(SP), B0
+ MOVOU (1*16)(SP), B1
+ MOVOU (2*16)(SP), B2
+ MOVOU (3*16)(SP), B3
+ MOVOU (4*16)(SP), B4
+ MOVOU (5*16)(SP), B5
+ MOVOU (6*16)(SP), B6
+ MOVOU (7*16)(SP), B7
+
+ MOVOU (16*0)(ctx), T0
+ PSHUFB BSWAP, T0
+ PXOR ACC0, T0
+ PSHUFD $78, T0, T1
+ PXOR T0, T1
+
+ MOVOU (16*0)(pTbl), ACC0
+ MOVOU (16*1)(pTbl), ACCM
+ MOVOU ACC0, ACC1
+
+ PCLMULQDQ $0x00, T1, ACCM
+ PCLMULQDQ $0x00, T0, ACC0
+ PCLMULQDQ $0x11, T0, ACC1
+
+ combinedDecRound(1)
+ increment(0)
+ combinedDecRound(2)
+ increment(1)
+ combinedDecRound(3)
+ increment(2)
+ combinedDecRound(4)
+ increment(3)
+ combinedDecRound(5)
+ increment(4)
+ combinedDecRound(6)
+ increment(5)
+ combinedDecRound(7)
+ increment(6)
+
+ aesRound(8)
+ increment(7)
+
+ PXOR ACC0, ACCM
+ PXOR ACC1, ACCM
+ MOVOU ACCM, T0
+ PSRLDQ $8, ACCM
+ PSLLDQ $8, T0
+ PXOR ACCM, ACC1
+ PXOR T0, ACC0
+
+ reduceRound(ACC0)
+ aesRound(9)
+
+ reduceRound(ACC0)
+ PXOR ACC1, ACC0
+
+ MOVOU (16*10)(ks), T0
+ CMPQ NR, $12
+ JB decLast1
+ aesRnd(T0)
+ aesRound(11)
+ MOVOU (16*12)(ks), T0
+ JE decLast1
+ aesRnd(T0)
+ aesRound(13)
+ MOVOU (16*14)(ks), T0
+decLast1:
+ aesRndLast(T0)
+
+ MOVOU (16*0)(ctx), T0
+ PXOR T0, B0
+ MOVOU (16*1)(ctx), T0
+ PXOR T0, B1
+ MOVOU (16*2)(ctx), T0
+ PXOR T0, B2
+ MOVOU (16*3)(ctx), T0
+ PXOR T0, B3
+ MOVOU (16*4)(ctx), T0
+ PXOR T0, B4
+ MOVOU (16*5)(ctx), T0
+ PXOR T0, B5
+ MOVOU (16*6)(ctx), T0
+ PXOR T0, B6
+ MOVOU (16*7)(ctx), T0
+ PXOR T0, B7
+
+ MOVOU B0, (16*0)(ptx)
+ MOVOU B1, (16*1)(ptx)
+ MOVOU B2, (16*2)(ptx)
+ MOVOU B3, (16*3)(ptx)
+ MOVOU B4, (16*4)(ptx)
+ MOVOU B5, (16*5)(ptx)
+ MOVOU B6, (16*6)(ptx)
+ MOVOU B7, (16*7)(ptx)
+
+ LEAQ 128(ptx), ptx
+ LEAQ 128(ctx), ctx
+
+ JMP gcmAesDecOctetsLoop
+
+gcmAesDecEndOctets:
+
+ SUBQ $7, aluCTR
+
+gcmAesDecSingles:
+
+ MOVOU (16*1)(ks), B1
+ MOVOU (16*2)(ks), B2
+ MOVOU (16*3)(ks), B3
+ MOVOU (16*4)(ks), B4
+ MOVOU (16*5)(ks), B5
+ MOVOU (16*6)(ks), B6
+ MOVOU (16*7)(ks), B7
+
+ MOVOU (16*14)(pTbl), T2
+
+gcmAesDecSinglesLoop:
+
+ CMPQ ptxLen, $16
+ JB gcmAesDecTail
+ SUBQ $16, ptxLen
+
+ MOVOU (ctx), B0
+ MOVOU B0, T1
+ PSHUFB BSWAP, B0
+ PXOR ACC0, B0
+
+ MOVOU T2, ACC0
+ MOVOU T2, ACC1
+ MOVOU (16*15)(pTbl), ACCM
+
+ PCLMULQDQ $0x00, B0, ACC0
+ PCLMULQDQ $0x11, B0, ACC1
+ PSHUFD $78, B0, T0
+ PXOR B0, T0
+ PCLMULQDQ $0x00, T0, ACCM
+
+ PXOR ACC0, ACCM
+ PXOR ACC1, ACCM
+ MOVOU ACCM, T0
+ PSRLDQ $8, ACCM
+ PSLLDQ $8, T0
+ PXOR ACCM, ACC1
+ PXOR T0, ACC0
+
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR ACC1, ACC0
+
+ MOVOU (0*16)(SP), B0
+ increment(0)
+ AESENC B1, B0
+ AESENC B2, B0
+ AESENC B3, B0
+ AESENC B4, B0
+ AESENC B5, B0
+ AESENC B6, B0
+ AESENC B7, B0
+ MOVOU (16*8)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*9)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*10)(ks), T0
+ CMPQ NR, $12
+ JB decLast2
+ AESENC T0, B0
+ MOVOU (16*11)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*12)(ks), T0
+ JE decLast2
+ AESENC T0, B0
+ MOVOU (16*13)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*14)(ks), T0
+decLast2:
+ AESENCLAST T0, B0
+
+ PXOR T1, B0
+ MOVOU B0, (ptx)
+
+ LEAQ (16*1)(ptx), ptx
+ LEAQ (16*1)(ctx), ctx
+
+ JMP gcmAesDecSinglesLoop
+
+gcmAesDecTail:
+
+ TESTQ ptxLen, ptxLen
+ JE gcmAesDecDone
+
+ MOVQ ptxLen, aluTMP
+ SHLQ $4, aluTMP
+ LEAQ andMask<>(SB), aluCTR
+ MOVOU -16(aluCTR)(aluTMP*1), T1
+
+ MOVOU (ctx), B0 // I assume there is TAG attached to the ctx, and there is no read overflow
+ PAND T1, B0
+
+ MOVOU B0, T1
+ PSHUFB BSWAP, B0
+ PXOR ACC0, B0
+
+ MOVOU (16*14)(pTbl), ACC0
+ MOVOU (16*15)(pTbl), ACCM
+ MOVOU ACC0, ACC1
+
+ PCLMULQDQ $0x00, B0, ACC0
+ PCLMULQDQ $0x11, B0, ACC1
+ PSHUFD $78, B0, T0
+ PXOR B0, T0
+ PCLMULQDQ $0x00, T0, ACCM
+
+ PXOR ACC0, ACCM
+ PXOR ACC1, ACCM
+ MOVOU ACCM, T0
+ PSRLDQ $8, ACCM
+ PSLLDQ $8, T0
+ PXOR ACCM, ACC1
+ PXOR T0, ACC0
+
+ reduceRound(ACC0)
+ reduceRound(ACC0)
+ PXOR ACC1, ACC0
+
+ MOVOU (0*16)(SP), B0
+ increment(0)
+ AESENC B1, B0
+ AESENC B2, B0
+ AESENC B3, B0
+ AESENC B4, B0
+ AESENC B5, B0
+ AESENC B6, B0
+ AESENC B7, B0
+ MOVOU (16*8)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*9)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*10)(ks), T0
+ CMPQ NR, $12
+ JB decLast3
+ AESENC T0, B0
+ MOVOU (16*11)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*12)(ks), T0
+ JE decLast3
+ AESENC T0, B0
+ MOVOU (16*13)(ks), T0
+ AESENC T0, B0
+ MOVOU (16*14)(ks), T0
+decLast3:
+ AESENCLAST T0, B0
+ PXOR T1, B0
+
+ptxStoreLoop:
+ PEXTRB $0, B0, (ptx)
+ PSRLDQ $1, B0
+ LEAQ 1(ptx), ptx
+ DECQ ptxLen
+
+ JNE ptxStoreLoop
+
+gcmAesDecDone:
+
+ MOVOU ACC0, (tPtr)
+ RET