summaryrefslogtreecommitdiffstats
path: root/src/crypto/aes/asm_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/crypto/aes/asm_amd64.s')
-rw-r--r--src/crypto/aes/asm_amd64.s274
1 files changed, 274 insertions, 0 deletions
diff --git a/src/crypto/aes/asm_amd64.s b/src/crypto/aes/asm_amd64.s
new file mode 100644
index 0000000..ad871ec
--- /dev/null
+++ b/src/crypto/aes/asm_amd64.s
@@ -0,0 +1,274 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
+TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
+ MOVQ nr+0(FP), CX
+ MOVQ xk+8(FP), AX
+ MOVQ dst+16(FP), DX
+ MOVQ src+24(FP), BX
+ MOVUPS 0(AX), X1
+ MOVUPS 0(BX), X0
+ ADDQ $16, AX
+ PXOR X1, X0
+ SUBQ $12, CX
+ JE Lenc196
+ JB Lenc128
+Lenc256:
+ MOVUPS 0(AX), X1
+ AESENC X1, X0
+ MOVUPS 16(AX), X1
+ AESENC X1, X0
+ ADDQ $32, AX
+Lenc196:
+ MOVUPS 0(AX), X1
+ AESENC X1, X0
+ MOVUPS 16(AX), X1
+ AESENC X1, X0
+ ADDQ $32, AX
+Lenc128:
+ MOVUPS 0(AX), X1
+ AESENC X1, X0
+ MOVUPS 16(AX), X1
+ AESENC X1, X0
+ MOVUPS 32(AX), X1
+ AESENC X1, X0
+ MOVUPS 48(AX), X1
+ AESENC X1, X0
+ MOVUPS 64(AX), X1
+ AESENC X1, X0
+ MOVUPS 80(AX), X1
+ AESENC X1, X0
+ MOVUPS 96(AX), X1
+ AESENC X1, X0
+ MOVUPS 112(AX), X1
+ AESENC X1, X0
+ MOVUPS 128(AX), X1
+ AESENC X1, X0
+ MOVUPS 144(AX), X1
+ AESENCLAST X1, X0
+ MOVUPS X0, 0(DX)
+ RET
+
+// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
+TEXT ·decryptBlockAsm(SB),NOSPLIT,$0
+ MOVQ nr+0(FP), CX
+ MOVQ xk+8(FP), AX
+ MOVQ dst+16(FP), DX
+ MOVQ src+24(FP), BX
+ MOVUPS 0(AX), X1
+ MOVUPS 0(BX), X0
+ ADDQ $16, AX
+ PXOR X1, X0
+ SUBQ $12, CX
+ JE Ldec196
+ JB Ldec128
+Ldec256:
+ MOVUPS 0(AX), X1
+ AESDEC X1, X0
+ MOVUPS 16(AX), X1
+ AESDEC X1, X0
+ ADDQ $32, AX
+Ldec196:
+ MOVUPS 0(AX), X1
+ AESDEC X1, X0
+ MOVUPS 16(AX), X1
+ AESDEC X1, X0
+ ADDQ $32, AX
+Ldec128:
+ MOVUPS 0(AX), X1
+ AESDEC X1, X0
+ MOVUPS 16(AX), X1
+ AESDEC X1, X0
+ MOVUPS 32(AX), X1
+ AESDEC X1, X0
+ MOVUPS 48(AX), X1
+ AESDEC X1, X0
+ MOVUPS 64(AX), X1
+ AESDEC X1, X0
+ MOVUPS 80(AX), X1
+ AESDEC X1, X0
+ MOVUPS 96(AX), X1
+ AESDEC X1, X0
+ MOVUPS 112(AX), X1
+ AESDEC X1, X0
+ MOVUPS 128(AX), X1
+ AESDEC X1, X0
+ MOVUPS 144(AX), X1
+ AESDECLAST X1, X0
+ MOVUPS X0, 0(DX)
+ RET
+
+// func expandKeyAsm(nr int, key *byte, enc, dec *uint32) {
+// Note that round keys are stored in uint128 format, not uint32
+TEXT ·expandKeyAsm(SB),NOSPLIT,$0
+ MOVQ nr+0(FP), CX
+ MOVQ key+8(FP), AX
+ MOVQ enc+16(FP), BX
+ MOVQ dec+24(FP), DX
+ MOVUPS (AX), X0
+ // enc
+ MOVUPS X0, (BX)
+ ADDQ $16, BX
+ PXOR X4, X4 // _expand_key_* expect X4 to be zero
+ CMPL CX, $12
+ JE Lexp_enc196
+ JB Lexp_enc128
+Lexp_enc256:
+ MOVUPS 16(AX), X2
+ MOVUPS X2, (BX)
+ ADDQ $16, BX
+ AESKEYGENASSIST $0x01, X2, X1
+ CALL _expand_key_256a<>(SB)
+ AESKEYGENASSIST $0x01, X0, X1
+ CALL _expand_key_256b<>(SB)
+ AESKEYGENASSIST $0x02, X2, X1
+ CALL _expand_key_256a<>(SB)
+ AESKEYGENASSIST $0x02, X0, X1
+ CALL _expand_key_256b<>(SB)
+ AESKEYGENASSIST $0x04, X2, X1
+ CALL _expand_key_256a<>(SB)
+ AESKEYGENASSIST $0x04, X0, X1
+ CALL _expand_key_256b<>(SB)
+ AESKEYGENASSIST $0x08, X2, X1
+ CALL _expand_key_256a<>(SB)
+ AESKEYGENASSIST $0x08, X0, X1
+ CALL _expand_key_256b<>(SB)
+ AESKEYGENASSIST $0x10, X2, X1
+ CALL _expand_key_256a<>(SB)
+ AESKEYGENASSIST $0x10, X0, X1
+ CALL _expand_key_256b<>(SB)
+ AESKEYGENASSIST $0x20, X2, X1
+ CALL _expand_key_256a<>(SB)
+ AESKEYGENASSIST $0x20, X0, X1
+ CALL _expand_key_256b<>(SB)
+ AESKEYGENASSIST $0x40, X2, X1
+ CALL _expand_key_256a<>(SB)
+ JMP Lexp_dec
+Lexp_enc196:
+ MOVQ 16(AX), X2
+ AESKEYGENASSIST $0x01, X2, X1
+ CALL _expand_key_192a<>(SB)
+ AESKEYGENASSIST $0x02, X2, X1
+ CALL _expand_key_192b<>(SB)
+ AESKEYGENASSIST $0x04, X2, X1
+ CALL _expand_key_192a<>(SB)
+ AESKEYGENASSIST $0x08, X2, X1
+ CALL _expand_key_192b<>(SB)
+ AESKEYGENASSIST $0x10, X2, X1
+ CALL _expand_key_192a<>(SB)
+ AESKEYGENASSIST $0x20, X2, X1
+ CALL _expand_key_192b<>(SB)
+ AESKEYGENASSIST $0x40, X2, X1
+ CALL _expand_key_192a<>(SB)
+ AESKEYGENASSIST $0x80, X2, X1
+ CALL _expand_key_192b<>(SB)
+ JMP Lexp_dec
+Lexp_enc128:
+ AESKEYGENASSIST $0x01, X0, X1
+ CALL _expand_key_128<>(SB)
+ AESKEYGENASSIST $0x02, X0, X1
+ CALL _expand_key_128<>(SB)
+ AESKEYGENASSIST $0x04, X0, X1
+ CALL _expand_key_128<>(SB)
+ AESKEYGENASSIST $0x08, X0, X1
+ CALL _expand_key_128<>(SB)
+ AESKEYGENASSIST $0x10, X0, X1
+ CALL _expand_key_128<>(SB)
+ AESKEYGENASSIST $0x20, X0, X1
+ CALL _expand_key_128<>(SB)
+ AESKEYGENASSIST $0x40, X0, X1
+ CALL _expand_key_128<>(SB)
+ AESKEYGENASSIST $0x80, X0, X1
+ CALL _expand_key_128<>(SB)
+ AESKEYGENASSIST $0x1b, X0, X1
+ CALL _expand_key_128<>(SB)
+ AESKEYGENASSIST $0x36, X0, X1
+ CALL _expand_key_128<>(SB)
+Lexp_dec:
+ // dec
+ SUBQ $16, BX
+ MOVUPS (BX), X1
+ MOVUPS X1, (DX)
+ DECQ CX
+Lexp_dec_loop:
+ MOVUPS -16(BX), X1
+ AESIMC X1, X0
+ MOVUPS X0, 16(DX)
+ SUBQ $16, BX
+ ADDQ $16, DX
+ DECQ CX
+ JNZ Lexp_dec_loop
+ MOVUPS -16(BX), X0
+ MOVUPS X0, 16(DX)
+ RET
+
+TEXT _expand_key_128<>(SB),NOSPLIT,$0
+ PSHUFD $0xff, X1, X1
+ SHUFPS $0x10, X0, X4
+ PXOR X4, X0
+ SHUFPS $0x8c, X0, X4
+ PXOR X4, X0
+ PXOR X1, X0
+ MOVUPS X0, (BX)
+ ADDQ $16, BX
+ RET
+
+TEXT _expand_key_192a<>(SB),NOSPLIT,$0
+ PSHUFD $0x55, X1, X1
+ SHUFPS $0x10, X0, X4
+ PXOR X4, X0
+ SHUFPS $0x8c, X0, X4
+ PXOR X4, X0
+ PXOR X1, X0
+
+ MOVAPS X2, X5
+ MOVAPS X2, X6
+ PSLLDQ $0x4, X5
+ PSHUFD $0xff, X0, X3
+ PXOR X3, X2
+ PXOR X5, X2
+
+ MOVAPS X0, X1
+ SHUFPS $0x44, X0, X6
+ MOVUPS X6, (BX)
+ SHUFPS $0x4e, X2, X1
+ MOVUPS X1, 16(BX)
+ ADDQ $32, BX
+ RET
+
+TEXT _expand_key_192b<>(SB),NOSPLIT,$0
+ PSHUFD $0x55, X1, X1
+ SHUFPS $0x10, X0, X4
+ PXOR X4, X0
+ SHUFPS $0x8c, X0, X4
+ PXOR X4, X0
+ PXOR X1, X0
+
+ MOVAPS X2, X5
+ PSLLDQ $0x4, X5
+ PSHUFD $0xff, X0, X3
+ PXOR X3, X2
+ PXOR X5, X2
+
+ MOVUPS X0, (BX)
+ ADDQ $16, BX
+ RET
+
+TEXT _expand_key_256a<>(SB),NOSPLIT,$0
+ JMP _expand_key_128<>(SB)
+
+TEXT _expand_key_256b<>(SB),NOSPLIT,$0
+ PSHUFD $0xaa, X1, X1
+ SHUFPS $0x10, X2, X4
+ PXOR X4, X2
+ SHUFPS $0x8c, X2, X4
+ PXOR X4, X2
+ PXOR X1, X2
+
+ MOVUPS X2, (BX)
+ ADDQ $16, BX
+ RET