Diffstat:
 src/crypto/aes/asm_arm64.s | 281 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 281 insertions(+), 0 deletions(-)
diff --git a/src/crypto/aes/asm_arm64.s b/src/crypto/aes/asm_arm64.s
new file mode 100644
index 0000000..13aee5c
--- /dev/null
+++ b/src/crypto/aes/asm_arm64.s
@@ -0,0 +1,281 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
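+// rotInvSRows and invSRows are byte-index vectors for TBL. Permuting a word
+// with one of them and then running AESE against an all-zeroes round key
+// (which leaves only ShiftRows and SubBytes) puts SubWord(RotWord(w))
+// (rotInvSRows) or SubWord(w) (invSRows) in lane S[0]; key expansion uses this.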
+DATA rotInvSRows<>+0x00(SB)/8, $0x080f0205040b0e01
+DATA rotInvSRows<>+0x08(SB)/8, $0x00070a0d0c030609
+GLOBL rotInvSRows<>(SB), (NOPTR+RODATA), $16
+DATA invSRows<>+0x00(SB)/8, $0x0b0e0104070a0d00
+DATA invSRows<>+0x08(SB)/8, $0x0306090c0f020508
+GLOBL invSRows<>(SB), (NOPTR+RODATA), $16
+// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
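+// One full AES round is AESE (AddRoundKey, then SubBytes and ShiftRows)
+// followed by AESMC (MixColumns). enc256 falls through enc192 into enc128,
+// so larger keys simply run extra leading rounds; the final round skips
+// MixColumns and adds the last round key with a plain VEOR.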
+TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
+ MOVD nr+0(FP), R9
+ MOVD xk+8(FP), R10
+ MOVD dst+16(FP), R11
+ MOVD src+24(FP), R12
+
+ VLD1 (R12), [V0.B16]
+
+ CMP $12, R9
+ BLT enc128
+	BEQ	enc192
+enc256:
+ VLD1.P 32(R10), [V1.B16, V2.B16]
+ AESE V1.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V2.B16, V0.B16
+ AESMC V0.B16, V0.B16
+enc192:
+ VLD1.P 32(R10), [V3.B16, V4.B16]
+ AESE V3.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V4.B16, V0.B16
+ AESMC V0.B16, V0.B16
+enc128:
+ VLD1.P 64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
+ VLD1.P 64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
+ VLD1.P 48(R10), [V13.B16, V14.B16, V15.B16]
+ AESE V5.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V6.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V7.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V8.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V9.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V10.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V11.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V12.B16, V0.B16
+ AESMC V0.B16, V0.B16
+ AESE V13.B16, V0.B16
+ AESMC V0.B16, V0.B16
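+	// Last round: AESE with the second-to-last key, no MixColumns,
+	// then AddRoundKey with the final key via VEOR.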
+ AESE V14.B16, V0.B16
+ VEOR V0.B16, V15.B16, V0.B16
+ VST1 [V0.B16], (R11)
+ RET
+
+// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
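+// Decryption mirrors encryption using the equivalent inverse cipher: AESD
+// (AddRoundKey, then InvShiftRows and InvSubBytes) followed by AESIMC
+// (InvMixColumns), driven by the decryption schedule produced by expandKeyAsm
+// (encryption keys in reverse order, InvMixColumns applied to the inner ones).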
+TEXT ·decryptBlockAsm(SB),NOSPLIT,$0
+ MOVD nr+0(FP), R9
+ MOVD xk+8(FP), R10
+ MOVD dst+16(FP), R11
+ MOVD src+24(FP), R12
+
+ VLD1 (R12), [V0.B16]
+
+ CMP $12, R9
+ BLT dec128
+	BEQ	dec192
+dec256:
+ VLD1.P 32(R10), [V1.B16, V2.B16]
+ AESD V1.B16, V0.B16
+ AESIMC V0.B16, V0.B16
+ AESD V2.B16, V0.B16
+ AESIMC V0.B16, V0.B16
+dec192:
+ VLD1.P 32(R10), [V3.B16, V4.B16]
+ AESD V3.B16, V0.B16
+ AESIMC V0.B16, V0.B16
+ AESD V4.B16, V0.B16
+ AESIMC V0.B16, V0.B16
+dec128:
+ VLD1.P 64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
+ VLD1.P 64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
+ VLD1.P 48(R10), [V13.B16, V14.B16, V15.B16]
+ AESD V5.B16, V0.B16
+ AESIMC V0.B16, V0.B16
+ AESD V6.B16, V0.B16
+ AESIMC V0.B16, V0.B16
+ AESD V7.B16, V0.B16
+ AESIMC V0.B16, V0.B16
+ AESD V8.B16, V0.B16
+ AESIMC V0.B16, V0.B16
+ AESD V9.B16, V0.B16
+ AESIMC V0.B16, V0.B16
+ AESD V10.B16, V0.B16
+ AESIMC V0.B16, V0.B16
+ AESD V11.B16, V0.B16
+ AESIMC V0.B16, V0.B16
+ AESD V12.B16, V0.B16
+ AESIMC V0.B16, V0.B16
+ AESD V13.B16, V0.B16
+ AESIMC V0.B16, V0.B16
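+	// Last round: no InvMixColumns; the final key is added below with VEOR.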
+ AESD V14.B16, V0.B16
+ VEOR V0.B16, V15.B16, V0.B16
+ VST1 [V0.B16], (R11)
+ RET
+
+// func expandKeyAsm(nr int, key *byte, enc, dec *uint32)
+// Note that round keys are stored in uint128 format, not uint32
+TEXT ·expandKeyAsm(SB),NOSPLIT,$0
+ MOVD nr+0(FP), R8
+ MOVD key+8(FP), R9
+ MOVD enc+16(FP), R10
+ MOVD dec+24(FP), R11
+ LDP rotInvSRows<>(SB), (R0, R1)
+ VMOV R0, V3.D[0]
+ VMOV R1, V3.D[1]
+ VEOR V0.B16, V0.B16, V0.B16 // All zeroes
+ MOVW $1, R13
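+	// Dispatch on the round count: nr is 10, 12 or 14. Only 12 (AES-192)
+	// has bit 1 clear; of the rest, 14 (AES-256) has bit 2 set, so
+	// 10 (AES-128) falls through.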
+ TBZ $1, R8, ks192
+ TBNZ $2, R8, ks256
+ LDPW (R9), (R4, R5)
+ LDPW 8(R9), (R6, R7)
+ STPW.P (R4, R5), 8(R10)
+ STPW.P (R6, R7), 8(R10)
+ MOVW $0x1b, R14
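+	// 0x1b replaces Rcon when doubling overflows a byte (0x80<<1 = 0x100),
+	// i.e. reduction modulo the AES polynomial x^8+x^4+x^3+x+1.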
+ks128Loop:
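+	// Each iteration derives four more schedule words: the first is
+	// w[i-4] ^ SubWord(RotWord(w[i-1])) ^ Rcon, and the next three chain XORs.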
+ VMOV R7, V2.S[0]
+ WORD $0x4E030042 // TBL V3.B16, [V2.B16], V2.B16
+ AESE V0.B16, V2.B16 // Use AES to compute the SBOX
+ EORW R13, R4
+ LSLW $1, R13 // Compute next Rcon
+ ANDSW $0x100, R13, ZR
+ CSELW NE, R14, R13, R13 // Fake modulo
+ SUBS $1, R8
+ VMOV V2.S[0], R0
+ EORW R0, R4
+ EORW R4, R5
+ EORW R5, R6
+ EORW R6, R7
+ STPW.P (R4, R5), 8(R10)
+ STPW.P (R6, R7), 8(R10)
+ BNE ks128Loop
+ CBZ R11, ksDone // If dec is nil we are done
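+	// Rewind to the start of the 11 encryption round keys just written (11*16 = 176 bytes).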
+ SUB $176, R10
+	// Decryption keys are the encryption keys in reverse order, with
+	// InverseMixColumns applied to all but the first and last.
+ VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+ VMOV V0.B16, V7.B16
+ AESIMC V1.B16, V6.B16
+ AESIMC V2.B16, V5.B16
+ AESIMC V3.B16, V4.B16
+ VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+ AESIMC V0.B16, V11.B16
+ AESIMC V1.B16, V10.B16
+ AESIMC V2.B16, V9.B16
+ AESIMC V3.B16, V8.B16
+ VLD1 (R10), [V0.B16, V1.B16, V2.B16]
+ AESIMC V0.B16, V14.B16
+ AESIMC V1.B16, V13.B16
+ VMOV V2.B16, V12.B16
+ VST1.P [V12.B16, V13.B16, V14.B16], 48(R11)
+ VST1.P [V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
+ VST1 [V4.B16, V5.B16, V6.B16, V7.B16], (R11)
+ B ksDone
+ks192:
+ LDPW (R9), (R2, R3)
+ LDPW 8(R9), (R4, R5)
+ LDPW 16(R9), (R6, R7)
+ STPW.P (R2, R3), 8(R10)
+ STPW.P (R4, R5), 8(R10)
+ SUB $4, R8
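+	// 8 iterations, each producing 6 more words of the 52-word AES-192 schedule.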
+ks192Loop:
+ STPW.P (R6, R7), 8(R10)
+ VMOV R7, V2.S[0]
+	WORD	$0x4E030042	// TBL V3.B16, [V2.B16], V2.B16
+ AESE V0.B16, V2.B16
+ EORW R13, R2
+ LSLW $1, R13
+ SUBS $1, R8
+ VMOV V2.S[0], R0
+ EORW R0, R2
+ EORW R2, R3
+ EORW R3, R4
+ EORW R4, R5
+ EORW R5, R6
+ EORW R6, R7
+ STPW.P (R2, R3), 8(R10)
+ STPW.P (R4, R5), 8(R10)
+ BNE ks192Loop
+ CBZ R11, ksDone
+ SUB $208, R10
+ VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+ VMOV V0.B16, V7.B16
+ AESIMC V1.B16, V6.B16
+ AESIMC V2.B16, V5.B16
+ AESIMC V3.B16, V4.B16
+ VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+ AESIMC V0.B16, V11.B16
+ AESIMC V1.B16, V10.B16
+ AESIMC V2.B16, V9.B16
+ AESIMC V3.B16, V8.B16
+ VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+ AESIMC V0.B16, V15.B16
+ AESIMC V1.B16, V14.B16
+ AESIMC V2.B16, V13.B16
+ AESIMC V3.B16, V12.B16
+ VLD1 (R10), [V0.B16]
+ VST1.P [V0.B16], 16(R11)
+ VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
+ VST1.P [V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
+ VST1 [V4.B16, V5.B16, V6.B16, V7.B16], (R11)
+ B ksDone
+ks256:
+ LDP invSRows<>(SB), (R0, R1)
+ VMOV R0, V4.D[0]
+ VMOV R1, V4.D[1]
+ LDPW (R9), (R0, R1)
+ LDPW 8(R9), (R2, R3)
+ LDPW 16(R9), (R4, R5)
+ LDPW 24(R9), (R6, R7)
+ STPW.P (R0, R1), 8(R10)
+ STPW.P (R2, R3), 8(R10)
+ SUB $7, R8
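+	// 7 iterations, each producing 8 more words of the 60-word AES-256 schedule.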
+ks256Loop:
+ STPW.P (R4, R5), 8(R10)
+ STPW.P (R6, R7), 8(R10)
+ VMOV R7, V2.S[0]
+	WORD	$0x4E030042	// TBL V3.B16, [V2.B16], V2.B16
+ AESE V0.B16, V2.B16
+ EORW R13, R0
+ LSLW $1, R13
+ SUBS $1, R8
+ VMOV V2.S[0], R9
+ EORW R9, R0
+ EORW R0, R1
+ EORW R1, R2
+ EORW R2, R3
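+	// Second half of the iteration: SubWord without RotWord (invSRows in V4), no Rcon.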
+ VMOV R3, V2.S[0]
+	WORD	$0x4E040042	// TBL V4.B16, [V2.B16], V2.B16
+ AESE V0.B16, V2.B16
+ VMOV V2.S[0], R9
+ EORW R9, R4
+ EORW R4, R5
+ EORW R5, R6
+ EORW R6, R7
+ STPW.P (R0, R1), 8(R10)
+ STPW.P (R2, R3), 8(R10)
+ BNE ks256Loop
+ CBZ R11, ksDone
+ SUB $240, R10
+ VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+ VMOV V0.B16, V7.B16
+ AESIMC V1.B16, V6.B16
+ AESIMC V2.B16, V5.B16
+ AESIMC V3.B16, V4.B16
+ VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+ AESIMC V0.B16, V11.B16
+ AESIMC V1.B16, V10.B16
+ AESIMC V2.B16, V9.B16
+ AESIMC V3.B16, V8.B16
+ VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+ AESIMC V0.B16, V15.B16
+ AESIMC V1.B16, V14.B16
+ AESIMC V2.B16, V13.B16
+ AESIMC V3.B16, V12.B16
+ VLD1 (R10), [V0.B16, V1.B16, V2.B16]
+ AESIMC V0.B16, V18.B16
+ AESIMC V1.B16, V17.B16
+ VMOV V2.B16, V16.B16
+ VST1.P [V16.B16, V17.B16, V18.B16], 48(R11)
+ VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
+ VST1.P [V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
+ VST1 [V4.B16, V5.B16, V6.B16, V7.B16], (R11)
+ksDone:
+ RET