Diffstat
-rw-r--r--	src/crypto/aes/asm_arm64.s	281
1 file changed, 281 insertions, 0 deletions
diff --git a/src/crypto/aes/asm_arm64.s b/src/crypto/aes/asm_arm64.s
new file mode 100644
index 0000000..13aee5c
--- /dev/null
+++ b/src/crypto/aes/asm_arm64.s
@@ -0,0 +1,281 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+DATA rotInvSRows<>+0x00(SB)/8, $0x080f0205040b0e01
+DATA rotInvSRows<>+0x08(SB)/8, $0x00070a0d0c030609
+GLOBL rotInvSRows<>(SB), (NOPTR+RODATA), $16
+DATA invSRows<>+0x00(SB)/8, $0x0b0e0104070a0d00
+DATA invSRows<>+0x08(SB)/8, $0x0306090c0f020508
+GLOBL invSRows<>(SB), (NOPTR+RODATA), $16
+// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
+TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
+	MOVD	nr+0(FP), R9
+	MOVD	xk+8(FP), R10
+	MOVD	dst+16(FP), R11
+	MOVD	src+24(FP), R12
+
+	VLD1	(R12), [V0.B16]
+
+	CMP	$12, R9
+	BLT	enc128
+	BEQ	enc196
+enc256:
+	VLD1.P	32(R10), [V1.B16, V2.B16]
+	AESE	V1.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+	AESE	V2.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+enc196:
+	VLD1.P	32(R10), [V3.B16, V4.B16]
+	AESE	V3.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+	AESE	V4.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+enc128:
+	VLD1.P	64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
+	VLD1.P	64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
+	VLD1.P	48(R10), [V13.B16, V14.B16, V15.B16]
+	AESE	V5.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+	AESE	V6.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+	AESE	V7.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+	AESE	V8.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+	AESE	V9.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+	AESE	V10.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+	AESE	V11.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+	AESE	V12.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+	AESE	V13.B16, V0.B16
+	AESMC	V0.B16, V0.B16
+	AESE	V14.B16, V0.B16
+	VEOR	V0.B16, V15.B16, V0.B16
+	VST1	[V0.B16], (R11)
+	RET
+
+// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
+TEXT ·decryptBlockAsm(SB),NOSPLIT,$0
+	MOVD	nr+0(FP), R9
+	MOVD	xk+8(FP), R10
+	MOVD	dst+16(FP), R11
+	MOVD	src+24(FP), R12
+
+	VLD1	(R12), [V0.B16]
+
+	CMP	$12, R9
+	BLT	dec128
+	BEQ	dec196
+dec256:
+	VLD1.P	32(R10), [V1.B16, V2.B16]
+	AESD	V1.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+	AESD	V2.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+dec196:
+	VLD1.P	32(R10), [V3.B16, V4.B16]
+	AESD	V3.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+	AESD	V4.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+dec128:
+	VLD1.P	64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
+	VLD1.P	64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
+	VLD1.P	48(R10), [V13.B16, V14.B16, V15.B16]
+	AESD	V5.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+	AESD	V6.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+	AESD	V7.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+	AESD	V8.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+	AESD	V9.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+	AESD	V10.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+	AESD	V11.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+	AESD	V12.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+	AESD	V13.B16, V0.B16
+	AESIMC	V0.B16, V0.B16
+	AESD	V14.B16, V0.B16
+	VEOR	V0.B16, V15.B16, V0.B16
+	VST1	[V0.B16], (R11)
+	RET
+
+// func expandKeyAsm(nr int, key *byte, enc, dec *uint32)
+// Note that round keys are stored in uint128 format, not uint32
+TEXT ·expandKeyAsm(SB),NOSPLIT,$0
+	MOVD	nr+0(FP), R8
+	MOVD	key+8(FP), R9
+	MOVD	enc+16(FP), R10
+	MOVD	dec+24(FP), R11
+	LDP	rotInvSRows<>(SB), (R0, R1)
+	VMOV	R0, V3.D[0]
+	VMOV	R1, V3.D[1]
+	VEOR	V0.B16, V0.B16, V0.B16 // All zeroes
+	MOVW	$1, R13
+	TBZ	$1, R8, ks192
+	TBNZ	$2, R8, ks256
+	LDPW	(R9), (R4, R5)
+	LDPW	8(R9), (R6, R7)
+	STPW.P	(R4, R5), 8(R10)
+	STPW.P	(R6, R7), 8(R10)
+	MOVW	$0x1b, R14
+ks128Loop:
+	VMOV	R7, V2.S[0]
+	WORD	$0x4E030042 // TBL V3.B16, [V2.B16], V2.B16
+	AESE	V0.B16, V2.B16 // Use AES to compute the SBOX
+	EORW	R13, R4
+	LSLW	$1, R13 // Compute next Rcon
+	ANDSW	$0x100, R13, ZR
+	CSELW	NE, R14, R13, R13 // Fake modulo
+	SUBS	$1, R8
+	VMOV	V2.S[0], R0
+	EORW	R0, R4
+	EORW	R4, R5
+	EORW	R5, R6
+	EORW	R6, R7
+	STPW.P	(R4, R5), 8(R10)
+	STPW.P	(R6, R7), 8(R10)
+	BNE	ks128Loop
+	CBZ	R11, ksDone // If dec is nil we are done
+	SUB	$176, R10
+	// Decryption keys are encryption keys with InverseMixColumns applied
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VMOV	V0.B16, V7.B16
+	AESIMC	V1.B16, V6.B16
+	AESIMC	V2.B16, V5.B16
+	AESIMC	V3.B16, V4.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V11.B16
+	AESIMC	V1.B16, V10.B16
+	AESIMC	V2.B16, V9.B16
+	AESIMC	V3.B16, V8.B16
+	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
+	AESIMC	V0.B16, V14.B16
+	AESIMC	V1.B16, V13.B16
+	VMOV	V2.B16, V12.B16
+	VST1.P	[V12.B16, V13.B16, V14.B16], 48(R11)
+	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
+	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
+	B	ksDone
+ks192:
+	LDPW	(R9), (R2, R3)
+	LDPW	8(R9), (R4, R5)
+	LDPW	16(R9), (R6, R7)
+	STPW.P	(R2, R3), 8(R10)
+	STPW.P	(R4, R5), 8(R10)
+	SUB	$4, R8
+ks192Loop:
+	STPW.P	(R6, R7), 8(R10)
+	VMOV	R7, V2.S[0]
+	WORD	$0x4E030042 // TBL V3.B16, [V2.B16], V2.B16
+	AESE	V0.B16, V2.B16
+	EORW	R13, R2
+	LSLW	$1, R13
+	SUBS	$1, R8
+	VMOV	V2.S[0], R0
+	EORW	R0, R2
+	EORW	R2, R3
+	EORW	R3, R4
+	EORW	R4, R5
+	EORW	R5, R6
+	EORW	R6, R7
+	STPW.P	(R2, R3), 8(R10)
+	STPW.P	(R4, R5), 8(R10)
+	BNE	ks192Loop
+	CBZ	R11, ksDone
+	SUB	$208, R10
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VMOV	V0.B16, V7.B16
+	AESIMC	V1.B16, V6.B16
+	AESIMC	V2.B16, V5.B16
+	AESIMC	V3.B16, V4.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V11.B16
+	AESIMC	V1.B16, V10.B16
+	AESIMC	V2.B16, V9.B16
+	AESIMC	V3.B16, V8.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V15.B16
+	AESIMC	V1.B16, V14.B16
+	AESIMC	V2.B16, V13.B16
+	AESIMC	V3.B16, V12.B16
+	VLD1	(R10), [V0.B16]
+	VST1.P	[V0.B16], 16(R11)
+	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
+	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
+	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
+	B	ksDone
+ks256:
+	LDP	invSRows<>(SB), (R0, R1)
+	VMOV	R0, V4.D[0]
+	VMOV	R1, V4.D[1]
+	LDPW	(R9), (R0, R1)
+	LDPW	8(R9), (R2, R3)
+	LDPW	16(R9), (R4, R5)
+	LDPW	24(R9), (R6, R7)
+	STPW.P	(R0, R1), 8(R10)
+	STPW.P	(R2, R3), 8(R10)
+	SUB	$7, R8
+ks256Loop:
+	STPW.P	(R4, R5), 8(R10)
+	STPW.P	(R6, R7), 8(R10)
+	VMOV	R7, V2.S[0]
+	WORD	$0x4E030042 // TBL V3.B16, [V2.B16], V2.B16
+	AESE	V0.B16, V2.B16
+	EORW	R13, R0
+	LSLW	$1, R13
+	SUBS	$1, R8
+	VMOV	V2.S[0], R9
+	EORW	R9, R0
+	EORW	R0, R1
+	EORW	R1, R2
+	EORW	R2, R3
+	VMOV	R3, V2.S[0]
+	WORD	$0x4E040042 // TBL V4.B16, [V2.B16], V2.B16
+	AESE	V0.B16, V2.B16
+	VMOV	V2.S[0], R9
+	EORW	R9, R4
+	EORW	R4, R5
+	EORW	R5, R6
+	EORW	R6, R7
+	STPW.P	(R0, R1), 8(R10)
+	STPW.P	(R2, R3), 8(R10)
+	BNE	ks256Loop
+	CBZ	R11, ksDone
+	SUB	$240, R10
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VMOV	V0.B16, V7.B16
+	AESIMC	V1.B16, V6.B16
+	AESIMC	V2.B16, V5.B16
+	AESIMC	V3.B16, V4.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V11.B16
+	AESIMC	V1.B16, V10.B16
+	AESIMC	V2.B16, V9.B16
+	AESIMC	V3.B16, V8.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V15.B16
+	AESIMC	V1.B16, V14.B16
+	AESIMC	V2.B16, V13.B16
+	AESIMC	V3.B16, V12.B16
+	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
+	AESIMC	V0.B16, V18.B16
+	AESIMC	V1.B16, V17.B16
+	VMOV	V2.B16, V16.B16
+	VST1.P	[V16.B16, V17.B16, V18.B16], 48(R11)
+	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
+	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
+	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
+ksDone:
+	RET
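
Some notes on the diff, for readers less familiar with Go assembly. Each TEXT symbol above needs a matching stub declaration on the Go side. The signatures below are taken directly from the // func comments in the assembly; the surrounding file (its name and exact layout) is an assumption for illustration and is not part of this diff. nr is the round count (10, 12 or 14), which the CMP $12 / BLT / BEQ dispatch turns into fall-through paths: AES-256 runs four extra rounds, AES-192 two, and all key sizes share the common ten-round tail.

// Sketch of the Go-side stub declarations (file name assumed, e.g. cipher_asm.go).
package aes

// Implemented in asm_arm64.s. nr is the round count and xk points at the
// expanded key schedule; //go:noescape asserts the pointers do not escape.

//go:noescape
func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)

//go:noescape
func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)

// Round keys are written out as 16-byte (uint128) values, per the note on
// expandKeyAsm above; dec may be nil when only encryption keys are needed,
// matching the CBZ R11, ksDone check.
//go:noescape
func expandKeyAsm(nr int, key *byte, enc, dec *uint32)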
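The key-schedule loops avoid a software S-box table by reusing the AES instructions: AESE with the all-zero round key in V0 performs AddRoundKey (a no-op with zero), ShiftRows and SubBytes, and the preceding TBL with rotInvSRows (or invSRows in ks256) pre-permutes the bytes so that ShiftRows cancels out and, for rotInvSRows, the RotWord rotation is folded in. In scalar terms each iteration computes the standard FIPS 197 recurrence. A minimal Go sketch, continuing the illustrative file above (the S-box table is left empty here because its contents are not part of this diff):

// sbox stands for the AES S-box of FIPS 197; values omitted in this sketch.
var sbox [256]byte

// rotWord rotates a word by one byte, as seen by the little-endian LDPW loads.
func rotWord(w uint32) uint32 { return w>>8 | w<<24 }

// subWord applies the S-box to each byte; one TBL + AESE pair in the
// assembly computes subWord(rotWord(w)) in a single step.
func subWord(w uint32) uint32 {
	return uint32(sbox[byte(w)]) |
		uint32(sbox[byte(w>>8)])<<8 |
		uint32(sbox[byte(w>>16)])<<16 |
		uint32(sbox[byte(w>>24)])<<24
}

// expandKey128 mirrors ks128Loop: R4..R7 hold the previous four schedule
// words, the EORW chain is the xor cascade below, and the STPW.P pairs
// store the four new words.
func expandKey128(key [4]uint32) [44]uint32 {
	var w [44]uint32
	copy(w[:4], key[:])
	rcon := uint32(1)
	for i := 4; i < 44; i += 4 {
		w[i] = w[i-4] ^ subWord(rotWord(w[i-1])) ^ rcon
		w[i+1] = w[i-3] ^ w[i]
		w[i+2] = w[i-2] ^ w[i+1]
		w[i+3] = w[i-1] ^ w[i+2]
		rcon = nextRcon(rcon) // see the Rcon sketch below
	}
	return w
}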
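The "Compute next Rcon" / "Fake modulo" pair is a branch-free update of the round constant: double it in GF(2^8) and, when the shift carries into bit 8, substitute the reduced value 0x1b kept in R14. The same update in Go terms (nextRcon is the sketch helper referenced above):

// nextRcon mirrors the LSLW / ANDSW / CSELW sequence. The assembly keeps
// 0x1b in R14 and uses a conditional select where this sketch branches.
func nextRcon(rcon uint32) uint32 {
	rcon <<= 1           // LSLW $1, R13
	if rcon&0x100 != 0 { // ANDSW $0x100, R13, ZR sets the flags
		rcon = 0x1b // CSELW NE, R14, R13, R13 picks R14
	}
	return rcon
}

This walks the sequence 0x01, 0x02, 0x04, ..., 0x80, 0x1b, 0x36 consumed by the ten AES-128 rounds.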
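Finally, the blocks after "Decryption keys are encryption keys with InverseMixColumns applied" build the equivalent inverse cipher of FIPS 197: the decryption schedule is the encryption schedule taken in reverse round order, with InvMixColumns applied to every round key except the first and last, which is why the outermost keys travel through VMOV while the middle ones go through AESIMC. A scalar model of the transform AESIMC performs on one 16-byte round key (same illustrative file; these helpers are sketches, not library APIs):

// mul multiplies in GF(2^8) modulo x^8+x^4+x^3+x+1 (schoolbook version).
func mul(a, b byte) byte {
	var p byte
	for i := 0; i < 8; i++ {
		if b&1 != 0 {
			p ^= a
		}
		hi := a & 0x80
		a <<= 1
		if hi != 0 {
			a ^= 0x1b
		}
		b >>= 1
	}
	return p
}

// invMixColumns applies the InvMixColumns matrix of FIPS 197 to one
// 16-byte round key, column by column, as AESIMC does per vector register.
func invMixColumns(rk *[16]byte) {
	for c := 0; c < 4; c++ {
		a0, a1, a2, a3 := rk[4*c], rk[4*c+1], rk[4*c+2], rk[4*c+3]
		rk[4*c+0] = mul(a0, 0x0e) ^ mul(a1, 0x0b) ^ mul(a2, 0x0d) ^ mul(a3, 0x09)
		rk[4*c+1] = mul(a0, 0x09) ^ mul(a1, 0x0e) ^ mul(a2, 0x0b) ^ mul(a3, 0x0d)
		rk[4*c+2] = mul(a0, 0x0d) ^ mul(a1, 0x09) ^ mul(a2, 0x0e) ^ mul(a3, 0x0b)
		rk[4*c+3] = mul(a0, 0x0b) ^ mul(a1, 0x0d) ^ mul(a2, 0x09) ^ mul(a3, 0x0e)
	}
}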