author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-16 19:23:18 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-16 19:23:18 +0000
commit     43a123c1ae6613b3efeed291fa552ecd909d3acf (patch)
tree       fd92518b7024bc74031f78a1cf9e454b65e73665 /src/crypto/aes/asm_ppc64x.s
parent     Initial commit. (diff)
download   golang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.tar.xz
           golang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.zip
Adding upstream version 1.20.14. (refs: upstream/1.20.14, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crypto/aes/asm_ppc64x.s')
-rw-r--r--   src/crypto/aes/asm_ppc64x.s   675
1 file changed, 675 insertions(+), 0 deletions(-)
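The diff below adds the ppc64/ppc64le AES assembly for Go's crypto/aes package. For orientation, here is a minimal sketch of the Go-side stub declarations this assembly backs. The names and signatures are taken from the function comments inside the file itself (note that the comment above the first routine still reads setEncryptKeyAsm, while the symbol actually defined is expandKeyAsm); which .go file these declarations live in and the //go:noescape directives are assumptions, not a copy of the Go source in this release.

//go:build ppc64 || ppc64le

package aes

// expandKeyAsm writes the encryption key schedule to enc and the
// decryption schedule (the same round keys stored in reverse 16-byte
// order) to dec. nr is the AES round count: 10, 12 or 14.
//go:noescape
func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)

// encryptBlockAsm and decryptBlockAsm each process exactly one 16-byte
// block with the expanded schedule xk; any nr other than 10/12/14
// reaches Linvalid_key_len and faults deliberately.
//go:noescape
func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)

//go:noescape
func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)

// cryptBlocksChain is the CBC fast path: length must be a positive
// multiple of 16, enc selects encryption (non-zero) or decryption (0),
// and the chaining value for the next call is written back to iv.
//go:noescape
func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)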
diff --git a/src/crypto/aes/asm_ppc64x.s b/src/crypto/aes/asm_ppc64x.s new file mode 100644 index 0000000..8ac97ec --- /dev/null +++ b/src/crypto/aes/asm_ppc64x.s @@ -0,0 +1,675 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ppc64 || ppc64le + +// Based on CRYPTOGAMS code with the following comment: +// # ==================================================================== +// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +// # project. The module is, however, dual licensed under OpenSSL and +// # CRYPTOGAMS licenses depending on where you obtain it. For further +// # details see http://www.openssl.org/~appro/cryptogams/. +// # ==================================================================== + +// Original code can be found at the link below: +// https://github.com/dot-asm/cryptogams/blob/master/ppc/aesp8-ppc.pl + +// Some function names were changed to be consistent with Go function +// names. For instance, function aes_p8_set_{en,de}crypt_key become +// set{En,De}cryptKeyAsm. I also split setEncryptKeyAsm in two parts +// and a new session was created (doEncryptKeyAsm). This was necessary to +// avoid arguments overwriting when setDecryptKeyAsm calls setEncryptKeyAsm. +// There were other modifications as well but kept the same functionality. + +#include "textflag.h" + +// For expandKeyAsm +#define INP R3 +#define BITS R4 +#define OUTENC R5 // Pointer to next expanded encrypt key +#define PTR R6 +#define CNT R7 +#define ROUNDS R8 +#define OUTDEC R9 // Pointer to next expanded decrypt key +#define TEMP R19 +#define ZERO V0 +#define IN0 V1 +#define IN1 V2 +#define KEY V3 +#define RCON V4 +#define MASK V5 +#define TMP V6 +#define STAGE V7 +#define OUTPERM V8 +#define OUTMASK V9 +#define OUTHEAD V10 +#define OUTTAIL V11 + +// For P9 instruction emulation +#define ESPERM V21 // Endian swapping permute into BE +#define TMP2 V22 // Temporary for P8_STXVB16X/P8_STXVB16X + +// For {en,de}cryptBlockAsm +#define BLK_INP R3 +#define BLK_OUT R4 +#define BLK_KEY R5 +#define BLK_ROUNDS R6 +#define BLK_IDX R7 + +DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap +DATA ·rcon+0x08(SB)/8, $0x0706050403020100 +DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON +DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON +DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000 +DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000 +DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK +DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK +DATA ·rcon+0x40(SB)/8, $0x0000000000000000 +DATA ·rcon+0x48(SB)/8, $0x0000000000000000 +GLOBL ·rcon(SB), RODATA, $80 + +#ifdef GOARCH_ppc64le +# ifdef GOPPC64_power9 +#define P8_LXVB16X(RA,RB,VT) LXVB16X (RA+RB), VT +#define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA+RB) +#define XXBRD_ON_LE(VA,VT) XXBRD VA, VT +# else +// On POWER8/ppc64le, emulate the POWER9 instructions by loading unaligned +// doublewords and byte-swapping each doubleword to emulate BE load/stores. 
+#define NEEDS_ESPERM +#define P8_LXVB16X(RA,RB,VT) \ + LXVD2X (RA+RB), VT \ + VPERM VT, VT, ESPERM, VT + +#define P8_STXVB16X(VS,RA,RB) \ + VPERM VS, VS, ESPERM, TMP2 \ + STXVD2X TMP2, (RA+RB) + +#define XXBRD_ON_LE(VA,VT) \ + VPERM VA, VA, ESPERM, VT + +# endif // defined(GOPPC64_power9) +#else +#define P8_LXVB16X(RA,RB,VT) LXVD2X (RA+RB), VT +#define P8_STXVB16X(VS,RA,RB) STXVD2X VS, (RA+RB) +#define XXBRD_ON_LE(VA, VT) +#endif // defined(GOARCH_ppc64le) + +// func setEncryptKeyAsm(nr int, key *byte, enc *uint32, dec *uint32) +TEXT ·expandKeyAsm(SB), NOSPLIT|NOFRAME, $0 + // Load the arguments inside the registers + MOVD nr+0(FP), ROUNDS + MOVD key+8(FP), INP + MOVD enc+16(FP), OUTENC + MOVD dec+24(FP), OUTDEC + +#ifdef NEEDS_ESPERM + MOVD $·rcon(SB), PTR // PTR points to rcon addr + LVX (PTR), ESPERM + ADD $0x10, PTR +#else + MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector) +#endif + + // Get key from memory and write aligned into VR + P8_LXVB16X(INP, R0, IN0) + ADD $0x10, INP, INP + MOVD $0x20, TEMP + + CMPW ROUNDS, $12 + LVX (PTR)(R0), RCON // lvx 4,0,6 Load first 16 bytes into RCON + LVX (PTR)(TEMP), MASK + ADD $0x10, PTR, PTR // addi 6,6,0x10 PTR to next 16 bytes of RCON + MOVD $8, CNT // li 7,8 CNT = 8 + VXOR ZERO, ZERO, ZERO // vxor 0,0,0 Zero to be zero :) + MOVD CNT, CTR // mtctr 7 Set the counter to 8 (rounds) + + // The expanded decrypt key is the expanded encrypt key stored in reverse order. + // Move OUTDEC to the last key location, and store in descending order. + ADD $160, OUTDEC, OUTDEC + BLT loop128 + ADD $32, OUTDEC, OUTDEC + BEQ l192 + ADD $32, OUTDEC, OUTDEC + JMP l256 + +loop128: + // Key schedule (Round 1 to 8) + VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat + VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12 + STXVD2X IN0, (R0+OUTENC) + STXVD2X IN0, (R0+OUTDEC) + VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4 + ADD $16, OUTENC, OUTENC + ADD $-16, OUTDEC, OUTDEC + + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VADDUWM RCON, RCON, RCON // vadduwm 4,4,4 + VXOR IN0, KEY, IN0 // vxor 1,1,3 + BC 0x10, 0, loop128 // bdnz .Loop128 + + LVX (PTR)(R0), RCON // lvx 4,0,6 Last two round keys + + // Key schedule (Round 9) + VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-spat + VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12 + STXVD2X IN0, (R0+OUTENC) + STXVD2X IN0, (R0+OUTDEC) + VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4 + ADD $16, OUTENC, OUTENC + ADD $-16, OUTDEC, OUTDEC + + // Key schedule (Round 10) + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VADDUWM RCON, RCON, RCON // vadduwm 4,4,4 + VXOR IN0, KEY, IN0 // vxor 1,1,3 + + VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat + VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12 + STXVD2X IN0, (R0+OUTENC) + STXVD2X IN0, (R0+OUTDEC) + VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4 + ADD $16, OUTENC, OUTENC + ADD $-16, OUTDEC, OUTDEC + + // Key schedule (Round 11) + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VXOR IN0, KEY, IN0 // vxor 1,1,3 + STXVD2X IN0, (R0+OUTENC) + STXVD2X IN0, (R0+OUTDEC) + + RET + +l192: + LXSDX 
(INP+R0), IN1 // Load next 8 bytes into upper half of VSR. + XXBRD_ON_LE(IN1, IN1) // and convert to BE ordering on LE hosts. + MOVD $4, CNT // li 7,4 + STXVD2X IN0, (R0+OUTENC) + STXVD2X IN0, (R0+OUTDEC) + ADD $16, OUTENC, OUTENC + ADD $-16, OUTDEC, OUTDEC + VSPLTISB $8, KEY // vspltisb 3,8 + MOVD CNT, CTR // mtctr 7 + VSUBUBM MASK, KEY, MASK // vsububm 5,5,3 + +loop192: + VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5 + VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12 + VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4 + + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN0, TMP, IN0 // vxor 1,1,6 + + VSLDOI $8, ZERO, IN1, STAGE // vsldoi 7,0,2,8 + VSPLTW $3, IN0, TMP // vspltw 6,1,3 + VXOR TMP, IN1, TMP // vxor 6,6,2 + VSLDOI $12, ZERO, IN1, IN1 // vsldoi 2,0,2,12 + VADDUWM RCON, RCON, RCON // vadduwm 4,4,4 + VXOR IN1, TMP, IN1 // vxor 2,2,6 + VXOR IN0, KEY, IN0 // vxor 1,1,3 + VXOR IN1, KEY, IN1 // vxor 2,2,3 + VSLDOI $8, STAGE, IN0, STAGE // vsldoi 7,7,1,8 + + VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5 + VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12 + STXVD2X STAGE, (R0+OUTENC) + STXVD2X STAGE, (R0+OUTDEC) + VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4 + ADD $16, OUTENC, OUTENC + ADD $-16, OUTDEC, OUTDEC + + VSLDOI $8, IN0, IN1, STAGE // vsldoi 7,1,2,8 + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + STXVD2X STAGE, (R0+OUTENC) + STXVD2X STAGE, (R0+OUTDEC) + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN0, TMP, IN0 // vxor 1,1,6 + ADD $16, OUTENC, OUTENC + ADD $-16, OUTDEC, OUTDEC + + VSPLTW $3, IN0, TMP // vspltw 6,1,3 + VXOR TMP, IN1, TMP // vxor 6,6,2 + VSLDOI $12, ZERO, IN1, IN1 // vsldoi 2,0,2,12 + VADDUWM RCON, RCON, RCON // vadduwm 4,4,4 + VXOR IN1, TMP, IN1 // vxor 2,2,6 + VXOR IN0, KEY, IN0 // vxor 1,1,3 + VXOR IN1, KEY, IN1 // vxor 2,2,3 + STXVD2X IN0, (R0+OUTENC) + STXVD2X IN0, (R0+OUTDEC) + ADD $16, OUTENC, OUTENC + ADD $-16, OUTDEC, OUTDEC + BC 0x10, 0, loop192 // bdnz .Loop192 + + RET + +l256: + P8_LXVB16X(INP, R0, IN1) + MOVD $7, CNT // li 7,7 + STXVD2X IN0, (R0+OUTENC) + STXVD2X IN0, (R0+OUTDEC) + ADD $16, OUTENC, OUTENC + ADD $-16, OUTDEC, OUTDEC + MOVD CNT, CTR // mtctr 7 + +loop256: + VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5 + VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12 + STXVD2X IN1, (R0+OUTENC) + STXVD2X IN1, (R0+OUTDEC) + VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4 + ADD $16, OUTENC, OUTENC + ADD $-16, OUTDEC, OUTDEC + + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN0, TMP, IN0 // vxor 1,1,6 + VADDUWM RCON, RCON, RCON // vadduwm 4,4,4 + VXOR IN0, KEY, IN0 // vxor 1,1,3 + STXVD2X IN0, (R0+OUTENC) + STXVD2X IN0, (R0+OUTDEC) + ADD $16, OUTENC, OUTENC + ADD $-16, OUTDEC, OUTDEC + BC 0x12, 0, done // bdz .Ldone + + VSPLTW $3, IN0, KEY // vspltw 3,1,3 + VSLDOI $12, ZERO, IN1, TMP // vsldoi 6,0,2,12 + VSBOX KEY, KEY // vsbox 3,3 + + VXOR IN1, TMP, IN1 // vxor 2,2,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN1, TMP, IN1 // vxor 2,2,6 + VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12 + VXOR IN1, TMP, IN1 // vxor 2,2,6 + + VXOR IN1, KEY, IN1 // vxor 2,2,3 + JMP loop256 // b .Loop256 + +done: + RET + +// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte) +TEXT ·encryptBlockAsm(SB), NOSPLIT|NOFRAME, $0 + MOVD nr+0(FP), R6 // Round 
count/Key size + MOVD xk+8(FP), R5 // Key pointer + MOVD dst+16(FP), R3 // Dest pointer + MOVD src+24(FP), R4 // Src pointer +#ifdef NEEDS_ESPERM + MOVD $·rcon(SB), R7 + LVX (R7), ESPERM // Permute value for P8_ macros. +#endif + + // Set CR{1,2,3}EQ to hold the key size information. + CMPU R6, $10, CR1 + CMPU R6, $12, CR2 + CMPU R6, $14, CR3 + + MOVD $16, R6 + MOVD $32, R7 + MOVD $48, R8 + MOVD $64, R9 + MOVD $80, R10 + MOVD $96, R11 + MOVD $112, R12 + + // Load text in BE order + P8_LXVB16X(R4, R0, V0) + + // V1, V2 will hold keys, V0 is a temp. + // At completion, V2 will hold the ciphertext. + // Load xk[0:3] and xor with text + LXVD2X (R0+R5), V1 + VXOR V0, V1, V0 + + // Load xk[4:11] and cipher + LXVD2X (R6+R5), V1 + LXVD2X (R7+R5), V2 + VCIPHER V0, V1, V0 + VCIPHER V0, V2, V0 + + // Load xk[12:19] and cipher + LXVD2X (R8+R5), V1 + LXVD2X (R9+R5), V2 + VCIPHER V0, V1, V0 + VCIPHER V0, V2, V0 + + // Load xk[20:27] and cipher + LXVD2X (R10+R5), V1 + LXVD2X (R11+R5), V2 + VCIPHER V0, V1, V0 + VCIPHER V0, V2, V0 + + // Increment xk pointer to reuse constant offsets in R6-R12. + ADD $112, R5 + + // Load xk[28:35] and cipher + LXVD2X (R0+R5), V1 + LXVD2X (R6+R5), V2 + VCIPHER V0, V1, V0 + VCIPHER V0, V2, V0 + + // Load xk[36:43] and cipher + LXVD2X (R7+R5), V1 + LXVD2X (R8+R5), V2 + BEQ CR1, Ldec_tail // Key size 10? + VCIPHER V0, V1, V0 + VCIPHER V0, V2, V0 + + // Load xk[44:51] and cipher + LXVD2X (R9+R5), V1 + LXVD2X (R10+R5), V2 + BEQ CR2, Ldec_tail // Key size 12? + VCIPHER V0, V1, V0 + VCIPHER V0, V2, V0 + + // Load xk[52:59] and cipher + LXVD2X (R11+R5), V1 + LXVD2X (R12+R5), V2 + BNE CR3, Linvalid_key_len // Not key size 14? + // Fallthrough to final cipher + +Ldec_tail: + // Cipher last two keys such that key information is + // cleared from V1 and V2. + VCIPHER V0, V1, V1 + VCIPHERLAST V1, V2, V2 + + // Store the result in BE order. + P8_STXVB16X(V2, R3, R0) + RET + +Linvalid_key_len: + // Segfault, this should never happen. Only 3 keys sizes are created/used. + MOVD R0, 0(R0) + RET + +// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte) +TEXT ·decryptBlockAsm(SB), NOSPLIT|NOFRAME, $0 + MOVD nr+0(FP), R6 // Round count/Key size + MOVD xk+8(FP), R5 // Key pointer + MOVD dst+16(FP), R3 // Dest pointer + MOVD src+24(FP), R4 // Src pointer +#ifdef NEEDS_ESPERM + MOVD $·rcon(SB), R7 + LVX (R7), ESPERM // Permute value for P8_ macros. +#endif + + // Set CR{1,2,3}EQ to hold the key size information. + CMPU R6, $10, CR1 + CMPU R6, $12, CR2 + CMPU R6, $14, CR3 + + MOVD $16, R6 + MOVD $32, R7 + MOVD $48, R8 + MOVD $64, R9 + MOVD $80, R10 + MOVD $96, R11 + MOVD $112, R12 + + // Load text in BE order + P8_LXVB16X(R4, R0, V0) + + // V1, V2 will hold keys, V0 is a temp. + // At completion, V2 will hold the text. + // Load xk[0:3] and xor with ciphertext + LXVD2X (R0+R5), V1 + VXOR V0, V1, V0 + + // Load xk[4:11] and cipher + LXVD2X (R6+R5), V1 + LXVD2X (R7+R5), V2 + VNCIPHER V0, V1, V0 + VNCIPHER V0, V2, V0 + + // Load xk[12:19] and cipher + LXVD2X (R8+R5), V1 + LXVD2X (R9+R5), V2 + VNCIPHER V0, V1, V0 + VNCIPHER V0, V2, V0 + + // Load xk[20:27] and cipher + LXVD2X (R10+R5), V1 + LXVD2X (R11+R5), V2 + VNCIPHER V0, V1, V0 + VNCIPHER V0, V2, V0 + + // Increment xk pointer to reuse constant offsets in R6-R12. + ADD $112, R5 + + // Load xk[28:35] and cipher + LXVD2X (R0+R5), V1 + LXVD2X (R6+R5), V2 + VNCIPHER V0, V1, V0 + VNCIPHER V0, V2, V0 + + // Load xk[36:43] and cipher + LXVD2X (R7+R5), V1 + LXVD2X (R8+R5), V2 + BEQ CR1, Ldec_tail // Key size 10? 
+ VNCIPHER V0, V1, V0 + VNCIPHER V0, V2, V0 + + // Load xk[44:51] and cipher + LXVD2X (R9+R5), V1 + LXVD2X (R10+R5), V2 + BEQ CR2, Ldec_tail // Key size 12? + VNCIPHER V0, V1, V0 + VNCIPHER V0, V2, V0 + + // Load xk[52:59] and cipher + LXVD2X (R11+R5), V1 + LXVD2X (R12+R5), V2 + BNE CR3, Linvalid_key_len // Not key size 14? + // Fallthrough to final cipher + +Ldec_tail: + // Cipher last two keys such that key information is + // cleared from V1 and V2. + VNCIPHER V0, V1, V1 + VNCIPHERLAST V1, V2, V2 + + // Store the result in BE order. + P8_STXVB16X(V2, R3, R0) + RET + +Linvalid_key_len: + // Segfault, this should never happen. Only 3 keys sizes are created/used. + MOVD R0, 0(R0) + RET + +// Remove defines from above so they can be defined here +#undef INP +#undef OUTENC +#undef ROUNDS +#undef KEY +#undef TMP + +#define INP R3 +#define OUTP R4 +#define LEN R5 +#define KEYP R6 +#define ROUNDS R7 +#define IVP R8 +#define ENC R9 + +#define INOUT V2 +#define TMP V3 +#define IVEC V4 + +// Load the crypt key into VSRs. +// +// The expanded key is stored and loaded using +// STXVD2X/LXVD2X. The in-memory byte ordering +// depends on the endianness of the machine. The +// expanded keys are generated by expandKeyAsm above. +// +// Rkeyp holds the key pointer. It is clobbered. Once +// the expanded keys are loaded, it is not needed. +// +// R12,R14-R21 are scratch registers. +// For keyp of 10, V6, V11-V20 hold the expanded key. +// For keyp of 12, V6, V9-V20 hold the expanded key. +// For keyp of 14, V6, V7-V20 hold the expanded key. +#define LOAD_KEY(Rkeyp) \ + MOVD $16, R12 \ + MOVD $32, R14 \ + MOVD $48, R15 \ + MOVD $64, R16 \ + MOVD $80, R17 \ + MOVD $96, R18 \ + MOVD $112, R19 \ + MOVD $128, R20 \ + MOVD $144, R21 \ + LXVD2X (R0+Rkeyp), V6 \ + ADD $16, Rkeyp \ + BEQ CR1, L_start10 \ + BEQ CR2, L_start12 \ + LXVD2X (R0+Rkeyp), V7 \ + LXVD2X (R12+Rkeyp), V8 \ + ADD $32, Rkeyp \ + L_start12: \ + LXVD2X (R0+Rkeyp), V9 \ + LXVD2X (R12+Rkeyp), V10 \ + ADD $32, Rkeyp \ + L_start10: \ + LXVD2X (R0+Rkeyp), V11 \ + LXVD2X (R12+Rkeyp), V12 \ + LXVD2X (R14+Rkeyp), V13 \ + LXVD2X (R15+Rkeyp), V14 \ + LXVD2X (R16+Rkeyp), V15 \ + LXVD2X (R17+Rkeyp), V16 \ + LXVD2X (R18+Rkeyp), V17 \ + LXVD2X (R19+Rkeyp), V18 \ + LXVD2X (R20+Rkeyp), V19 \ + LXVD2X (R21+Rkeyp), V20 + +// Perform aes cipher operation for keysize 10/12/14 using the keys +// loaded by LOAD_KEY, and key size information held in CR1EQ/CR2EQ. 
+// +// Vxor is ideally V6 (Key[0-3]), but for slightly improved encrypting +// performance V6 and IVEC can be swapped (xor is both associative and +// commutative) during encryption: +// +// VXOR INOUT, IVEC, INOUT +// VXOR INOUT, V6, INOUT +// +// into +// +// VXOR INOUT, V6, INOUT +// VXOR INOUT, IVEC, INOUT +// +#define CIPHER_BLOCK(Vin, Vxor, Vout, vcipher, vciphel, label10, label12) \ + VXOR Vin, Vxor, Vout \ + BEQ CR1, label10 \ + BEQ CR2, label12 \ + vcipher Vout, V7, Vout \ + vcipher Vout, V8, Vout \ + label12: \ + vcipher Vout, V9, Vout \ + vcipher Vout, V10, Vout \ + label10: \ + vcipher Vout, V11, Vout \ + vcipher Vout, V12, Vout \ + vcipher Vout, V13, Vout \ + vcipher Vout, V14, Vout \ + vcipher Vout, V15, Vout \ + vcipher Vout, V16, Vout \ + vcipher Vout, V17, Vout \ + vcipher Vout, V18, Vout \ + vcipher Vout, V19, Vout \ + vciphel Vout, V20, Vout \ + +#define CLEAR_KEYS() \ + VXOR V6, V6, V6 \ + VXOR V7, V7, V7 \ + VXOR V8, V8, V8 \ + VXOR V9, V9, V9 \ + VXOR V10, V10, V10 \ + VXOR V11, V11, V11 \ + VXOR V12, V12, V12 \ + VXOR V13, V13, V13 \ + VXOR V14, V14, V14 \ + VXOR V15, V15, V15 \ + VXOR V16, V16, V16 \ + VXOR V17, V17, V17 \ + VXOR V18, V18, V18 \ + VXOR V19, V19, V19 \ + VXOR V20, V20, V20 + +//func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int) +TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0 + MOVD src+0(FP), INP + MOVD dst+8(FP), OUTP + MOVD length+16(FP), LEN + MOVD key+24(FP), KEYP + MOVD iv+32(FP), IVP + MOVD enc+40(FP), ENC + MOVD nr+48(FP), ROUNDS + +#ifdef NEEDS_ESPERM + MOVD $·rcon(SB), R11 + LVX (R11), ESPERM // Permute value for P8_ macros. +#endif + + // Assume len > 0 && len % blockSize == 0. + CMPW ENC, $0 + P8_LXVB16X(IVP, R0, IVEC) + CMPU ROUNDS, $10, CR1 + CMPU ROUNDS, $12, CR2 // Only sizes 10/12/14 are supported. + + // Setup key in VSRs, and set loop count in CTR. + LOAD_KEY(KEYP) + SRD $4, LEN + MOVD LEN, CTR + + BEQ Lcbc_dec + + PCALIGN $32 +Lcbc_enc: + P8_LXVB16X(INP, R0, INOUT) + ADD $16, INP + VXOR INOUT, V6, INOUT + CIPHER_BLOCK(INOUT, IVEC, INOUT, VCIPHER, VCIPHERLAST, Lcbc_enc10, Lcbc_enc12) + VOR INOUT, INOUT, IVEC // ciphertext (INOUT) is IVEC for next block. + P8_STXVB16X(INOUT, OUTP, R0) + ADD $16, OUTP + BDNZ Lcbc_enc + + P8_STXVB16X(INOUT, IVP, R0) + CLEAR_KEYS() + RET + + PCALIGN $32 +Lcbc_dec: + P8_LXVB16X(INP, R0, TMP) + ADD $16, INP + CIPHER_BLOCK(TMP, V6, INOUT, VNCIPHER, VNCIPHERLAST, Lcbc_dec10, Lcbc_dec12) + VXOR INOUT, IVEC, INOUT + VOR TMP, TMP, IVEC // TMP is IVEC for next block. + P8_STXVB16X(INOUT, OUTP, R0) + ADD $16, OUTP + BDNZ Lcbc_dec + + P8_STXVB16X(IVEC, IVP, R0) + CLEAR_KEYS() + RET |
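As the comment in expandKeyAsm notes, the expanded decrypt key is simply the expanded encrypt key stored in reverse order: OUTDEC starts at the last 16-byte slot and walks backwards while OUTENC walks forwards, and both receive the same round keys. A pure-Go restatement of that relationship, for illustration only (reverseSchedule is a hypothetical helper, not part of the package); each schedule holds 4*(nr+1) uint32 words, i.e. nr+1 round keys of 16 bytes:

// reverseSchedule mirrors what the OUTDEC pointer does in expandKeyAsm:
// decryption round key i is encryption round key nr-i.
func reverseSchedule(enc []uint32, nr int) []uint32 {
	dec := make([]uint32, len(enc))
	for i := 0; i <= nr; i++ {
		copy(dec[4*i:4*i+4], enc[4*(nr-i):4*(nr-i)+4])
	}
	return dec
}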
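cryptBlocksChain assumes len > 0 and len % blockSize == 0 (as its comment states), branches to Lcbc_dec when enc is zero, and stores the chaining value for the next call back through IVP before returning. Below is a minimal sketch of a CBC wrapper that honours that contract; the cbcMode type, its fields and the cbcEncrypt/cbcDecrypt constants are illustrative assumptions rather than the package's actual CBC implementation:

type cbcMode struct {
	enc []uint32 // encryption schedule from expandKeyAsm
	dec []uint32 // decryption schedule (same keys, reverse order)
	iv  [16]byte
	nr  int // 10, 12 or 14
}

const (
	cbcDecrypt = 0 // the assembly branches to Lcbc_dec when enc == 0
	cbcEncrypt = 1
)

func (c *cbcMode) cryptBlocks(dst, src []byte, encrypt bool) {
	// The assembly assumes a positive, block-aligned length.
	if len(src) == 0 || len(src)%16 != 0 || len(dst) < len(src) {
		panic("aes: CBC input is not a positive multiple of the block size")
	}
	key, enc := &c.dec[0], cbcDecrypt
	if encrypt {
		key, enc = &c.enc[0], cbcEncrypt
	}
	// iv is read on entry and overwritten with the next chaining value
	// before the routine returns, so back-to-back calls chain correctly.
	cryptBlocksChain(&src[0], &dst[0], len(src), key, &c.iv[0], enc, c.nr)
}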