Diffstat (limited to 'security/nss/lib/freebl/ppc-gcm.s')
-rw-r--r--  security/nss/lib/freebl/ppc-gcm.s  1051
1 file changed, 1051 insertions, 0 deletions
diff --git a/security/nss/lib/freebl/ppc-gcm.s b/security/nss/lib/freebl/ppc-gcm.s
new file mode 100644
index 0000000000..06ad5862c1
--- /dev/null
+++ b/security/nss/lib/freebl/ppc-gcm.s
@@ -0,0 +1,1051 @@
+# This submission to NSS is to be made available under the terms of the
+# Mozilla Public License, v. 2.0. You can obtain one at http://mozilla.org/MPL/2.0/
+# Copyright (c) 2021, Niels Möller and Mamone Tarsha
+
+# Registers:
+
+.set SP, 1
+.set TOCP, 2
+
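+# Helper macros.  VEC_LOAD_DATA fetches a 16-byte constant through the TOC.
+# VEC_LOAD/VEC_STORE wrap lxvd2x/stxvd2x with a vperm by SWAP_MASK so the
+# register image keeps a consistent byte order on little-endian machines;
+# VEC_LOAD_INC additionally advances the index register by 16.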
+.macro VEC_LOAD_DATA VR, DATA, GPR
+ addis \GPR, 2, \DATA@got@ha
+ ld \GPR, \DATA@got@l(\GPR)
+ lvx \VR, 0, \GPR
+.endm
+
+.macro VEC_LOAD VR, GPR, IDX
+ lxvd2x \VR+32, \IDX, \GPR
+ vperm \VR, \VR, \VR, SWAP_MASK
+.endm
+
+.macro VEC_LOAD_INC VR, GPR, IDX
+ lxvd2x \VR+32, \IDX, \GPR
+ addi \IDX, \IDX, 16
+ vperm \VR, \VR, \VR, SWAP_MASK
+.endm
+
+.macro VEC_STORE VR, GPR, IDX
+ vperm \VR, \VR, \VR, SWAP_MASK
+ stxvd2x \VR+32, \IDX, \GPR
+.endm
+
+# Load LEN bytes (0 < LEN < 16) into VAL1:VAL0, padding the remaining bytes with zeros
+.macro LOAD_LEN DATA, LEN, VAL1, VAL0, TMP0, TMP1, TMP2
+ li \TMP0, 0
+ li \VAL1, 0
+ li \VAL0, 0
+ andi. \TMP1, \LEN, 8
+ beq 1f
+ ldbrx \VAL1, 0, \DATA
+ li \TMP0, 8
+1:
+ andi. \TMP1, \LEN, 7
+ beq 3f
+ li \TMP1, 56
+2:
+ lbzx \TMP2, \TMP0, \DATA
+ sld \TMP2, \TMP2, \TMP1
+ subi \TMP1, \TMP1, 8
+ or \VAL0, \VAL0, \TMP2
+ addi \TMP0, \TMP0, 1
+ cmpld \TMP0, \LEN
+ bne 2b
+ andi. \TMP1, \LEN, 8
+ bne 3f
+ mr \VAL1, \VAL0
+ li \VAL0, 0
+3:
+.endm
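+
+# A rough C sketch of LOAD_LEN (hypothetical helper name load_be64; both
+# doublewords end up holding the input bytes in big-endian order, zero padded):
+#
+#   uint64_t val1 = 0, val0 = 0;
+#   size_t i = 0;
+#   if (len & 8) { val1 = load_be64(data); i = 8; }   /* ldbrx */
+#   for (int shift = 56; i < len; i++, shift -= 8)
+#       val0 |= (uint64_t)data[i] << shift;           /* lbzx/sld/or */
+#   if (!(len & 8)) { val1 = val0; val0 = 0; }        /* input shorter than 8 */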
+
+# Store LEN bytes (0 < LEN < 16) from VAL1:VAL0
+.macro STORE_LEN DATA, LEN, VAL1, VAL0, TMP0, TMP1, TMP2
+ andi. \TMP1, \LEN, 8
+ beq 1f
+ stdbrx \VAL1, 0, \DATA
+ li \TMP0, 8
+ b 2f
+1:
+ li \TMP0, 0
+ mr \VAL0, \VAL1
+2:
+ andi. \TMP1, \LEN, 7
+ beq 4f
+ li \TMP1, 56
+3:
+ srd \TMP2, \VAL0, \TMP1
+ subi \TMP1, \TMP1, 8
+ stbx \TMP2, \TMP0, \DATA
+ addi \TMP0, \TMP0, 1
+ cmpld \TMP0, \LEN
+ bne 3b
+4:
+.endm
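+
+# STORE_LEN is the mirror image of LOAD_LEN: stdbrx writes the first eight
+# bytes when LEN >= 8 (for LEN < 8 the data is first moved from VAL1 into
+# VAL0), then the byte loop peels the remaining bytes off the top of VAL0
+# (srd by 56, 48, ...) one stb at a time.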
+
+.text
+
+################################################################################
+# Generates the H table
+# void ppc_aes_gcmINIT(uint8_t Htbl[16*8], uint32_t *KS, int NR);
+.globl ppc_aes_gcmINIT
+.type ppc_aes_gcmINIT,@function
+.align 5
+ppc_aes_gcmINIT:
+addis TOCP,12,(.TOC.-ppc_aes_gcmINIT)@ha
+addi TOCP,TOCP,(.TOC.-ppc_aes_gcmINIT)@l
+.localentry ppc_aes_gcmINIT, .-ppc_aes_gcmINIT
+
+.set Htbl, 3
+.set KS, 4
+.set NR, 5
+
+.set ZERO, 19
+.set MSB, 18
+.set ONE, 17
+.set SWAP_MASK, 0
+.set POLY, 1
+.set K, 2
+.set H, 3
+.set H2, 4
+.set H3, 5
+.set H4, 6
+.set HP, 7
+.set HS, 8
+.set R, 9
+.set F, 10
+.set T, 11
+.set H1M, 12
+.set H1L, 13
+.set H2M, 14
+.set H2L, 15
+.set H3M, 16
+.set H3L, 17
+.set H4M, 18
+.set H4L, 19
+
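+# The hash key is H = AES-encrypt(all-zero block) under the key schedule KS.
+# The 128-byte table is filled with the powers H^1..H^4, each pre-split into
+# the two 16-byte halves (HxL, HxM) that the GHASH loops feed straight into
+# vpmsumd:
+#   Htbl = { H1L, H1M, H2L, H2M, H3L, H3M, H4L, H4M }
+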
+ VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 6
+ VEC_LOAD_DATA POLY, .Lpoly, 6
+
+ li 6, 0
+ VEC_LOAD_INC H, KS, 6
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
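+ # AES-128 stops after 10 rounds; AES-192 and AES-256 fall through for 12 and 14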
+ cmpwi NR, 10
+ beq .LH_done
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
+ cmpwi NR, 12
+ beq .LH_done
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
+ VEC_LOAD_INC K, KS, 6
+ vcipher H, H, K
+
+.LH_done:
+ VEC_LOAD K, KS, 6
+ vcipherlast H, H, K
+
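+ # Convert H to the left-shifted representation: H' = H*x in GF(2^128),
+ # folding the polynomial back in when the top bit falls off.  This is the
+ # usual POWER8 trick to make the vpmsumd products line up despite GCM's
+ # reflected bit order.
+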
+ vupkhsb MSB, H
+ vspltisb ONE, 1
+ vspltb MSB, MSB, 0
+ vsl H, H, ONE
+ vand MSB, MSB, POLY
+ vxor ZERO, ZERO, ZERO
+ vxor H, H, MSB
+ vsldoi POLY, ZERO, POLY, 8
+
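+ # Pre-split H' into the (H1M, H1L) halves consumed by vpmsumd below, so
+ # each GHASH multiply costs two vpmsumd plus one reduction.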
+ vpmsumd HP, H, POLY
+ vsldoi HS, H, H, 8
+ vxor HP, HP, HS
+ vsldoi H1L, HP, HS, 8
+ vsldoi H1M, HS, HP, 8
+ vsldoi H1L, H1L, H1L, 8
+
+ # calculate H^2
+
+ vpmsumd F, H, H1L
+ vpmsumd R, H, H1M
+
+ vpmsumd T, F, POLY
+ vsldoi H2, F, F, 8
+ vxor R, R, T
+ vxor H2, H2, R
+
+ vpmsumd HP, H2, POLY
+ vsldoi HS, H2, H2, 8
+ vxor HP, HP, HS
+ vsldoi H2L, HP, HS, 8
+ vsldoi H2M, HS, HP, 8
+ vsldoi H2L, H2L, H2L, 8
+
+ # calculate H^3
+
+ vpmsumd F, H2, H1L
+ vpmsumd R, H2, H1M
+
+ vpmsumd T, F, POLY
+ vsldoi H3, F, F, 8
+ vxor R, R, T
+ vxor H3, H3, R
+
+ vpmsumd HP, H3, POLY
+ vsldoi HS, H3, H3, 8
+ vxor HP, HP, HS
+ vsldoi H3L, HP, HS, 8
+ vsldoi H3M, HS, HP, 8
+ vsldoi H3L, H3L, H3L, 8
+
+ # calculate H^4
+
+ vpmsumd F, H2, H2L
+ vpmsumd R, H2, H2M
+
+ vpmsumd T, F, POLY
+ vsldoi H4, F, F, 8
+ vxor R, R, T
+ vxor H4, H4, R
+
+ vpmsumd HP, H4, POLY
+ vsldoi HS, H4, H4, 8
+ vxor HP, HP, HS
+ vsldoi H4L, HP, HS, 8
+ vsldoi H4M, HS, HP, 8
+ vsldoi H4L, H4L, H4L, 8
+
+ li 8, 16*1
+ li 9, 16*2
+ li 10, 16*3
+ stxvd2x H1L+32, 0, Htbl
+ stxvd2x H1M+32, 8, Htbl
+ stxvd2x H2L+32, 9, Htbl
+ stxvd2x H2M+32, 10, Htbl
+ li 7, 16*4
+ li 8, 16*5
+ li 9, 16*6
+ li 10, 16*7
+ stxvd2x H3L+32, 7, Htbl
+ stxvd2x H3M+32, 8, Htbl
+ stxvd2x H4L+32, 9, Htbl
+ stxvd2x H4M+32, 10, Htbl
+
+ blr
+.size ppc_aes_gcmINIT, . - ppc_aes_gcmINIT
+
+################################################################################
+# Authenticate only
+# void ppc_aes_gcmHASH(uint8_t Htbl[16*8], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
+.globl ppc_aes_gcmHASH
+.type ppc_aes_gcmHASH,@function
+.align 5
+ppc_aes_gcmHASH:
+addis TOCP,12,(.TOC.-ppc_aes_gcmHASH)@ha
+addi TOCP,TOCP,(.TOC.-ppc_aes_gcmHASH)@l
+.localentry ppc_aes_gcmHASH, .-ppc_aes_gcmHASH
+
+.set Htbl, 3
+.set AAD, 4
+.set Alen, 5
+.set Tp, 6
+
+.set SWAP_MASK, 0
+.set POLY, 1
+.set D, 2
+.set C0, 3
+.set C1, 4
+.set C2, 5
+.set C3, 6
+.set T, 7
+.set R, 8
+.set F, 9
+.set R2, 10
+.set F2, 11
+.set R3, 12
+.set F3, 13
+.set R4, 14
+.set F4, 15
+.set H1M, 16
+.set H1L, 17
+.set H2M, 18
+.set H2L, 19
+.set H3M, 28
+.set H3L, 29
+.set H4M, 30
+.set H4L, 31
+
+ # store non-volatile vector registers
+ addi 7, SP, -16
+ stvx 31, 0, 7
+ addi 7, SP, -32
+ stvx 30, 0, 7
+ addi 7, SP, -48
+ stvx 29, 0, 7
+ addi 7, SP, -64
+ stvx 28, 0, 7
+
+ VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 7
+ VEC_LOAD_DATA POLY, .Lpoly_r, 7
+
+ VEC_LOAD D, Tp, 0
+
+ # --- process 4 blocks ---
+
+ srdi. 7, Alen, 6 # four-block loop count
+ beq .L2x
+
+ mtctr 7 # set counter register
+
+ # load table elements
+ li 8, 1*16
+ li 9, 2*16
+ li 10, 3*16
+ lxvd2x H1L+32, 0, Htbl
+ lxvd2x H1M+32, 8, Htbl
+ lxvd2x H2L+32, 9, Htbl
+ lxvd2x H2M+32, 10, Htbl
+ li 7, 4*16
+ li 8, 5*16
+ li 9, 6*16
+ li 10, 7*16
+ lxvd2x H3L+32, 7, Htbl
+ lxvd2x H3M+32, 8, Htbl
+ lxvd2x H4L+32, 9, Htbl
+ lxvd2x H4M+32, 10, Htbl
+
+ li 8, 0x10
+ li 9, 0x20
+ li 10, 0x30
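+
+ # Per iteration: D = (D ^ C0)*H^4 ^ C1*H^3 ^ C2*H^2 ^ C3*H, with all four
+ # partial products accumulated before a single reduction.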
+.align 5
+.L4x_loop:
+ # load input
+ lxvd2x C0+32, 0, AAD
+ lxvd2x C1+32, 8, AAD
+ lxvd2x C2+32, 9, AAD
+ lxvd2x C3+32, 10, AAD
+
+ vperm C0, C0, C0, SWAP_MASK
+ vperm C1, C1, C1, SWAP_MASK
+ vperm C2, C2, C2, SWAP_MASK
+ vperm C3, C3, C3, SWAP_MASK
+
+ # previous digest combining
+ vxor C0, C0, D
+
+ # polynomial multiplication
+ vpmsumd F2, H3L, C1
+ vpmsumd R2, H3M, C1
+ vpmsumd F3, H2L, C2
+ vpmsumd R3, H2M, C2
+ vpmsumd F4, H1L, C3
+ vpmsumd R4, H1M, C3
+ vpmsumd F, H4L, C0
+ vpmsumd R, H4M, C0
+
+ # deferred recombination of partial products
+ vxor F3, F3, F4
+ vxor R3, R3, R4
+ vxor F, F, F2
+ vxor R, R, R2
+ vxor F, F, F3
+ vxor R, R, R3
+
+ # reduction
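+ # F and R together hold the 256-bit product; one vpmsumd by the reflected
+ # polynomial .Lpoly_r plus a doubleword rotate folds it back into the
+ # 128-bit digest D.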
+ vpmsumd T, F, POLY
+ vsldoi D, F, F, 8
+ vxor R, R, T
+ vxor D, R, D
+
+ addi AAD, AAD, 0x40
+ bdnz .L4x_loop
+
+ clrldi Alen, Alen, 58
+.L2x:
+ # --- process 2 blocks ---
+
+ srdi. 7, Alen, 5
+ beq .L1x
+
+ # load table elements
+ li 8, 1*16
+ li 9, 2*16
+ li 10, 3*16
+ lxvd2x H1L+32, 0, Htbl
+ lxvd2x H1M+32, 8, Htbl
+ lxvd2x H2L+32, 9, Htbl
+ lxvd2x H2M+32, 10, Htbl
+
+ # load input
+ li 10, 0x10
+ lxvd2x C0+32, 0, AAD
+ lxvd2x C1+32, 10, AAD
+
+ vperm C0, C0, C0, SWAP_MASK
+ vperm C1, C1, C1, SWAP_MASK
+
+ # previous digest combining
+ vxor C0, C0, D
+
+ # polynomial multiplication
+ vpmsumd F2, H1L, C1
+ vpmsumd R2, H1M, C1
+ vpmsumd F, H2L, C0
+ vpmsumd R, H2M, C0
+
+ # deferred recombination of partial products
+ vxor F, F, F2
+ vxor R, R, R2
+
+ # reduction
+ vpmsumd T, F, POLY
+ vsldoi D, F, F, 8
+ vxor R, R, T
+ vxor D, R, D
+
+ addi AAD, AAD, 0x20
+ clrldi Alen, Alen, 59
+.L1x:
+ # --- process 1 block ---
+
+ srdi. 7, Alen, 4
+ beq .Ltail
+
+ # load table elements
+ li 8, 1*16
+ lxvd2x H1L+32, 0, Htbl
+ lxvd2x H1M+32, 8, Htbl
+
+ # load input
+ lxvd2x C0+32, 0, AAD
+
+ vperm C0, C0, C0, SWAP_MASK
+
+ # previous digest combining
+ vxor C0, C0, D
+
+ # polynomial multiplication
+ vpmsumd F, H1L, C0
+ vpmsumd R, H1M, C0
+
+ # reduction
+ vpmsumd T, F, POLY
+ vsldoi D, F, F, 8
+ vxor R, R, T
+ vxor D, R, D
+
+ addi AAD, AAD, 0x10
+ clrldi Alen, Alen, 60
+
+.Ltail:
+ cmpldi Alen, 0
+ beq .Lh_done
+ # --- process the final partial block ---
+
+ # load table elements
+ li 8, 1*16
+ lxvd2x H1L+32, 0, Htbl
+ lxvd2x H1M+32, 8, Htbl
+
+ LOAD_LEN AAD, Alen, 10, 9, 3, 7, 8
+ mtvrd C0, 10
+ mtvrd C1, 9
+ xxmrghd C0+32, C0+32, C1+32
+
+ # previous digest combining
+ vxor C0, C0, D
+
+ # polynomial multiplication
+ vpmsumd F, H1L, C0
+ vpmsumd R, H1M, C0
+
+ # reduction
+ vpmsumd T, F, POLY
+ vsldoi D, F, F, 8
+ vxor R, R, T
+ vxor D, R, D
+.Lh_done:
+ VEC_STORE D, Tp, 0
+
+ # restore non-volatile vector registers
+ addi 7, SP, -16
+ lvx 31, 0, 7
+ addi 7, SP, -32
+ lvx 30, 0, 7
+ addi 7, SP, -48
+ lvx 29, 0, 7
+ addi 7, SP, -64
+ lvx 28, 0, 7
+ blr
+.size ppc_aes_gcmHASH, . - ppc_aes_gcmHASH
+
+################################################################################
+# Generates the final GCM tag
+# void ppc_aes_gcmTAG(uint8_t Htbl[16*8], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t *X0, uint8_t *TAG);
+.globl ppc_aes_gcmTAG
+.type ppc_aes_gcmTAG,@function
+.align 5
+ppc_aes_gcmTAG:
+addis TOCP,12,(.TOC.-ppc_aes_gcmTAG)@ha
+addi TOCP,TOCP,(.TOC.-ppc_aes_gcmTAG)@l
+.localentry ppc_aes_gcmTAG, .-ppc_aes_gcmTAG
+
+.set Htbl, 3
+.set Tp, 4
+.set Mlen, 5
+.set Alen, 6
+.set X0, 7
+.set TAG, 8
+
+.set SWAP_MASK, 0
+.set POLY, 1
+.set D, 2
+.set C0, 3
+.set C1, 4
+.set T, 5
+.set R, 6
+.set F, 7
+.set H1M, 8
+.set H1L, 9
+.set X, 10
+
+ VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 9
+ VEC_LOAD_DATA POLY, .Lpoly_r, 9
+
+ VEC_LOAD D, Tp, 0
+
+ # load table elements
+ li 9, 1*16
+ lxvd2x H1L+32, 0, Htbl
+ lxvd2x H1M+32, 9, Htbl
+
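+ # Final GHASH input is the GCM length block: the AAD and message byte
+ # counts are converted to bit counts (sldi by 3) and packed as two 64-bit
+ # values.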
+ sldi Alen, Alen, 3
+ sldi Mlen, Mlen, 3
+ mtvrd C0, Alen
+ mtvrd C1, Mlen
+ xxmrghd C0+32, C0+32, C1+32
+
+ # previous digest combining
+ vxor C0, C0, D
+
+ # polynomial multiplication
+ vpmsumd F, H1L, C0
+ vpmsumd R, H1M, C0
+
+ # reduction
+ vpmsumd T, F, POLY
+ vsldoi D, F, F, 8
+ vxor R, R, T
+ vxor D, R, D
+
+ lxvd2x X+32, 0, X0
+ vperm D, D, D, SWAP_MASK
+ vxor X, X, D
+ stxvd2x X+32, 0, TAG
+
+ blr
+.size ppc_aes_gcmTAG, . - ppc_aes_gcmTAG
+
+################################################################################
+# Crypt only
+# void ppc_aes_gcmCRYPT(const uint8_t* PT, uint8_t* CT, uint64_t LEN, uint8_t *CTRP, uint32_t *KS, int NR);
+.globl ppc_aes_gcmCRYPT
+.type ppc_aes_gcmCRYPT,@function
+.align 5
+ppc_aes_gcmCRYPT:
+addis TOCP,12,(.TOC.-ppc_aes_gcmCRYPT)@ha
+addi TOCP,TOCP,(.TOC.-ppc_aes_gcmCRYPT)@l
+.localentry ppc_aes_gcmCRYPT, .-ppc_aes_gcmCRYPT
+
+.set PT, 3
+.set CT, 4
+.set LEN, 5
+.set CTRP, 6
+.set KS, 7
+.set NR, 8
+
+.set SWAP_MASK, 0
+.set K, 1
+.set CTR, 2
+.set CTR0, 3
+.set CTR1, 4
+.set CTR2, 5
+.set CTR3, 6
+.set CTR4, 7
+.set CTR5, 8
+.set CTR6, 9
+.set CTR7, 10
+.set ZERO, 11
+.set I1, 12
+.set I2, 13
+.set I3, 14
+.set I4, 15
+.set I5, 16
+.set I6, 17
+.set I7, 18
+.set I8, 19
+.set IN0, 24
+.set IN1, 25
+.set IN2, 26
+.set IN3, 27
+.set IN4, 28
+.set IN5, 29
+.set IN6, 30
+.set IN7, 31
+
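+# Each ROUND_n macro loads the next round key (advancing index register 10)
+# and applies one vcipher round to the n in-flight counter blocks.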
+.macro ROUND_8
+ VEC_LOAD_INC K, KS, 10
+ vcipher CTR0, CTR0, K
+ vcipher CTR1, CTR1, K
+ vcipher CTR2, CTR2, K
+ vcipher CTR3, CTR3, K
+ vcipher CTR4, CTR4, K
+ vcipher CTR5, CTR5, K
+ vcipher CTR6, CTR6, K
+ vcipher CTR7, CTR7, K
+.endm
+
+.macro ROUND_4
+ VEC_LOAD_INC K, KS, 10
+ vcipher CTR0, CTR0, K
+ vcipher CTR1, CTR1, K
+ vcipher CTR2, CTR2, K
+ vcipher CTR3, CTR3, K
+.endm
+
+.macro ROUND_2
+ VEC_LOAD_INC K, KS, 10
+ vcipher CTR0, CTR0, K
+ vcipher CTR1, CTR1, K
+.endm
+
+.macro ROUND_1
+ VEC_LOAD_INC K, KS, 10
+ vcipher CTR0, CTR0, K
+.endm
+
+ # store non-volatile general registers
+ std 31, -8(SP)
+ std 30, -16(SP)
+ std 29, -24(SP)
+ std 28, -32(SP)
+ std 27, -40(SP)
+ std 26, -48(SP)
+ std 25, -56(SP)
+
+ # store non-volatile vector registers
+ addi 9, SP, -80
+ stvx 31, 0, 9
+ addi 9, SP, -96
+ stvx 30, 0, 9
+ addi 9, SP, -112
+ stvx 29, 0, 9
+ addi 9, SP, -128
+ stvx 28, 0, 9
+ addi 9, SP, -144
+ stvx 27, 0, 9
+ addi 9, SP, -160
+ stvx 26, 0, 9
+ addi 9, SP, -176
+ stvx 25, 0, 9
+ addi 9, SP, -192
+ stvx 24, 0, 9
+
+ VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 9
+
+ vxor ZERO, ZERO, ZERO
+ vspltisb I1, 1
+ vspltisb I2, 2
+ vspltisb I3, 3
+ vspltisb I4, 4
+ vspltisb I5, 5
+ vspltisb I6, 6
+ vspltisb I7, 7
+ vspltisb I8, 8
+ vsldoi I1, ZERO, I1, 1
+ vsldoi I2, ZERO, I2, 1
+ vsldoi I3, ZERO, I3, 1
+ vsldoi I4, ZERO, I4, 1
+ vsldoi I5, ZERO, I5, 1
+ vsldoi I6, ZERO, I6, 1
+ vsldoi I7, ZERO, I7, 1
+ vsldoi I8, ZERO, I8, 1
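+
+ # I1..I8 now hold the integers 1..8; vadduwm CTR, CTR, In bumps the low
+ # 32-bit counter word by n (GCM's inc32, wrapping mod 2^32), which lets
+ # the loops run several blocks ahead.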
+
+ VEC_LOAD CTR, CTRP, 0
+
+ srdi. 9, LEN, 7
+ beq .Lctr_4x
+
+ mtctr 9
+
+ li 25, 0x10
+ li 26, 0x20
+ li 27, 0x30
+ li 28, 0x40
+ li 29, 0x50
+ li 30, 0x60
+ li 31, 0x70
+
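+ # Main loop: eight counter blocks are encrypted per pass (128 bytes of
+ # keystream), keeping the vcipher pipeline full.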
+.align 5
+.L8x_loop:
+ li 10, 0
+ VEC_LOAD_INC K, KS, 10
+
+ vadduwm CTR1, CTR, I1
+ vadduwm CTR2, CTR, I2
+ vadduwm CTR3, CTR, I3
+ vadduwm CTR4, CTR, I4
+ vadduwm CTR5, CTR, I5
+ vadduwm CTR6, CTR, I6
+ vadduwm CTR7, CTR, I7
+
+ vxor CTR0, CTR, K
+ vxor CTR1, CTR1, K
+ vxor CTR2, CTR2, K
+ vxor CTR3, CTR3, K
+ vxor CTR4, CTR4, K
+ vxor CTR5, CTR5, K
+ vxor CTR6, CTR6, K
+ vxor CTR7, CTR7, K
+
+ ROUND_8
+ ROUND_8
+ ROUND_8
+ ROUND_8
+ ROUND_8
+ ROUND_8
+ ROUND_8
+ ROUND_8
+ ROUND_8
+ cmpwi NR, 10
+ beq .Llast_8
+ ROUND_8
+ ROUND_8
+ cmpwi NR, 12
+ beq .Llast_8
+ ROUND_8
+ ROUND_8
+
+.Llast_8:
+ VEC_LOAD K, KS, 10
+ vcipherlast CTR0, CTR0, K
+ vcipherlast CTR1, CTR1, K
+ vcipherlast CTR2, CTR2, K
+ vcipherlast CTR3, CTR3, K
+ vcipherlast CTR4, CTR4, K
+ vcipherlast CTR5, CTR5, K
+ vcipherlast CTR6, CTR6, K
+ vcipherlast CTR7, CTR7, K
+
+ lxvd2x IN0+32, 0, PT
+ lxvd2x IN1+32, 25, PT
+ lxvd2x IN2+32, 26, PT
+ lxvd2x IN3+32, 27, PT
+ lxvd2x IN4+32, 28, PT
+ lxvd2x IN5+32, 29, PT
+ lxvd2x IN6+32, 30, PT
+ lxvd2x IN7+32, 31, PT
+
+ vperm CTR0, CTR0, CTR0, SWAP_MASK
+ vperm CTR1, CTR1, CTR1, SWAP_MASK
+ vperm CTR2, CTR2, CTR2, SWAP_MASK
+ vperm CTR3, CTR3, CTR3, SWAP_MASK
+ vperm CTR4, CTR4, CTR4, SWAP_MASK
+ vperm CTR5, CTR5, CTR5, SWAP_MASK
+ vperm CTR6, CTR6, CTR6, SWAP_MASK
+ vperm CTR7, CTR7, CTR7, SWAP_MASK
+
+ vxor IN0, IN0, CTR0
+ vxor IN1, IN1, CTR1
+ vxor IN2, IN2, CTR2
+ vxor IN3, IN3, CTR3
+ vxor IN4, IN4, CTR4
+ vxor IN5, IN5, CTR5
+ vxor IN6, IN6, CTR6
+ vxor IN7, IN7, CTR7
+
+ stxvd2x IN0+32, 0, CT
+ stxvd2x IN1+32, 25, CT
+ stxvd2x IN2+32, 26, CT
+ stxvd2x IN3+32, 27, CT
+ stxvd2x IN4+32, 28, CT
+ stxvd2x IN5+32, 29, CT
+ stxvd2x IN6+32, 30, CT
+ stxvd2x IN7+32, 31, CT
+
+ vadduwm CTR, CTR, I8
+ addi PT, PT, 0x80
+ addi CT, CT, 0x80
+ bdnz .L8x_loop
+
+ clrldi LEN, LEN, 57
+
+.Lctr_4x:
+ srdi. 9, LEN, 6
+ beq .Lctr_2x
+
+ li 10, 0
+ li 29, 0x10
+ li 30, 0x20
+ li 31, 0x30
+
+ VEC_LOAD_INC K, KS, 10
+
+ vadduwm CTR1, CTR, I1
+ vadduwm CTR2, CTR, I2
+ vadduwm CTR3, CTR, I3
+
+ vxor CTR0, CTR, K
+ vxor CTR1, CTR1, K
+ vxor CTR2, CTR2, K
+ vxor CTR3, CTR3, K
+
+ ROUND_4
+ ROUND_4
+ ROUND_4
+ ROUND_4
+ ROUND_4
+ ROUND_4
+ ROUND_4
+ ROUND_4
+ ROUND_4
+ cmpwi NR, 10
+ beq .Llast_4
+ ROUND_4
+ ROUND_4
+ cmpwi NR, 12
+ beq .Llast_4
+ ROUND_4
+ ROUND_4
+
+.Llast_4:
+ VEC_LOAD K, KS, 10
+ vcipherlast CTR0, CTR0, K
+ vcipherlast CTR1, CTR1, K
+ vcipherlast CTR2, CTR2, K
+ vcipherlast CTR3, CTR3, K
+
+ lxvd2x IN0+32, 0, PT
+ lxvd2x IN1+32, 29, PT
+ lxvd2x IN2+32, 30, PT
+ lxvd2x IN3+32, 31, PT
+
+ vperm CTR0, CTR0, CTR0, SWAP_MASK
+ vperm CTR1, CTR1, CTR1, SWAP_MASK
+ vperm CTR2, CTR2, CTR2, SWAP_MASK
+ vperm CTR3, CTR3, CTR3, SWAP_MASK
+
+ vxor IN0, IN0, CTR0
+ vxor IN1, IN1, CTR1
+ vxor IN2, IN2, CTR2
+ vxor IN3, IN3, CTR3
+
+ stxvd2x IN0+32, 0, CT
+ stxvd2x IN1+32, 29, CT
+ stxvd2x IN2+32, 30, CT
+ stxvd2x IN3+32, 31, CT
+
+ vadduwm CTR, CTR, I4
+ addi PT, PT, 0x40
+ addi CT, CT, 0x40
+
+ clrldi LEN, LEN, 58
+
+.Lctr_2x:
+ srdi. 9, LEN, 5
+ beq .Lctr_1x
+
+ li 10, 0
+ li 31, 0x10
+
+ VEC_LOAD_INC K, KS, 10
+
+ vadduwm CTR1, CTR, I1
+
+ vxor CTR0, CTR, K
+ vxor CTR1, CTR1, K
+
+ ROUND_2
+ ROUND_2
+ ROUND_2
+ ROUND_2
+ ROUND_2
+ ROUND_2
+ ROUND_2
+ ROUND_2
+ ROUND_2
+ cmpwi NR, 10
+ beq .Llast_2
+ ROUND_2
+ ROUND_2
+ cmpwi NR, 12
+ beq .Llast_2
+ ROUND_2
+ ROUND_2
+
+.Llast_2:
+ VEC_LOAD K, KS, 10
+ vcipherlast CTR0, CTR0, K
+ vcipherlast CTR1, CTR1, K
+
+ lxvd2x IN0+32, 0, PT
+ lxvd2x IN1+32, 31, PT
+
+ vperm CTR0, CTR0, CTR0, SWAP_MASK
+ vperm CTR1, CTR1, CTR1, SWAP_MASK
+
+ vxor IN0, IN0, CTR0
+ vxor IN1, IN1, CTR1
+
+ stxvd2x IN0+32, 0, CT
+ stxvd2x IN1+32, 31, CT
+
+ vadduwm CTR, CTR, I2
+ addi PT, PT, 0x20
+ addi CT, CT, 0x20
+
+ clrldi LEN, LEN, 59
+
+.Lctr_1x:
+ srdi. 9, LEN, 4
+ beq .Lctr_tail
+
+ li 10, 0
+
+ VEC_LOAD_INC K, KS, 10
+ vxor CTR0, CTR, K
+
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ cmpwi NR, 10
+ beq .Llast_1
+ ROUND_1
+ ROUND_1
+ cmpwi NR, 12
+ beq .Llast_1
+ ROUND_1
+ ROUND_1
+
+.Llast_1:
+ VEC_LOAD K, KS, 10
+ vcipherlast CTR0, CTR0, K
+
+ lxvd2x IN0+32, 0, PT
+
+ vperm CTR0, CTR0, CTR0, SWAP_MASK
+
+ vxor IN0, IN0, CTR0
+
+ stxvd2x IN0+32, 0, CT
+
+ vadduwm CTR, CTR, I1
+ addi PT, PT, 0x10
+ addi CT, CT, 0x10
+
+ clrldi LEN, LEN, 60
+
+.Lctr_tail:
+ cmpldi LEN, 0
+ beq .Lc_done
+
+ li 10, 0
+
+ VEC_LOAD_INC K, KS, 10
+ vxor CTR0, CTR, K
+
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ ROUND_1
+ cmpwi NR, 10
+ beq .Llast_tail
+ ROUND_1
+ ROUND_1
+ cmpwi NR, 12
+ beq .Llast_tail
+ ROUND_1
+ ROUND_1
+
+.Llast_tail:
+ VEC_LOAD K, KS, 10
+ vcipherlast CTR0, CTR0, K
+
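+ # Partial final block: move the keystream into GPRs, xor it with the
+ # zero-padded input, and store back exactly LEN bytes.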
+ LOAD_LEN PT, LEN, 10, 9, 29, 30, 31
+
+ vsldoi CTR1, CTR0, CTR0, 8
+ mfvrd 31, CTR0
+ mfvrd 30, CTR1
+
+ xor 10, 10, 31
+ xor 9, 9, 30
+
+ STORE_LEN CT, LEN, 10, 9, 29, 30, 31
+
+ vadduwm CTR, CTR, I1
+
+.Lc_done:
+ VEC_STORE CTR, CTRP, 0
+
+ # restore non-volatile vector registers
+ addi 9, SP, -80
+ lvx 31, 0, 9
+ addi 9, SP, -96
+ lvx 30, 0, 9
+ addi 9, SP, -112
+ lvx 29, 0, 9
+ addi 9, SP, -128
+ lvx 28, 0, 9
+ addi 9, SP, -144
+ lvx 27, 0, 9
+ addi 9, SP, -160
+ lvx 26, 0, 9
+ addi 9, SP, -176
+ lvx 25, 0, 9
+ addi 9, SP, -192
+ lvx 24, 0, 9
+
+ # restore non-volatile general registers
+ ld 31, -8(SP)
+ ld 30, -16(SP)
+ ld 29, -24(SP)
+ ld 28, -32(SP)
+ ld 27, -40(SP)
+ ld 26, -48(SP)
+ ld 25, -56(SP)
+ blr
+.size ppc_aes_gcmCRYPT, . - ppc_aes_gcmCRYPT
+
+.data
+.align 4
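+# .Lpoly is the GCM polynomial x^128 + x^7 + x^2 + x + 1 in the left-shifted
+# representation used above; .Lpoly_r is the reflected constant used by the
+# reduction step; .Ldb_bswap_mask is the vperm mask behind SWAP_MASK.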
+.Lpoly:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lpoly_r:
+ .byte 0,0,0,0,0,0,0,0xc2,0,0,0,0,0,0,0,0
+.Ldb_bswap_mask:
+ .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7