Diffstat
-rw-r--r--  security/nss/lib/freebl/chacha20-ppc64le.S  668
1 file changed, 668 insertions, 0 deletions
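
For orientation, here is the scalar ChaCha20 double round (RFC 8439) that the vectorized doubleround macro in this patch evaluates for four 64-byte blocks at once, one block per vector lane. This is only a reference sketch; rotl32 and quarterround are illustrative helper names, not part of the patch.

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, int n) {
        return (x << n) | (x >> (32 - n));
    }

    /* One quarter round on four words of the 4x4 ChaCha20 state. */
    static void quarterround(uint32_t x[16], int a, int b, int c, int d) {
        x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
        x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
        x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
        x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
    }

    /* Column round followed by diagonal round; ChaCha20 runs ten of these. */
    static void doubleround(uint32_t x[16]) {
        quarterround(x, 0, 4, 8, 12);   /* columns */
        quarterround(x, 1, 5, 9, 13);
        quarterround(x, 2, 6, 10, 14);
        quarterround(x, 3, 7, 11, 15);
        quarterround(x, 0, 5, 10, 15);  /* diagonals */
        quarterround(x, 1, 6, 11, 12);
        quarterround(x, 2, 7, 8, 13);
        quarterround(x, 3, 4, 9, 14);
    }
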
diff --git a/security/nss/lib/freebl/chacha20-ppc64le.S b/security/nss/lib/freebl/chacha20-ppc64le.S
new file mode 100644
index 0000000000..487ff830a5
--- /dev/null
+++ b/security/nss/lib/freebl/chacha20-ppc64le.S
@@ -0,0 +1,668 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


# vs0 - vs15 : buffer for xor
# vs32 - vs47 (v0 - v15) : 4 "converted" states
# vs48 - vs51 (v16 - v19) : original state
# vs52 - vs55 (v20 - v23) : "converted" constants
# vs56 (v24) : "converted" counter
# vs57 (v25) : increment for "converted" counter
# vs60 - vs63 (v28 - v31) : constants for rotate left or vpermxor

#define r0 0
#define sp 1
#define r2 2
#define rSIZE 3
#define rDST 4
#define rSRC 5
#define rKEY 6
#define rNONCE 7
#define rCNTR 8
#define r9 9
#define r10 10
#define r11 11
#define r12 12
#define r13 13
#define r14 14
#define r15 15
#define r16 16
#define r17 17
#define r18 18
#define r19 19
#define r20 20
#define r21 21
#define r22 22
#define r23 23
#define r24 24
#define r25 25
#define r26 26
#define r27 27
#define r28 28
#define r29 29
#define r30 30
#define r31 31

#define v0 0
#define v1 1
#define v2 2
#define v3 3
#define v4 4
#define v5 5
#define v6 6
#define v7 7
#define v8 8
#define v9 9
#define v10 10
#define v11 11
#define v12 12
#define v13 13
#define v14 14
#define v15 15
#define v16 16
#define v17 17
#define v18 18
#define v19 19
#define v20 20
#define v21 21
#define v22 22
#define v23 23
#define v24 24
#define v25 25
#define v26 26
#define v27 27
#define v28 28
#define v29 29
#define v30 30
#define v31 31

#define vs0 0
#define vs1 1
#define vs2 2
#define vs3 3
#define vs4 4
#define vs5 5
#define vs6 6
#define vs7 7
#define vs8 8
#define vs9 9
#define vs10 10
#define vs11 11
#define vs12 12
#define vs13 13
#define vs14 14
#define vs15 15
#define vs16 16
#define vs17 17
#define vs18 18
#define vs19 19
#define vs20 20
#define vs21 21
#define vs22 22
#define vs23 23
#define vs24 24
#define vs25 25
#define vs26 26
#define vs27 27
#define vs28 28
#define vs29 29
#define vs30 30
#define vs31 31
#define vs32 32
#define vs33 33
#define vs34 34
#define vs35 35
#define vs36 36
#define vs37 37
#define vs38 38
#define vs39 39
#define vs40 40
#define vs41 41
#define vs42 42
#define vs43 43
#define vs44 44
#define vs45 45
#define vs46 46
#define vs47 47
#define vs48 48
#define vs49 49
#define vs50 50
#define vs51 51
#define vs52 52
#define vs53 53
#define vs54 54
#define vs55 55
#define vs56 56
#define vs57 57
#define vs58 58
#define vs59 59
#define vs60 60
#define vs61 61
#define vs62 62
#define vs63 63

.abiversion 2
.section ".data"
.align 5
lblock: .skip 256
cnts0:  .long 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
cnts1:  .long 0x61707865, 0x61707865, 0x61707865, 0x61707865
cnts2:  .long 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e
cnts3:  .long 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32
cnts4:  .long 0x6b206574, 0x6b206574, 0x6b206574, 0x6b206574
st4:    .long 0, 0, 0, 0
cntr:   .long 0, 0, 0, 0
incr:   .long 4, 4, 4, 4
rotl1:  .long 0x22330011, 0x66774455, 0xAABB8899, 0xEEFFCCDD
rotl2:  .long 12, 12, 12, 12
rotl3:  .long 0x11223300, 0x55667744, 0x99AABB88, 0xDDEEFFCC
rotl4:  .long 7, 7, 7, 7

.section ".text"
.align 5
.globl chacha20vsx
.type chacha20vsx, @function
chacha20vsx:
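    # Arguments (ELFv2 ABI, r3-r8), renamed by the defines above:
    #   rSIZE  - number of bytes to process
    #   rDST   - output buffer
    #   rSRC   - input buffer
    #   rKEY   - 32-byte key (loaded with two lxvw4x below)
    #   rNONCE - 12-byte nonce (loaded with three lwz below)
    #   rCNTR  - initial 32-bit block counter
    # The input is XORed with the generated keystream and written to rDST,
    # so the same routine serves encryption and decryption.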
    # prologue
    addis 2, r12, .TOC.-chacha20vsx@ha
    addi 2, 2, .TOC.-chacha20vsx@l
    .localentry chacha20vsx, .-chacha20vsx
    std r14, -8(sp)
    std r15, -16(sp)
    std r16, -24(sp)
    std r17, -32(sp)
    std r18, -40(sp)
    std r19, -48(sp)
    std r20, -56(sp)
    std r21, -64(sp)
    std r22, -72(sp)
    std r23, -80(sp)
    std r24, -88(sp)
    std r25, -96(sp)
    std r26, -104(sp)
    std r27, -112(sp)
    std r28, -120(sp)
    std r29, -128(sp)
    std r30, -136(sp)
    std r31, -144(sp)

    addi r14, sp, -160

    li r16, -16
    li r17, -32
    li r18, -48
    li r19, -64
    li r20, -80
    li r21, -96
    li r22, -112
    li r23, -128
    li r24, -144
    li r25, -160
    li r26, -176
    li r27, -192
    li r28, -208

    # save f14, f15
    stxvw4x vs14, 0, r14
    stxvw4x vs15, r16, r14

    # save v20 - v31
    stxvw4x vs52, r17, r14
    stxvw4x vs53, r18, r14
    stxvw4x vs54, r19, r14
    stxvw4x vs55, r20, r14
    stxvw4x vs56, r21, r14
    stxvw4x vs57, r22, r14
    stxvw4x vs58, r23, r14
    stxvw4x vs59, r24, r14
    stxvw4x vs60, r25, r14
    stxvw4x vs61, r26, r14
    stxvw4x vs62, r27, r14
    stxvw4x vs63, r28, r14

    # offset in src/dst
    li r17, 16
    li r18, 32
    li r19, 48
    li r20, 64
    li r21, 80
    li r22, 96
    li r23, 112
    li r24, 128
    li r25, 144
    li r26, 160
    li r27, 176
    li r28, 192
    li r29, 208
    li r30, 224
    li r31, 240

    # load constants' address
    addis r14, 2, cnts0@toc@ha
    addi r14, r14, cnts0@toc@l

    # save nonce to st4
    lwz r15, 0(rNONCE)
    stw r15, 84(r14)
    lwz r15, 4(rNONCE)
    stw r15, 88(r14)
    lwz r15, 8(rNONCE)
    stw r15, 92(r14)

    # load state to vectors
    lxvw4x vs48, 0, r14
    lxvw4x vs49, 0, rKEY
    lxvw4x vs50, r17, rKEY
    lxvw4x vs51, r21, r14

    # load consts for x4 rounds
    lxvw4x vs52, r17, r14
    lxvw4x vs53, r18, r14
    lxvw4x vs54, r19, r14
    lxvw4x vs55, r20, r14

    # counter
    stw rCNTR, 96(r14)
    addi rCNTR, rCNTR, 1
    stw rCNTR, 100(r14)
    addi rCNTR, rCNTR, 1
    stw rCNTR, 104(r14)
    addi rCNTR, rCNTR, 1
    stw rCNTR, 108(r14)
    lxvw4x vs56, r22, r14

    # load increment
    lxvw4x vs57, r23, r14

    # load rotl to vectors
    lxvw4x vs60, r24, r14
    lxvw4x vs61, r25, r14
    lxvw4x vs62, r26, r14
    lxvw4x vs63, r27, r14

    # counter for loop = size/256
    li r15, 256
    divdu. r16, rSIZE, r15
    beq lastblock
    mtctr r16
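
    # Each mainloop iteration produces the keystream for four consecutive
    # 64-byte blocks (256 bytes): v0-v15 hold the working state with state
    # word i of all four blocks in the lanes of vector i, v16-v19 keep the
    # original state, and v24 carries the four block counters, advanced by
    # four (v25) at the end of every iteration.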
mainloop:
    # init 16 vectors (4 states x4)
    vor v0, v20, v20
    vor v1, v21, v21
    vor v2, v22, v22
    vor v3, v23, v23
    vspltw v4, v17, v0
    vspltw v5, v17, v1
    vspltw v6, v17, v2
    vspltw v7, v17, v3
    vspltw v8, v18, v0
    vspltw v9, v18, v1
    vspltw v10, v18, v2
    vspltw v11, v18, v3
    vor v12, v24, v24
    vspltw v13, v19, v1
    vspltw v14, v19, v2
    vspltw v15, v19, v3

.macro _plus a b_y b_x
    vadduwm \a, \a, \b_y*4+(\b_x)%4
    vadduwm \a+1, \a+1, \b_y*4+(\b_x+1)%4
    vadduwm \a+2, \a+2, \b_y*4+(\b_x+2)%4
    vadduwm \a+3, \a+3, \b_y*4+(\b_x+3)%4
.endm

.macro _xor a b_y b_x
    vxor \a, \a, \b_y*4+(\b_x)%4
    vxor \a+1, \a+1, \b_y*4+(\b_x+1)%4
    vxor \a+2, \a+2, \b_y*4+(\b_x+2)%4
    vxor \a+3, \a+3, \b_y*4+(\b_x+3)%4
.endm

.macro _rotl a b
    vrlw \a, \a, \b
    vrlw \a+1, \a+1, \b
    vrlw \a+2, \a+2, \b
    vrlw \a+3, \a+3, \b
.endm

.macro _pxor a b_y b_x c
    vpermxor \a, \a, \b_y*4+(\b_x)%4, \c
    vpermxor \a+1, \a+1, \b_y*4+(\b_x+1)%4, \c
    vpermxor \a+2, \a+2, \b_y*4+(\b_x+2)%4, \c
    vpermxor \a+3, \a+3, \b_y*4+(\b_x+3)%4, \c
.endm

# 00 01 02 03
# 04 05 06 07
# 08 09 10 11
# 12 13 14 15
.macro doubleround
    # column round
    _plus v0, v1, v0        # a+=b
    _pxor v12, v0, v0, v28  # d^=a; d<<<=16
    _plus v8, v3, v0        # c+=d
    _xor v4, v2, v0         # b^=c
    _rotl v4, v29           # b<<<=12
    _plus v0, v1, v0        # a+=b
    _pxor v12, v0, v0, v30  # d^=a; d<<<=8
    _plus v8, v3, v0        # c+=d
    _xor v4, v2, v0         # b^=c
    _rotl v4, v31           # b<<<=7

    # diagonal round
    _plus v0, v1, v1        # a+=b
    _pxor v12, v0, v1, v28  # d^=a; d<<<=16
    _plus v8, v3, v1        # c+=d
    _xor v4, v2, v1         # b^=c
    _rotl v4, v29           # b<<<=12
    _plus v0, v1, v1        # a+=b
    _pxor v12, v0, v1, v30  # d^=a; d<<<=8
    _plus v8, v3, v1        # c+=d
    _xor v4, v2, v1         # b^=c
    _rotl v4, v31           # b<<<=7
.endm

    doubleround # 1
    doubleround # 2
    doubleround # 3
    doubleround # 4
    doubleround # 5
    doubleround # 6
    doubleround # 7
    doubleround # 8
    doubleround # 9
    doubleround # 10

    # counter += original counter
    vadduwm v12, v12, v24

.macro convert a
    vmrgew 26, 0+\a, 1+\a
    vmrgew 27, 2+\a, 3+\a
    vmrgow 0+\a, 0+\a, 1+\a
    vmrgow 2+\a, 2+\a, 3+\a
    xxmrghd 33+\a, 32+\a, 34+\a
    xxmrgld 35+\a, 32+\a, 34+\a
    xxmrghd 32+\a, 58, 59
    xxmrgld 34+\a, 58, 59
.endm

    convert 0
    convert 4
    convert 8
    convert 12

.macro addition a
    vadduwm 0+\a, 0+\a, 16
    vadduwm 4+\a, 4+\a, 17
    vadduwm 8+\a, 8+\a, 18
    vadduwm 12+\a, 12+\a, 19
.endm

    addition 0
    addition 1
    addition 2
    addition 3
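
    # After convert (a 4x4 word regrouping of each group of four vectors)
    # and addition, block n of the 256-byte keystream is held in vs32+n,
    # vs36+n, vs40+n and vs44+n; the xor below therefore pairs vs0-vs3 with
    # vs32/vs36/vs40/vs44, vs4-vs7 with vs33/vs37/vs41/vs45, and so on.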
    # load text/cipher
    lxvw4x vs0, 0, rSRC
    lxvw4x vs1, r17, rSRC
    lxvw4x vs2, r18, rSRC
    lxvw4x vs3, r19, rSRC
    lxvw4x vs4, r20, rSRC
    lxvw4x vs5, r21, rSRC
    lxvw4x vs6, r22, rSRC
    lxvw4x vs7, r23, rSRC
    lxvw4x vs8, r24, rSRC
    lxvw4x vs9, r25, rSRC
    lxvw4x vs10, r26, rSRC
    lxvw4x vs11, r27, rSRC
    lxvw4x vs12, r28, rSRC
    lxvw4x vs13, r29, rSRC
    lxvw4x vs14, r30, rSRC
    lxvw4x vs15, r31, rSRC
    # xor (encrypt/decrypt)
    xxlxor vs0, vs0, vs32
    xxlxor vs1, vs1, vs36
    xxlxor vs2, vs2, vs40
    xxlxor vs3, vs3, vs44
    xxlxor vs4, vs4, vs33
    xxlxor vs5, vs5, vs37
    xxlxor vs6, vs6, vs41
    xxlxor vs7, vs7, vs45
    xxlxor vs8, vs8, vs34
    xxlxor vs9, vs9, vs38
    xxlxor vs10, vs10, vs42
    xxlxor vs11, vs11, vs46
    xxlxor vs12, vs12, vs35
    xxlxor vs13, vs13, vs39
    xxlxor vs14, vs14, vs43
    xxlxor vs15, vs15, vs47
    # store cipher/text
    stxvw4x vs0, 0, rDST
    stxvw4x vs1, r17, rDST
    stxvw4x vs2, r18, rDST
    stxvw4x vs3, r19, rDST
    stxvw4x vs4, r20, rDST
    stxvw4x vs5, r21, rDST
    stxvw4x vs6, r22, rDST
    stxvw4x vs7, r23, rDST
    stxvw4x vs8, r24, rDST
    stxvw4x vs9, r25, rDST
    stxvw4x vs10, r26, rDST
    stxvw4x vs11, r27, rDST
    stxvw4x vs12, r28, rDST
    stxvw4x vs13, r29, rDST
    stxvw4x vs14, r30, rDST
    stxvw4x vs15, r31, rDST

    # src/dst increment
    addi rSRC, rSRC, 256
    addi rDST, rDST, 256

    # counter increment
    vadduwm v24, v24, v25

    bdnz mainloop

lastblock:
    # remainder
    mulld r16, r16, r15
    subf. r16, r16, rSIZE

    # check remainder
    beq exitsub

    addi r14, r14, -256
    # last block x4
    # init 16 vectors (4 states x4)
    vor v0, v20, v20
    vor v1, v21, v21
    vor v2, v22, v22
    vor v3, v23, v23
    vspltw v4, v17, v0
    vspltw v5, v17, v1
    vspltw v6, v17, v2
    vspltw v7, v17, v3
    vspltw v8, v18, v0
    vspltw v9, v18, v1
    vspltw v10, v18, v2
    vspltw v11, v18, v3
    vor v12, v24, v24
    vspltw v13, v19, v1
    vspltw v14, v19, v2
    vspltw v15, v19, v3

    doubleround # 1
    doubleround # 2
    doubleround # 3
    doubleround # 4
    doubleround # 5
    doubleround # 6
    doubleround # 7
    doubleround # 8
    doubleround # 9
    doubleround # 10

    vadduwm v12, v12, v24

    convert 0
    convert 4
    convert 8
    convert 12

    addition 0
    addition 1
    addition 2
    addition 3

    # store vectors
    stxvw4x vs32, 0, r14
    stxvw4x vs36, r17, r14
    stxvw4x vs40, r18, r14
    stxvw4x vs44, r19, r14
    stxvw4x vs33, r20, r14
    stxvw4x vs37, r21, r14
    stxvw4x vs41, r22, r14
    stxvw4x vs45, r23, r14
    stxvw4x vs34, r24, r14
    stxvw4x vs38, r25, r14
    stxvw4x vs42, r26, r14
    stxvw4x vs46, r27, r14
    stxvw4x vs35, r28, r14
    stxvw4x vs39, r29, r14
    stxvw4x vs43, r30, r14
    stxvw4x vs47, r31, r14

    mtctr r16
    addi rSIZE, r14, -1
    addi rSRC, rSRC, -1
    addi rDST, rDST, -1
xorlast:
    lbzu r15, 1(rSIZE)
    lbzu r16, 1(rSRC)
    xor r15, r15, r16
    stbu r15, 1(rDST)
    bdnz xorlast

    # zeroing last block
    xxlxor vs0, vs0, vs0
    stxvw4x vs0, 0, r14
    stxvw4x vs0, r17, r14
    stxvw4x vs0, r18, r14
    stxvw4x vs0, r19, r14
    stxvw4x vs0, r20, r14
    stxvw4x vs0, r21, r14
    stxvw4x vs0, r22, r14
    stxvw4x vs0, r23, r14
    stxvw4x vs0, r24, r14
    stxvw4x vs0, r25, r14
    stxvw4x vs0, r26, r14
    stxvw4x vs0, r27, r14
    stxvw4x vs0, r28, r14
    stxvw4x vs0, r29, r14
    stxvw4x vs0, r30, r14
    stxvw4x vs0, r31, r14

exitsub:
    # zeroing volatile registers
    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    xxlxor vs2, vs2, vs2
    xxlxor vs3, vs3, vs3
    xxlxor vs4, vs4, vs4
    xxlxor vs5, vs5, vs5
    xxlxor vs6, vs6, vs6
    xxlxor vs7, vs7, vs7
    xxlxor vs8, vs8, vs8
    xxlxor vs9, vs9, vs9
    xxlxor vs10, vs10, vs10
    xxlxor vs11, vs11, vs11
    xxlxor vs12, vs12, vs12
    xxlxor vs13, vs13, vs13

    xxlxor vs32, vs32, vs32
    xxlxor vs33, vs33, vs33
    xxlxor vs34, vs34, vs34
    xxlxor vs35, vs35, vs35
    xxlxor vs36, vs36, vs36
    xxlxor vs37, vs37, vs37
    xxlxor vs38, vs38, vs38
    xxlxor vs39, vs39, vs39
    xxlxor vs40, vs40, vs40
    xxlxor vs41, vs41, vs41
    xxlxor vs42, vs42, vs42
    xxlxor vs43, vs43, vs43
    xxlxor vs44, vs44, vs44
    xxlxor vs45, vs45, vs45
    xxlxor vs46, vs46, vs46
    xxlxor vs47, vs47, vs47
    xxlxor vs48, vs48, vs48
    xxlxor vs49, vs49, vs49
    xxlxor vs50, vs50, vs50
    xxlxor vs51, vs51, vs51

    li rSIZE, 0
    li rDST, 0
    li rSRC, 0
    li rKEY, 0
    li rNONCE, 0
    li rCNTR, 0

    # epilogue
    addi r14, sp, -160

    li r16, -16
    li r17, -32
    li r18, -48
    li r19, -64
    li r20, -80
    li r21, -96
    li r22, -112
    li r23, -128
    li r24, -144
    li r25, -160
    li r26, -176
    li r27, -192
    li r28, -208

    # load f14, f15
    lxvw4x vs14, 0, r14
    lxvw4x vs15, r16, r14

    # load v20 - v31
    lxvw4x vs52, r17, r14
    lxvw4x vs53, r18, r14
    lxvw4x vs54, r19, r14
    lxvw4x vs55, r20, r14
    lxvw4x vs56, r21, r14
    lxvw4x vs57, r22, r14
    lxvw4x vs58, r23, r14
    lxvw4x vs59, r24, r14
    lxvw4x vs60, r25, r14
    lxvw4x vs61, r26, r14
    lxvw4x vs62, r27, r14
    lxvw4x vs63, r28, r14

    ld r14, -8(sp)
    ld r15, -16(sp)
    ld r16, -24(sp)
    ld r17, -32(sp)
    ld r18, -40(sp)
    ld r19, -48(sp)
    ld r20, -56(sp)
    ld r21, -64(sp)
    ld r22, -72(sp)
    ld r23, -80(sp)
    ld r24, -88(sp)
    ld r25, -96(sp)
    ld r26, -104(sp)
    ld r27, -112(sp)
    ld r28, -120(sp)
    ld r29, -128(sp)
    ld r30, -136(sp)
    ld r31, -144(sp)

    blr
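
The register defines at the top of the file imply the C-level interface below. This is a sketch of the assumed prototype, reconstructed from the ELFv2 argument registers r3-r8 and from how the assembly uses each argument (64-bit byte count, 32-byte key, 12-byte nonce, 32-bit initial block counter); the exact types in the caller's declaration may differ.

    #include <stddef.h>
    #include <stdint.h>

    /* Assumed prototype for the routine above: processes size bytes from
     * src into dst by XORing them with the ChaCha20 keystream.  Full
     * 256-byte chunks go through mainloop; any trailing partial block is
     * handled by the lastblock path. */
    void chacha20vsx(size_t size, uint8_t *dst, const uint8_t *src,
                     const uint8_t *key,   /* 32 bytes */
                     const uint8_t *nonce, /* 12 bytes */
                     uint32_t counter);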