Diffstat (limited to 'security/nss/lib/freebl/chacha20-ppc64le.S')
-rw-r--r-- | security/nss/lib/freebl/chacha20-ppc64le.S | 668 |
1 file changed, 668 insertions, 0 deletions
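The patch adds a ChaCha20 implementation for ppc64le that uses the VSX vector registers to process four 64-byte blocks (256 bytes of keystream) per iteration of the main loop. For orientation, here is a minimal scalar C sketch of the quarter round and double round that the vectorized doubleround macro in the assembly applies to four states at once; the helper names (rotl32, quarter_round, double_round) are illustrative and are not part of the patch.

#include <stdint.h>

/* Rotate a 32-bit word left by n bits (0 < n < 32). */
static inline uint32_t rotl32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

/* One ChaCha20 quarter round (RFC 8439). The assembly's doubleround macro
 * runs this same sequence, except that each vadduwm/vxor/vrlw/vpermxor
 * operates on a vector register holding the same word of four states. */
static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
    *a += *b; *d ^= *a; *d = rotl32(*d, 16);
    *c += *d; *b ^= *c; *b = rotl32(*b, 12);
    *a += *b; *d ^= *a; *d = rotl32(*d, 8);
    *c += *d; *b ^= *c; *b = rotl32(*b, 7);
}

/* A double round: four column quarter rounds, then four diagonal ones,
 * on the 4x4 word matrix shown in the comment above doubleround. */
static void double_round(uint32_t x[16])
{
    quarter_round(&x[0], &x[4], &x[8],  &x[12]);  /* columns */
    quarter_round(&x[1], &x[5], &x[9],  &x[13]);
    quarter_round(&x[2], &x[6], &x[10], &x[14]);
    quarter_round(&x[3], &x[7], &x[11], &x[15]);
    quarter_round(&x[0], &x[5], &x[10], &x[15]);  /* diagonals */
    quarter_round(&x[1], &x[6], &x[11], &x[12]);
    quarter_round(&x[2], &x[7], &x[8],  &x[13]);
    quarter_round(&x[3], &x[4], &x[9],  &x[14]);
}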
diff --git a/security/nss/lib/freebl/chacha20-ppc64le.S b/security/nss/lib/freebl/chacha20-ppc64le.S new file mode 100644 index 0000000000..487ff830a5 --- /dev/null +++ b/security/nss/lib/freebl/chacha20-ppc64le.S @@ -0,0 +1,668 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +# vs0 - vs15 : buffer for xor +# vs32 - vs47 (v0 - v15) : 4 "converted" states +# vs48 - vs51 (v16 - v19) : original state +# vs52 - vs55 (v20 - v23) : "converted" constants +# vs56 (v24) : "converted" counter +# vs57 (v25) : increment for "converted" counter +# vs60 - vs63 (v28 - v31) : constants for rotate left or vpermxor + +#define r0 0 +#define sp 1 +#define r2 2 +#define rSIZE 3 +#define rDST 4 +#define rSRC 5 +#define rKEY 6 +#define rNONCE 7 +#define rCNTR 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 + +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 +#define vs14 14 +#define vs15 15 +#define vs16 16 +#define vs17 17 +#define vs18 18 +#define vs19 19 +#define vs20 20 +#define vs21 21 +#define vs22 22 +#define vs23 23 +#define vs24 24 +#define vs25 25 +#define vs26 26 +#define vs27 27 +#define vs28 28 +#define vs29 29 +#define vs30 30 +#define vs31 31 +#define vs32 32 +#define vs33 33 +#define vs34 34 +#define vs35 35 +#define vs36 36 +#define vs37 37 +#define vs38 38 +#define vs39 39 +#define vs40 40 +#define vs41 41 +#define vs42 42 +#define vs43 43 +#define vs44 44 +#define vs45 45 +#define vs46 46 +#define vs47 47 +#define vs48 48 +#define vs49 49 +#define vs50 50 +#define vs51 51 +#define vs52 52 +#define vs53 53 +#define vs54 54 +#define vs55 55 +#define vs56 56 +#define vs57 57 +#define vs58 58 +#define vs59 59 +#define vs60 60 +#define vs61 61 +#define vs62 62 +#define vs63 63 + +.abiversion 2 +.section ".data" +.align 5 +lblock: .skip 256 +cnts0: .long 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 +cnts1: .long 0x61707865, 0x61707865, 0x61707865, 0x61707865 +cnts2: .long 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e +cnts3: .long 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32 +cnts4: .long 0x6b206574, 0x6b206574, 0x6b206574, 0x6b206574 +st4: .long 0, 0, 0, 0 +cntr: .long 0, 0, 0, 0 +incr: .long 4, 4, 4, 4 +rotl1: .long 0x22330011, 0x66774455, 0xAABB8899, 0xEEFFCCDD +rotl2: .long 12, 12, 12, 12 +rotl3: .long 0x11223300, 0x55667744, 0x99AABB88, 0xDDEEFFCC +rotl4: .long 7, 7, 7, 7 + +.section ".text" +.align 5 +.globl chacha20vsx +.type chacha20vsx, @function +chacha20vsx: + # 
prologue + addis 2, r12, .TOC.-chacha20vsx@ha + addi 2, 2, .TOC.-chacha20vsx@l + .localentry chacha20vsx, .-chacha20vsx + std r14, -8(sp) + std r15, -16(sp) + std r16, -24(sp) + std r17, -32(sp) + std r18, -40(sp) + std r19, -48(sp) + std r20, -56(sp) + std r21, -64(sp) + std r22, -72(sp) + std r23, -80(sp) + std r24, -88(sp) + std r25, -96(sp) + std r26, -104(sp) + std r27, -112(sp) + std r28, -120(sp) + std r29, -128(sp) + std r30, -136(sp) + std r31, -144(sp) + + addi r14, sp, -160 + + li r16, -16 + li r17, -32 + li r18, -48 + li r19, -64 + li r20, -80 + li r21, -96 + li r22, -112 + li r23, -128 + li r24, -144 + li r25, -160 + li r26, -176 + li r27, -192 + li r28, -208 + + # save f14, f15 + stxvw4x vs14, 0, r14 + stxvw4x vs15, r16, r14 + + # save v20 - v31 + stxvw4x vs52, r17, r14 + stxvw4x vs53, r18, r14 + stxvw4x vs54, r19, r14 + stxvw4x vs55, r20, r14 + stxvw4x vs56, r21, r14 + stxvw4x vs57, r22, r14 + stxvw4x vs58, r23, r14 + stxvw4x vs59, r24, r14 + stxvw4x vs60, r25, r14 + stxvw4x vs61, r26, r14 + stxvw4x vs62, r27, r14 + stxvw4x vs63, r28, r14 + + # offset in src/dst + li r17, 16 + li r18, 32 + li r19, 48 + li r20, 64 + li r21, 80 + li r22, 96 + li r23, 112 + li r24, 128 + li r25, 144 + li r26, 160 + li r27, 176 + li r28, 192 + li r29, 208 + li r30, 224 + li r31, 240 + + # load const's address + addis r14, 2, cnts0@toc@ha + addi r14, r14, cnts0@toc@l + + # save nonce to st4 + lwz r15, 0(rNONCE) + stw r15, 84(r14) + lwz r15, 4(rNONCE) + stw r15, 88(r14) + lwz r15, 8(rNONCE) + stw r15, 92(r14) + + # load state to vectors + lxvw4x vs48, 0, r14 + lxvw4x vs49, 0, rKEY + lxvw4x vs50, r17, rKEY + lxvw4x vs51, r21, r14 + + # load consts for x4 rounds + lxvw4x vs52, r17, r14 + lxvw4x vs53, r18, r14 + lxvw4x vs54, r19, r14 + lxvw4x vs55, r20, r14 + + # counter + stw rCNTR, 96(r14) + addi rCNTR, rCNTR, 1 + stw rCNTR, 100(r14) + addi rCNTR, rCNTR, 1 + stw rCNTR, 104(r14) + addi rCNTR, rCNTR, 1 + stw rCNTR, 108(r14) + lxvw4x vs56, r22, r14 + + # load increment + lxvw4x vs57, r23, r14 + + # load rotl to vectors + lxvw4x vs60, r24, r14 + lxvw4x vs61, r25, r14 + lxvw4x vs62, r26, r14 + lxvw4x vs63, r27, r14 + + # counter for loop = size/256 + li r15, 256 + divdu. 
r16, rSIZE, r15 + beq lastblock + mtctr r16 + +mainloop: + # init 16 vectors (4 states x4) + vor v0, v20, v20 + vor v1, v21, v21 + vor v2, v22, v22 + vor v3, v23, v23 + vspltw v4, v17, v0 + vspltw v5, v17, v1 + vspltw v6, v17, v2 + vspltw v7, v17, v3 + vspltw v8, v18, v0 + vspltw v9, v18, v1 + vspltw v10, v18, v2 + vspltw v11, v18, v3 + vor v12, v24, v24 + vspltw v13, v19, v1 + vspltw v14, v19, v2 + vspltw v15, v19, v3 + +.macro _plus a b_y b_x + vadduwm \a, \a, \b_y*4+(\b_x)%4 + vadduwm \a+1, \a+1, \b_y*4+(\b_x+1)%4 + vadduwm \a+2, \a+2, \b_y*4+(\b_x+2)%4 + vadduwm \a+3, \a+3, \b_y*4+(\b_x+3)%4 +.endm + +.macro _xor a b_y b_x + vxor \a, \a, \b_y*4+(\b_x)%4 + vxor \a+1, \a+1, \b_y*4+(\b_x+1)%4 + vxor \a+2, \a+2, \b_y*4+(\b_x+2)%4 + vxor \a+3, \a+3, \b_y*4+(\b_x+3)%4 +.endm + +.macro _rotl a b + vrlw \a, \a, \b + vrlw \a+1, \a+1, \b + vrlw \a+2, \a+2, \b + vrlw \a+3, \a+3, \b +.endm + +.macro _pxor a b_y b_x c + vpermxor \a, \a, \b_y*4+(\b_x)%4, \c + vpermxor \a+1, \a+1, \b_y*4+(\b_x+1)%4, \c + vpermxor \a+2, \a+2, \b_y*4+(\b_x+2)%4, \c + vpermxor \a+3, \a+3, \b_y*4+(\b_x+3)%4, \c +.endm + +# 00 01 02 03 +# 04 05 06 07 +# 08 09 10 11 +# 12 13 14 15 +.macro doubleround + # column round + _plus v0, v1, v0 # a+=b + _pxor v12, v0, v0, v28 # d^=a; d<<<=16 + _plus v8, v3, v0 # c+=d + _xor v4, v2, v0 # b^=c + _rotl v4, v29 # b<<<=12 + _plus v0, v1, v0 # a+=b + _pxor v12, v0, v0, v30 # d^=a; d<<<=8 + _plus v8, v3, v0 # c+=d + _xor v4, v2, v0 # b^=c + _rotl v4, v31 # b<<<=7 + + # diagonal round + _plus v0, v1, v1 # a+=b + _pxor v12, v0, v1, v28 # d^=a; d<<<=16 + _plus v8, v3, v1 # c+=d + _xor v4, v2, v1 # b^=c + _rotl v4, v29 # b<<<=12 + _plus v0, v1, v1 # a+=b + _pxor v12, v0, v1, v30 # d^=a; d<<<=8 + _plus v8, v3, v1 # c+=d + _xor v4, v2, v1 # b^=c + _rotl v4, v31 # b<<<=7 +.endm + + doubleround # 1 + doubleround # 2 + doubleround # 3 + doubleround # 4 + doubleround # 5 + doubleround # 6 + doubleround # 7 + doubleround # 8 + doubleround # 9 + doubleround # 10 + + # counter += original counter + vadduwm v12, v12, v24 + +.macro convert a + vmrgew 26, 0+\a, 1+\a + vmrgew 27, 2+\a, 3+\a + vmrgow 0+\a, 0+\a, 1+\a + vmrgow 2+\a, 2+\a, 3+\a + xxmrghd 33+\a, 32+\a, 34+\a + xxmrgld 35+\a, 32+\a, 34+\a + xxmrghd 32+\a, 58, 59 + xxmrgld 34+\a, 58, 59 +.endm + + convert 0 + convert 4 + convert 8 + convert 12 + +.macro addition a + vadduwm 0+\a, 0+\a, 16 + vadduwm 4+\a, 4+\a, 17 + vadduwm 8+\a, 8+\a, 18 + vadduwm 12+\a, 12+\a, 19 +.endm + + addition 0 + addition 1 + addition 2 + addition 3 + + # load text/cipher + lxvw4x vs0, 0, rSRC + lxvw4x vs1, r17, rSRC + lxvw4x vs2, r18, rSRC + lxvw4x vs3, r19, rSRC + lxvw4x vs4, r20, rSRC + lxvw4x vs5, r21, rSRC + lxvw4x vs6, r22, rSRC + lxvw4x vs7, r23, rSRC + lxvw4x vs8, r24, rSRC + lxvw4x vs9, r25, rSRC + lxvw4x vs10, r26, rSRC + lxvw4x vs11, r27, rSRC + lxvw4x vs12, r28, rSRC + lxvw4x vs13, r29, rSRC + lxvw4x vs14, r30, rSRC + lxvw4x vs15, r31, rSRC + # xor (encrypt/decrypt) + xxlxor vs0, vs0, vs32 + xxlxor vs1, vs1, vs36 + xxlxor vs2, vs2, vs40 + xxlxor vs3, vs3, vs44 + xxlxor vs4, vs4, vs33 + xxlxor vs5, vs5, vs37 + xxlxor vs6, vs6, vs41 + xxlxor vs7, vs7, vs45 + xxlxor vs8, vs8, vs34 + xxlxor vs9, vs9, vs38 + xxlxor vs10, vs10, vs42 + xxlxor vs11, vs11, vs46 + xxlxor vs12, vs12, vs35 + xxlxor vs13, vs13, vs39 + xxlxor vs14, vs14, vs43 + xxlxor vs15, vs15, vs47 + # store cipher/text + stxvw4x vs0, 0, rDST + stxvw4x vs1, r17, rDST + stxvw4x vs2, r18, rDST + stxvw4x vs3, r19, rDST + stxvw4x vs4, r20, rDST + stxvw4x vs5, r21, rDST + stxvw4x vs6, r22, rDST + stxvw4x 
vs7, r23, rDST + stxvw4x vs8, r24, rDST + stxvw4x vs9, r25, rDST + stxvw4x vs10, r26, rDST + stxvw4x vs11, r27, rDST + stxvw4x vs12, r28, rDST + stxvw4x vs13, r29, rDST + stxvw4x vs14, r30, rDST + stxvw4x vs15, r31, rDST + + # src/dst increment + addi rSRC, rSRC, 256 + addi rDST, rDST, 256 + + # counter increment + vadduwm v24, v24, v25 + + bdnz mainloop + +lastblock: + # reminder + mulld r16, r16, r15 + subf. r16, r16, rSIZE + + # check reminder + beq exitsub + + addi r14, r14, -256 + # last block x4 + # init 16 vectors (4 states x4) + vor v0, v20, v20 + vor v1, v21, v21 + vor v2, v22, v22 + vor v3, v23, v23 + vspltw v4, v17, v0 + vspltw v5, v17, v1 + vspltw v6, v17, v2 + vspltw v7, v17, v3 + vspltw v8, v18, v0 + vspltw v9, v18, v1 + vspltw v10, v18, v2 + vspltw v11, v18, v3 + vor v12, v24, v24 + vspltw v13, v19, v1 + vspltw v14, v19, v2 + vspltw v15, v19, v3 + + doubleround # 1 + doubleround # 2 + doubleround # 3 + doubleround # 4 + doubleround # 5 + doubleround # 6 + doubleround # 7 + doubleround # 8 + doubleround # 9 + doubleround # 10 + + vadduwm v12, v12, v24 + + convert 0 + convert 4 + convert 8 + convert 12 + + addition 0 + addition 1 + addition 2 + addition 3 + + # store vectors + stxvw4x vs32, 0, r14 + stxvw4x vs36, r17, r14 + stxvw4x vs40, r18, r14 + stxvw4x vs44, r19, r14 + stxvw4x vs33, r20, r14 + stxvw4x vs37, r21, r14 + stxvw4x vs41, r22, r14 + stxvw4x vs45, r23, r14 + stxvw4x vs34, r24, r14 + stxvw4x vs38, r25, r14 + stxvw4x vs42, r26, r14 + stxvw4x vs46, r27, r14 + stxvw4x vs35, r28, r14 + stxvw4x vs39, r29, r14 + stxvw4x vs43, r30, r14 + stxvw4x vs47, r31, r14 + + mtctr r16 + addi rSIZE, r14, -1 + addi rSRC, rSRC, -1 + addi rDST, rDST, -1 +xorlast: + lbzu r15, 1(rSIZE) + lbzu r16, 1(rSRC) + xor r15, r15, r16 + stbu r15, 1(rDST) + bdnz xorlast + + # zeroing last block + xxlxor vs0, vs0, vs0 + stxvw4x vs0, 0, r14 + stxvw4x vs0, r17, r14 + stxvw4x vs0, r18, r14 + stxvw4x vs0, r19, r14 + stxvw4x vs0, r20, r14 + stxvw4x vs0, r21, r14 + stxvw4x vs0, r22, r14 + stxvw4x vs0, r23, r14 + stxvw4x vs0, r24, r14 + stxvw4x vs0, r25, r14 + stxvw4x vs0, r26, r14 + stxvw4x vs0, r27, r14 + stxvw4x vs0, r28, r14 + stxvw4x vs0, r29, r14 + stxvw4x vs0, r30, r14 + stxvw4x vs0, r31, r14 + +exitsub: + # zeroing volatile registers + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + xxlxor vs8, vs8, vs8 + xxlxor vs9, vs9, vs9 + xxlxor vs10, vs10, vs10 + xxlxor vs11, vs11, vs11 + xxlxor vs12, vs12, vs12 + xxlxor vs13, vs13, vs13 + + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + + li rSIZE, 0 + li rDST, 0 + li rSRC, 0 + li rKEY, 0 + li rNONCE, 0 + li rCNTR, 0 + + # epilogue + addi r14, sp, -160 + + li r16, -16 + li r17, -32 + li r18, -48 + li r19, -64 + li r20, -80 + li r21, -96 + li r22, -112 + li r23, -128 + li r24, -144 + li r25, -160 + li r26, -176 + li r27, -192 + li r28, -208 + + # load f14, f15 + lxvw4x vs14, 0, r14 + lxvw4x vs15, r16, r14 + + # load v20 - v31 + lxvw4x vs52, r17, r14 + lxvw4x 
vs53, r18, r14
+ lxvw4x vs54, r19, r14
+ lxvw4x vs55, r20, r14
+ lxvw4x vs56, r21, r14
+ lxvw4x vs57, r22, r14
+ lxvw4x vs58, r23, r14
+ lxvw4x vs59, r24, r14
+ lxvw4x vs60, r25, r14
+ lxvw4x vs61, r26, r14
+ lxvw4x vs62, r27, r14
+ lxvw4x vs63, r28, r14
+
+ ld r14, -8(sp)
+ ld r15, -16(sp)
+ ld r16, -24(sp)
+ ld r17, -32(sp)
+ ld r18, -40(sp)
+ ld r19, -48(sp)
+ ld r20, -56(sp)
+ ld r21, -64(sp)
+ ld r22, -72(sp)
+ ld r23, -80(sp)
+ ld r24, -88(sp)
+ ld r25, -96(sp)
+ ld r26, -104(sp)
+ ld r27, -112(sp)
+ ld r28, -120(sp)
+ ld r29, -128(sp)
+ ld r30, -136(sp)
+ ld r31, -144(sp)
+
+ blr
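How the routine is called is implied by the register aliases at the top of the file: under the ELFv2 ABI, r3 through r8 carry the first six integer arguments, so rSIZE, rDST, rSRC, rKEY, rNONCE and rCNTR suggest a prototype along the lines of the sketch below. This is an inference from the assembly, not necessarily the declaration freebl uses; treat the names and types as illustrative.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical prototype inferred from the register aliases
 * (rSIZE=r3, rDST=r4, rSRC=r5, rKEY=r6, rNONCE=r7, rCNTR=r8)
 * and the ELFv2 argument order; the real declaration may differ. */
extern void chacha20vsx(size_t len, uint8_t *dst, const uint8_t *src,
                        const uint8_t *key,    /* 32 bytes */
                        const uint8_t *nonce,  /* 12 bytes */
                        uint32_t counter);

/* ChaCha20 is a stream cipher, so the same call encrypts and decrypts. */
static void xor_with_keystream(uint8_t *out, const uint8_t *in, size_t len,
                               const uint8_t key[32], const uint8_t nonce[12],
                               uint32_t initial_counter)
{
    chacha20vsx(len, out, in, key, nonce, initial_counter);
}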