// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * First row of the ChaCha state: the 16 ASCII bytes "expand 32-byte k",
 * written as a single 128-bit value whose little-endian in-memory layout
 * spells that string. The 16-byte alignment is required because the code
 * below loads it with movaps, which needs an aligned memory operand.
 */
.section	.rodata, "a"
.align 16
CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
.text

/*
 * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
 * of blocks of output with a nonce of 0, taking an input key and 8-byte
 * counter. Importantly does not spill to the stack. Its arguments are:
 *
 *	rdi: output bytes
 *	rsi: 32-byte key input
 *	rdx: 8-byte counter input/output
 *	rcx: number of 64-byte blocks to write to output
 *
 * Notes on the contract, as implemented below:
 *
 *  - rcx must be at least 1. The block loop tests its count only after a block
 *    has been written, so a count of 0 would wrap instead of writing nothing.
 *  - The key and output are accessed with unaligned moves (movups), so neither
 *    needs any particular alignment. Only CONSTANTS is loaded with movaps.
 *  - The counter is treated as a single 64-bit value: it is incremented once
 *    per block with paddq and the final value is stored back through rdx. The
 *    upper 64 bits of the last state row (the nonce) stay zero throughout.
 *  - Each block runs 10 iterations of a column round followed by a diagonal
 *    round, i.e. 20 rounds.
 *
 * Clobbers: rax, rcx (0 on return), rdi (advanced past the last block written),
 * xmm0-xmm9 and the flags. rsi and rdx are left unmodified. On return every
 * xmm register that held key or keystream material has been zeroed.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

.set	output,		%rdi
.set	key,		%rsi
.set	counter,	%rdx
.set	nblocks,	%rcx
/* i shares rax with the scratch use below; rax is dead once `one` is set up. */
.set	i,		%al
/* xmm registers are *not* callee-save. */
.set	temp,		%xmm0
.set	state0,		%xmm1
.set	state1,		%xmm2
.set	state2,		%xmm3
.set	state3,		%xmm4
.set	copy0,		%xmm5
.set	copy1,		%xmm6
.set	copy2,		%xmm7
.set	copy3,		%xmm8
.set	one,		%xmm9

	/* copy0 = "expand 32-byte k" */
	movaps		CONSTANTS(%rip),copy0
	/* copy1,copy2 = key */
	movups		0x00(key),copy1
	movups		0x10(key),copy2
	/* copy3 = counter || zero nonce (movq from memory clears the upper half) */
	movq		0x00(counter),copy3
	/* one = 1 || 0, so that paddq only ever bumps the counter half */
	movq		$1,%rax
	movq		%rax,one

.Lblock:
	/* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
	movdqa		copy0,state0
	movdqa		copy1,state1
	movdqa		copy2,state2
	movdqa		copy3,state3

	/* 10 double rounds per block */
	movb		$10,i
.Lpermute:
	/*
	 * Column round: each xmm lane is one column of the 4x4 state, so the
	 * four quarter rounds are computed in parallel. There is no packed
	 * rotate here, so every rotl32 is built from shift-left, shift-right
	 * and or, using temp as scratch.
	 */

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$16,temp
	psrld		$16,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$12,temp
	psrld		$20,state1
	por		temp,state1

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$8,temp
	psrld		$24,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$7,temp
	psrld		$25,state1
	por		temp,state1

	/* Rotate rows 1-3 by 1, 2 and 3 lanes so the diagonals line up as columns. */

	/* state1[0,1,2,3] = state1[1,2,3,0] */
	pshufd		$0x39,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	pshufd		$0x93,state3,state3

	/* Diagonal round: same four steps as above on the rotated rows. */

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$16,temp
	psrld		$16,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$12,temp
	psrld		$20,state1
	por		temp,state1

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$8,temp
	psrld		$24,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$7,temp
	psrld		$25,state1
	por		temp,state1

	/* Undo the lane rotation so the rows are back in column order. */

	/* state1[0,1,2,3] = state1[3,0,1,2] */
	pshufd		$0x93,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	pshufd		$0x39,state3,state3

	decb		i
	jnz		.Lpermute

	/* output0 = state0 + copy0 */
	paddd		copy0,state0
	movups		state0,0x00(output)
	/* output1 = state1 + copy1 */
	paddd		copy1,state1
	movups		state1,0x10(output)
	/* output2 = state2 + copy2 */
	paddd		copy2,state2
	movups		state2,0x20(output)
	/* output3 = state3 + copy3 */
	paddd		copy3,state3
	movups		state3,0x30(output)

	/* ++copy3.counter */
	paddq		one,copy3

	/* output += 64, --nblocks */
	addq		$64,output
	decq		nblocks
	jnz		.Lblock

	/* counter = copy3.counter */
	movq		copy3,0x00(counter)

	/* Zero out the potentially sensitive regs, in case nothing uses these again. */
	pxor		state0,state0
	pxor		state1,state1
	pxor		state2,state2
	pxor		state3,state3
	pxor		copy1,copy1
	pxor		copy2,copy2
	pxor		temp,temp

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)