Diffstat (limited to ''):
 -rw-r--r--  src/libcryptobox/chacha20/avx.S        |  614
 -rw-r--r--  src/libcryptobox/chacha20/avx2.S       | 1018
 -rw-r--r--  src/libcryptobox/chacha20/chacha.c     |  262
 -rw-r--r--  src/libcryptobox/chacha20/chacha.h     |   87
 -rw-r--r--  src/libcryptobox/chacha20/constants.S  |    6
 -rw-r--r--  src/libcryptobox/chacha20/ref.c        |  272
 -rw-r--r--  src/libcryptobox/chacha20/sse2.S       |  734
7 files changed, 2993 insertions, 0 deletions
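
Before the diff body, a note on the public interface: chacha.h (below) exposes a streaming API of chacha_init(), chacha_update() and chacha_final(), plus one-shot chacha() and xchacha() helpers. A minimal usage sketch against that header; the all-zero key and nonce here are placeholders for illustration, not real key material:

    #include <string.h>
    #include "chacha.h"

    int chacha_example(void)
    {
        chacha_key key;                  /* 32-byte key (zeroed placeholder) */
        chacha_iv iv;                    /* 8-byte nonce (zeroed placeholder) */
        chacha_state S;
        unsigned char msg[100];
        unsigned char ct[sizeof(msg)];   /* stream cipher: output length == input length */
        size_t n = 0;

        memset(&key, 0, sizeof(key));
        memset(&iv, 0, sizeof(iv));
        memset(msg, 'x', sizeof(msg));

        chacha_init(&S, &key, &iv, 20);  /* 20 rounds = standard ChaCha20 */
        /* processes whole 64-byte blocks, buffers the trailing partial block */
        n += chacha_update(&S, msg, ct, sizeof(msg));
        /* flushes the buffered partial block */
        n += chacha_final(&S, ct + n);
        return n == sizeof(msg);
    }

The two calls together emit exactly inlen bytes, and chacha_final() also wipes the state via rspamd_explicit_memzero(), as chacha.c below shows.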
diff --git a/src/libcryptobox/chacha20/avx.S b/src/libcryptobox/chacha20/avx.S new file mode 100644 index 0000000..7689b84 --- /dev/null +++ b/src/libcryptobox/chacha20/avx.S @@ -0,0 +1,614 @@ +#include "../macro.S" +#include "constants.S" +SECTION_TEXT + +GLOBAL_HIDDEN_FN chacha_blocks_avx +chacha_blocks_avx_local: +pushq %rbx +pushq %rbp +movq %rsp, %rbp +andq $~63, %rsp +subq $512, %rsp +LOAD_VAR_PIC chacha_constants, %rax +vmovdqa 0(%rax), %xmm8 +vmovdqa 16(%rax), %xmm6 +vmovdqa 32(%rax), %xmm7 +vmovdqu 0(%rdi), %xmm9 +vmovdqu 16(%rdi), %xmm10 +vmovdqu 32(%rdi), %xmm11 +movq 48(%rdi), %rax +movq $1, %r9 +vmovdqa %xmm8, 0(%rsp) +vmovdqa %xmm9, 16(%rsp) +vmovdqa %xmm10, 32(%rsp) +vmovdqa %xmm11, 48(%rsp) +vmovdqa %xmm6, 80(%rsp) +vmovdqa %xmm7, 96(%rsp) +movq %rax, 64(%rsp) +cmpq $256, %rcx +jb chacha_blocks_avx_below256 +vpshufd $0x00, %xmm8, %xmm0 +vpshufd $0x55, %xmm8, %xmm1 +vpshufd $0xaa, %xmm8, %xmm2 +vpshufd $0xff, %xmm8, %xmm3 +vmovdqa %xmm0, 128(%rsp) +vmovdqa %xmm1, 144(%rsp) +vmovdqa %xmm2, 160(%rsp) +vmovdqa %xmm3, 176(%rsp) +vpshufd $0x00, %xmm9, %xmm0 +vpshufd $0x55, %xmm9, %xmm1 +vpshufd $0xaa, %xmm9, %xmm2 +vpshufd $0xff, %xmm9, %xmm3 +vmovdqa %xmm0, 192(%rsp) +vmovdqa %xmm1, 208(%rsp) +vmovdqa %xmm2, 224(%rsp) +vmovdqa %xmm3, 240(%rsp) +vpshufd $0x00, %xmm10, %xmm0 +vpshufd $0x55, %xmm10, %xmm1 +vpshufd $0xaa, %xmm10, %xmm2 +vpshufd $0xff, %xmm10, %xmm3 +vmovdqa %xmm0, 256(%rsp) +vmovdqa %xmm1, 272(%rsp) +vmovdqa %xmm2, 288(%rsp) +vmovdqa %xmm3, 304(%rsp) +vpshufd $0xaa, %xmm11, %xmm0 +vpshufd $0xff, %xmm11, %xmm1 +vmovdqa %xmm0, 352(%rsp) +vmovdqa %xmm1, 368(%rsp) +jmp chacha_blocks_avx_atleast256 +.p2align 6,,63 +nop +nop +nop +nop +nop +chacha_blocks_avx_atleast256: +movq 48(%rsp), %rax +leaq 1(%rax), %r8 +leaq 2(%rax), %r9 +leaq 3(%rax), %r10 +leaq 4(%rax), %rbx +movl %eax, 320(%rsp) +movl %r8d, 4+320(%rsp) +movl %r9d, 8+320(%rsp) +movl %r10d, 12+320(%rsp) +shrq $32, %rax +shrq $32, %r8 +shrq $32, %r9 +shrq $32, %r10 +movl %eax, 336(%rsp) +movl %r8d, 4+336(%rsp) +movl %r9d, 8+336(%rsp) +movl %r10d, 12+336(%rsp) +movq %rbx, 48(%rsp) +movq 64(%rsp), %rax +vmovdqa 128(%rsp), %xmm0 +vmovdqa 144(%rsp), %xmm1 +vmovdqa 160(%rsp), %xmm2 +vmovdqa 176(%rsp), %xmm3 +vmovdqa 192(%rsp), %xmm4 +vmovdqa 208(%rsp), %xmm5 +vmovdqa 224(%rsp), %xmm6 +vmovdqa 240(%rsp), %xmm7 +vmovdqa 256(%rsp), %xmm8 +vmovdqa 272(%rsp), %xmm9 +vmovdqa 288(%rsp), %xmm10 +vmovdqa 304(%rsp), %xmm11 +vmovdqa 320(%rsp), %xmm12 +vmovdqa 336(%rsp), %xmm13 +vmovdqa 352(%rsp), %xmm14 +vmovdqa 368(%rsp), %xmm15 +chacha_blocks_avx_mainloop1: +vpaddd %xmm0, %xmm4, %xmm0 +vpaddd %xmm1, %xmm5, %xmm1 +vpxor %xmm12, %xmm0, %xmm12 +vpxor %xmm13, %xmm1, %xmm13 +vpaddd %xmm2, %xmm6, %xmm2 +vpaddd %xmm3, %xmm7, %xmm3 +vpxor %xmm14, %xmm2, %xmm14 +vpxor %xmm15, %xmm3, %xmm15 +vpshufb 80(%rsp), %xmm12, %xmm12 +vpshufb 80(%rsp), %xmm13, %xmm13 +vpaddd %xmm8, %xmm12, %xmm8 +vpaddd %xmm9, %xmm13, %xmm9 +vpshufb 80(%rsp), %xmm14, %xmm14 +vpshufb 80(%rsp), %xmm15, %xmm15 +vpaddd %xmm10, %xmm14, %xmm10 +vpaddd %xmm11, %xmm15, %xmm11 +vmovdqa %xmm12, 112(%rsp) +vpxor %xmm4, %xmm8, %xmm4 +vpxor %xmm5, %xmm9, %xmm5 +vpslld $ 12, %xmm4, %xmm12 +vpsrld $20, %xmm4, %xmm4 +vpxor %xmm4, %xmm12, %xmm4 +vpslld $ 12, %xmm5, %xmm12 +vpsrld $20, %xmm5, %xmm5 +vpxor %xmm5, %xmm12, %xmm5 +vpxor %xmm6, %xmm10, %xmm6 +vpxor %xmm7, %xmm11, %xmm7 +vpslld $ 12, %xmm6, %xmm12 +vpsrld $20, %xmm6, %xmm6 +vpxor %xmm6, %xmm12, %xmm6 +vpslld $ 12, %xmm7, %xmm12 +vpsrld $20, %xmm7, %xmm7 +vpxor %xmm7, %xmm12, %xmm7 +vpaddd %xmm0, %xmm4, %xmm0 +vpaddd 
%xmm1, %xmm5, %xmm1 +vpxor 112(%rsp), %xmm0, %xmm12 +vpxor %xmm13, %xmm1, %xmm13 +vpaddd %xmm2, %xmm6, %xmm2 +vpaddd %xmm3, %xmm7, %xmm3 +vpxor %xmm14, %xmm2, %xmm14 +vpxor %xmm15, %xmm3, %xmm15 +vpshufb 96(%rsp), %xmm12, %xmm12 +vpshufb 96(%rsp), %xmm13, %xmm13 +vpaddd %xmm8, %xmm12, %xmm8 +vpaddd %xmm9, %xmm13, %xmm9 +vpshufb 96(%rsp), %xmm14, %xmm14 +vpshufb 96(%rsp), %xmm15, %xmm15 +vpaddd %xmm10, %xmm14, %xmm10 +vpaddd %xmm11, %xmm15, %xmm11 +vmovdqa %xmm12, 112(%rsp) +vpxor %xmm4, %xmm8, %xmm4 +vpxor %xmm5, %xmm9, %xmm5 +vpslld $ 7, %xmm4, %xmm12 +vpsrld $25, %xmm4, %xmm4 +vpxor %xmm4, %xmm12, %xmm4 +vpslld $ 7, %xmm5, %xmm12 +vpsrld $25, %xmm5, %xmm5 +vpxor %xmm5, %xmm12, %xmm5 +vpxor %xmm6, %xmm10, %xmm6 +vpxor %xmm7, %xmm11, %xmm7 +vpslld $ 7, %xmm6, %xmm12 +vpsrld $25, %xmm6, %xmm6 +vpxor %xmm6, %xmm12, %xmm6 +vpslld $ 7, %xmm7, %xmm12 +vpsrld $25, %xmm7, %xmm7 +vpxor %xmm7, %xmm12, %xmm7 +vpaddd %xmm0, %xmm5, %xmm0 +vpaddd %xmm1, %xmm6, %xmm1 +vpxor %xmm15, %xmm0, %xmm15 +vpxor 112(%rsp), %xmm1, %xmm12 +vpaddd %xmm2, %xmm7, %xmm2 +vpaddd %xmm3, %xmm4, %xmm3 +vpxor %xmm13, %xmm2, %xmm13 +vpxor %xmm14, %xmm3, %xmm14 +vpshufb 80(%rsp), %xmm15, %xmm15 +vpshufb 80(%rsp), %xmm12, %xmm12 +vpaddd %xmm10, %xmm15, %xmm10 +vpaddd %xmm11, %xmm12, %xmm11 +vpshufb 80(%rsp), %xmm13, %xmm13 +vpshufb 80(%rsp), %xmm14, %xmm14 +vpaddd %xmm8, %xmm13, %xmm8 +vpaddd %xmm9, %xmm14, %xmm9 +vmovdqa %xmm15, 112(%rsp) +vpxor %xmm5, %xmm10, %xmm5 +vpxor %xmm6, %xmm11, %xmm6 +vpslld $ 12, %xmm5, %xmm15 +vpsrld $20, %xmm5, %xmm5 +vpxor %xmm5, %xmm15, %xmm5 +vpslld $ 12, %xmm6, %xmm15 +vpsrld $20, %xmm6, %xmm6 +vpxor %xmm6, %xmm15, %xmm6 +vpxor %xmm7, %xmm8, %xmm7 +vpxor %xmm4, %xmm9, %xmm4 +vpslld $ 12, %xmm7, %xmm15 +vpsrld $20, %xmm7, %xmm7 +vpxor %xmm7, %xmm15, %xmm7 +vpslld $ 12, %xmm4, %xmm15 +vpsrld $20, %xmm4, %xmm4 +vpxor %xmm4, %xmm15, %xmm4 +vpaddd %xmm0, %xmm5, %xmm0 +vpaddd %xmm1, %xmm6, %xmm1 +vpxor 112(%rsp), %xmm0, %xmm15 +vpxor %xmm12, %xmm1, %xmm12 +vpaddd %xmm2, %xmm7, %xmm2 +vpaddd %xmm3, %xmm4, %xmm3 +vpxor %xmm13, %xmm2, %xmm13 +vpxor %xmm14, %xmm3, %xmm14 +vpshufb 96(%rsp), %xmm15, %xmm15 +vpshufb 96(%rsp), %xmm12, %xmm12 +vpaddd %xmm10, %xmm15, %xmm10 +vpaddd %xmm11, %xmm12, %xmm11 +vpshufb 96(%rsp), %xmm13, %xmm13 +vpshufb 96(%rsp), %xmm14, %xmm14 +vpaddd %xmm8, %xmm13, %xmm8 +vpaddd %xmm9, %xmm14, %xmm9 +vmovdqa %xmm15, 112(%rsp) +vpxor %xmm5, %xmm10, %xmm5 +vpxor %xmm6, %xmm11, %xmm6 +vpslld $ 7, %xmm5, %xmm15 +vpsrld $25, %xmm5, %xmm5 +vpxor %xmm5, %xmm15, %xmm5 +vpslld $ 7, %xmm6, %xmm15 +vpsrld $25, %xmm6, %xmm6 +vpxor %xmm6, %xmm15, %xmm6 +vpxor %xmm7, %xmm8, %xmm7 +vpxor %xmm4, %xmm9, %xmm4 +vpslld $ 7, %xmm7, %xmm15 +vpsrld $25, %xmm7, %xmm7 +vpxor %xmm7, %xmm15, %xmm7 +vpslld $ 7, %xmm4, %xmm15 +vpsrld $25, %xmm4, %xmm4 +vpxor %xmm4, %xmm15, %xmm4 +vmovdqa 112(%rsp), %xmm15 +subq $2, %rax +jnz chacha_blocks_avx_mainloop1 +vpaddd 128(%rsp), %xmm0, %xmm0 +vpaddd 144(%rsp), %xmm1, %xmm1 +vpaddd 160(%rsp), %xmm2, %xmm2 +vpaddd 176(%rsp), %xmm3, %xmm3 +vpaddd 192(%rsp), %xmm4, %xmm4 +vpaddd 208(%rsp), %xmm5, %xmm5 +vpaddd 224(%rsp), %xmm6, %xmm6 +vpaddd 240(%rsp), %xmm7, %xmm7 +vpaddd 256(%rsp), %xmm8, %xmm8 +vpaddd 272(%rsp), %xmm9, %xmm9 +vpaddd 288(%rsp), %xmm10, %xmm10 +vpaddd 304(%rsp), %xmm11, %xmm11 +vpaddd 320(%rsp), %xmm12, %xmm12 +vpaddd 336(%rsp), %xmm13, %xmm13 +vpaddd 352(%rsp), %xmm14, %xmm14 +vpaddd 368(%rsp), %xmm15, %xmm15 +vmovdqa %xmm8, 384(%rsp) +vmovdqa %xmm9, 400(%rsp) +vmovdqa %xmm10, 416(%rsp) +vmovdqa %xmm11, 432(%rsp) +vmovdqa %xmm12, 448(%rsp) +vmovdqa 
%xmm13, 464(%rsp) +vmovdqa %xmm14, 480(%rsp) +vmovdqa %xmm15, 496(%rsp) +vpunpckldq %xmm1, %xmm0, %xmm8 +vpunpckldq %xmm3, %xmm2, %xmm9 +vpunpckhdq %xmm1, %xmm0, %xmm12 +vpunpckhdq %xmm3, %xmm2, %xmm13 +vpunpckldq %xmm5, %xmm4, %xmm10 +vpunpckldq %xmm7, %xmm6, %xmm11 +vpunpckhdq %xmm5, %xmm4, %xmm14 +vpunpckhdq %xmm7, %xmm6, %xmm15 +vpunpcklqdq %xmm9, %xmm8, %xmm0 +vpunpcklqdq %xmm11, %xmm10, %xmm1 +vpunpckhqdq %xmm9, %xmm8, %xmm2 +vpunpckhqdq %xmm11, %xmm10, %xmm3 +vpunpcklqdq %xmm13, %xmm12, %xmm4 +vpunpcklqdq %xmm15, %xmm14, %xmm5 +vpunpckhqdq %xmm13, %xmm12, %xmm6 +vpunpckhqdq %xmm15, %xmm14, %xmm7 +andq %rsi, %rsi +jz chacha_blocks_avx_noinput1 +vpxor 0(%rsi), %xmm0, %xmm0 +vpxor 16(%rsi), %xmm1, %xmm1 +vpxor 64(%rsi), %xmm2, %xmm2 +vpxor 80(%rsi), %xmm3, %xmm3 +vpxor 128(%rsi), %xmm4, %xmm4 +vpxor 144(%rsi), %xmm5, %xmm5 +vpxor 192(%rsi), %xmm6, %xmm6 +vpxor 208(%rsi), %xmm7, %xmm7 +vmovdqu %xmm0, 0(%rdx) +vmovdqu %xmm1, 16(%rdx) +vmovdqu %xmm2, 64(%rdx) +vmovdqu %xmm3, 80(%rdx) +vmovdqu %xmm4, 128(%rdx) +vmovdqu %xmm5, 144(%rdx) +vmovdqu %xmm6, 192(%rdx) +vmovdqu %xmm7, 208(%rdx) +vmovdqa 384(%rsp), %xmm0 +vmovdqa 400(%rsp), %xmm1 +vmovdqa 416(%rsp), %xmm2 +vmovdqa 432(%rsp), %xmm3 +vmovdqa 448(%rsp), %xmm4 +vmovdqa 464(%rsp), %xmm5 +vmovdqa 480(%rsp), %xmm6 +vmovdqa 496(%rsp), %xmm7 +vpunpckldq %xmm1, %xmm0, %xmm8 +vpunpckldq %xmm3, %xmm2, %xmm9 +vpunpckhdq %xmm1, %xmm0, %xmm12 +vpunpckhdq %xmm3, %xmm2, %xmm13 +vpunpckldq %xmm5, %xmm4, %xmm10 +vpunpckldq %xmm7, %xmm6, %xmm11 +vpunpckhdq %xmm5, %xmm4, %xmm14 +vpunpckhdq %xmm7, %xmm6, %xmm15 +vpunpcklqdq %xmm9, %xmm8, %xmm0 +vpunpcklqdq %xmm11, %xmm10, %xmm1 +vpunpckhqdq %xmm9, %xmm8, %xmm2 +vpunpckhqdq %xmm11, %xmm10, %xmm3 +vpunpcklqdq %xmm13, %xmm12, %xmm4 +vpunpcklqdq %xmm15, %xmm14, %xmm5 +vpunpckhqdq %xmm13, %xmm12, %xmm6 +vpunpckhqdq %xmm15, %xmm14, %xmm7 +vpxor 32(%rsi), %xmm0, %xmm0 +vpxor 48(%rsi), %xmm1, %xmm1 +vpxor 96(%rsi), %xmm2, %xmm2 +vpxor 112(%rsi), %xmm3, %xmm3 +vpxor 160(%rsi), %xmm4, %xmm4 +vpxor 176(%rsi), %xmm5, %xmm5 +vpxor 224(%rsi), %xmm6, %xmm6 +vpxor 240(%rsi), %xmm7, %xmm7 +vmovdqu %xmm0, 32(%rdx) +vmovdqu %xmm1, 48(%rdx) +vmovdqu %xmm2, 96(%rdx) +vmovdqu %xmm3, 112(%rdx) +vmovdqu %xmm4, 160(%rdx) +vmovdqu %xmm5, 176(%rdx) +vmovdqu %xmm6, 224(%rdx) +vmovdqu %xmm7, 240(%rdx) +addq $256, %rsi +jmp chacha_blocks_avx_mainloop_cont +chacha_blocks_avx_noinput1: +vmovdqu %xmm0, 0(%rdx) +vmovdqu %xmm1, 16(%rdx) +vmovdqu %xmm2, 64(%rdx) +vmovdqu %xmm3, 80(%rdx) +vmovdqu %xmm4, 128(%rdx) +vmovdqu %xmm5, 144(%rdx) +vmovdqu %xmm6, 192(%rdx) +vmovdqu %xmm7, 208(%rdx) +vmovdqa 384(%rsp), %xmm0 +vmovdqa 400(%rsp), %xmm1 +vmovdqa 416(%rsp), %xmm2 +vmovdqa 432(%rsp), %xmm3 +vmovdqa 448(%rsp), %xmm4 +vmovdqa 464(%rsp), %xmm5 +vmovdqa 480(%rsp), %xmm6 +vmovdqa 496(%rsp), %xmm7 +vpunpckldq %xmm1, %xmm0, %xmm8 +vpunpckldq %xmm3, %xmm2, %xmm9 +vpunpckhdq %xmm1, %xmm0, %xmm12 +vpunpckhdq %xmm3, %xmm2, %xmm13 +vpunpckldq %xmm5, %xmm4, %xmm10 +vpunpckldq %xmm7, %xmm6, %xmm11 +vpunpckhdq %xmm5, %xmm4, %xmm14 +vpunpckhdq %xmm7, %xmm6, %xmm15 +vpunpcklqdq %xmm9, %xmm8, %xmm0 +vpunpcklqdq %xmm11, %xmm10, %xmm1 +vpunpckhqdq %xmm9, %xmm8, %xmm2 +vpunpckhqdq %xmm11, %xmm10, %xmm3 +vpunpcklqdq %xmm13, %xmm12, %xmm4 +vpunpcklqdq %xmm15, %xmm14, %xmm5 +vpunpckhqdq %xmm13, %xmm12, %xmm6 +vpunpckhqdq %xmm15, %xmm14, %xmm7 +vmovdqu %xmm0, 32(%rdx) +vmovdqu %xmm1, 48(%rdx) +vmovdqu %xmm2, 96(%rdx) +vmovdqu %xmm3, 112(%rdx) +vmovdqu %xmm4, 160(%rdx) +vmovdqu %xmm5, 176(%rdx) +vmovdqu %xmm6, 224(%rdx) +vmovdqu %xmm7, 240(%rdx) 
+chacha_blocks_avx_mainloop_cont: +addq $256, %rdx +subq $256, %rcx +cmp $256, %rcx +jae chacha_blocks_avx_atleast256 +vmovdqa 80(%rsp), %xmm6 +vmovdqa 96(%rsp), %xmm7 +vmovdqa 0(%rsp), %xmm8 +vmovdqa 16(%rsp), %xmm9 +vmovdqa 32(%rsp), %xmm10 +vmovdqa 48(%rsp), %xmm11 +movq $1, %r9 +chacha_blocks_avx_below256: +vmovq %r9, %xmm5 +andq %rcx, %rcx +jz chacha_blocks_avx_done +cmpq $64, %rcx +jae chacha_blocks_avx_above63 +movq %rdx, %r9 +andq %rsi, %rsi +jz chacha_blocks_avx_noinput2 +movq %rcx, %r10 +movq %rsp, %rdx +addq %r10, %rsi +addq %r10, %rdx +negq %r10 +chacha_blocks_avx_copyinput: +movb (%rsi, %r10), %al +movb %al, (%rdx, %r10) +incq %r10 +jnz chacha_blocks_avx_copyinput +movq %rsp, %rsi +chacha_blocks_avx_noinput2: +movq %rsp, %rdx +chacha_blocks_avx_above63: +vmovdqa %xmm8, %xmm0 +vmovdqa %xmm9, %xmm1 +vmovdqa %xmm10, %xmm2 +vmovdqa %xmm11, %xmm3 +movq 64(%rsp), %rax +chacha_blocks_avx_mainloop2: +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm6, %xmm3, %xmm3 +vpaddd %xmm2, %xmm3, %xmm2 +vpxor %xmm1, %xmm2, %xmm1 +vpslld $12, %xmm1, %xmm4 +vpsrld $20, %xmm1, %xmm1 +vpxor %xmm1, %xmm4, %xmm1 +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm7, %xmm3, %xmm3 +vpshufd $0x93, %xmm0, %xmm0 +vpaddd %xmm2, %xmm3, %xmm2 +vpshufd $0x4e, %xmm3, %xmm3 +vpxor %xmm1, %xmm2, %xmm1 +vpshufd $0x39, %xmm2, %xmm2 +vpslld $7, %xmm1, %xmm4 +vpsrld $25, %xmm1, %xmm1 +vpxor %xmm1, %xmm4, %xmm1 +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm6, %xmm3, %xmm3 +vpaddd %xmm2, %xmm3, %xmm2 +vpxor %xmm1, %xmm2, %xmm1 +vpslld $12, %xmm1, %xmm4 +vpsrld $20, %xmm1, %xmm1 +vpxor %xmm1, %xmm4, %xmm1 +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm7, %xmm3, %xmm3 +vpshufd $0x39, %xmm0, %xmm0 +vpaddd %xmm2, %xmm3, %xmm2 +vpshufd $0x4e, %xmm3, %xmm3 +vpxor %xmm1, %xmm2, %xmm1 +vpshufd $0x93, %xmm2, %xmm2 +vpslld $7, %xmm1, %xmm4 +vpsrld $25, %xmm1, %xmm1 +vpxor %xmm1, %xmm4, %xmm1 +subq $2, %rax +jnz chacha_blocks_avx_mainloop2 +vpaddd %xmm0, %xmm8, %xmm0 +vpaddd %xmm1, %xmm9, %xmm1 +vpaddd %xmm2, %xmm10, %xmm2 +vpaddd %xmm3, %xmm11, %xmm3 +andq %rsi, %rsi +jz chacha_blocks_avx_noinput3 +vpxor 0(%rsi), %xmm0, %xmm0 +vpxor 16(%rsi), %xmm1, %xmm1 +vpxor 32(%rsi), %xmm2, %xmm2 +vpxor 48(%rsi), %xmm3, %xmm3 +addq $64, %rsi +chacha_blocks_avx_noinput3: +vmovdqu %xmm0, 0(%rdx) +vmovdqu %xmm1, 16(%rdx) +vmovdqu %xmm2, 32(%rdx) +vmovdqu %xmm3, 48(%rdx) +vpaddq %xmm11, %xmm5, %xmm11 +cmpq $64, %rcx +jbe chacha_blocks_avx_mainloop2_finishup +addq $64, %rdx +subq $64, %rcx +jmp chacha_blocks_avx_below256 +chacha_blocks_avx_mainloop2_finishup: +cmpq $64, %rcx +je chacha_blocks_avx_done +addq %rcx, %r9 +addq %rcx, %rdx +negq %rcx +chacha_blocks_avx_copyoutput: +movb (%rdx, %rcx), %al +movb %al, (%r9, %rcx) +incq %rcx +jnz chacha_blocks_avx_copyoutput +chacha_blocks_avx_done: +vmovdqu %xmm11, 32(%rdi) +movq %rbp, %rsp +popq %rbp +popq %rbx +ret +FN_END chacha_blocks_avx + +GLOBAL_HIDDEN_FN hchacha_avx +hchacha_avx_local: +LOAD_VAR_PIC chacha_constants, %rax +vmovdqa 0(%rax), %xmm0 +vmovdqa 16(%rax), %xmm6 +vmovdqa 32(%rax), %xmm5 +vmovdqu 0(%rdi), %xmm1 +vmovdqu 16(%rdi), %xmm2 +vmovdqu 0(%rsi), %xmm3 +hhacha_mainloop_avx: +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm6, %xmm3, %xmm3 +vpaddd %xmm2, %xmm3, %xmm2 +vpxor %xmm1, %xmm2, %xmm1 +vpslld $12, %xmm1, %xmm4 +vpsrld $20, %xmm1, %xmm1 +vpxor %xmm1, %xmm4, %xmm1 +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm5, %xmm3, %xmm3 +vpaddd %xmm2, %xmm3, %xmm2 +vpxor 
%xmm1, %xmm2, %xmm1 +vpslld $7, %xmm1, %xmm4 +vpsrld $25, %xmm1, %xmm1 +vpshufd $0x93, %xmm0, %xmm0 +vpxor %xmm1, %xmm4, %xmm1 +vpshufd $0x4e, %xmm3, %xmm3 +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm6, %xmm3, %xmm3 +vpshufd $0x39, %xmm2, %xmm2 +vpaddd %xmm2, %xmm3, %xmm2 +vpxor %xmm1, %xmm2, %xmm1 +vpslld $12, %xmm1, %xmm4 +vpsrld $20, %xmm1, %xmm1 +vpxor %xmm1, %xmm4, %xmm1 +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm5, %xmm3, %xmm3 +vpaddd %xmm2, %xmm3, %xmm2 +vpxor %xmm1, %xmm2, %xmm1 +vpshufd $0x39, %xmm0, %xmm0 +vpslld $7, %xmm1, %xmm4 +vpshufd $0x4e, %xmm3, %xmm3 +vpsrld $25, %xmm1, %xmm1 +vpshufd $0x93, %xmm2, %xmm2 +vpxor %xmm1, %xmm4, %xmm1 +subl $2, %ecx +jne hhacha_mainloop_avx +vmovdqu %xmm0, (%rdx) +vmovdqu %xmm3, 16(%rdx) +ret +FN_END hchacha_avx + +GLOBAL_HIDDEN_FN_EXT chacha_avx, 6, 16 +pushq %rbp +movq %rsp, %rbp +subq $64, %rsp +andq $~63, %rsp +vmovdqu 0(%rdi), %xmm0 +vmovdqu 16(%rdi), %xmm1 +vmovdqa %xmm0, 0(%rsp) +vmovdqa %xmm1, 16(%rsp) +xorq %rdi, %rdi +movq %rdi, 32(%rsp) +movq 0(%rsi), %rsi +movq %rsi, 40(%rsp) +movq %r9, 48(%rsp) +movq %rsp, %rdi +movq %rdx, %rsi +movq %rcx, %rdx +movq %r8, %rcx +call chacha_blocks_avx_local +vpxor %xmm0, %xmm0, %xmm0 +vmovdqa %xmm0, 0(%rsp) +vmovdqa %xmm0, 16(%rsp) +vmovdqa %xmm0, 32(%rsp) +movq %rbp, %rsp +popq %rbp +ret +FN_END chacha_avx + +GLOBAL_HIDDEN_FN_EXT xchacha_avx, 6, 16 +pushq %rbp +pushq %rbx +movq %rsp, %rbp +subq $64, %rsp +andq $~63, %rsp +movq %rsp, %rbx +xorq %rax, %rax +movq %rax, 32(%rbx) +movq 16(%rsi), %rax +movq %rax, 40(%rbx) +movq %r9, 48(%rbx) +pushq %rdx +pushq %rcx +pushq %r8 +movq %rbx, %rdx +movq %r9, %rcx +call hchacha_avx_local +movq %rbx, %rdi +popq %rcx +popq %rdx +popq %rsi +call chacha_blocks_avx_local +vpxor %xmm0, %xmm0, %xmm0 +vmovdqa %xmm0, 0(%rbx) +vmovdqa %xmm0, 16(%rbx) +vmovdqa %xmm0, 32(%rbx) +movq %rbp, %rsp +popq %rbx +popq %rbp +ret +FN_END xchacha_avx diff --git a/src/libcryptobox/chacha20/avx2.S b/src/libcryptobox/chacha20/avx2.S new file mode 100644 index 0000000..efd0f54 --- /dev/null +++ b/src/libcryptobox/chacha20/avx2.S @@ -0,0 +1,1018 @@ +#include "../macro.S" +#include "constants.S" +SECTION_TEXT + +GLOBAL_HIDDEN_FN chacha_blocks_avx2 +chacha_blocks_avx2_local: +pushq %rbx +pushq %rbp +pushq %r12 +pushq %r13 +pushq %r14 +movq %rsp, %rbp +andq $~63, %rsp +subq $512, %rsp +LOAD_VAR_PIC chacha_constants, %rax +vmovdqa 0(%rax), %xmm8 +vmovdqa 16(%rax), %xmm6 +vmovdqa 32(%rax), %xmm7 +vmovdqu 0(%rdi), %xmm9 +vmovdqu 16(%rdi), %xmm10 +vmovdqu 32(%rdi), %xmm11 +movq 48(%rdi), %rax +movq $1, %r9 +vmovdqa %xmm8, 0(%rsp) +vmovdqa %xmm9, 16(%rsp) +vmovdqa %xmm10, 32(%rsp) +vmovdqa %xmm11, 48(%rsp) +movq %rax, 64(%rsp) +vmovdqa %xmm6, 448(%rsp) +vmovdqa %xmm6, 464(%rsp) +vmovdqa %xmm7, 480(%rsp) +vmovdqa %xmm7, 496(%rsp) +cmpq $512, %rcx +jae chacha_blocks_avx2_atleast512 +cmp $256, %rcx +jae chacha_blocks_avx2_atleast256 +jmp chacha_blocks_avx2_below256 +.p2align 6,,63 +chacha_blocks_avx2_atleast512: +movq 48(%rsp), %rax +leaq 1(%rax), %r8 +leaq 2(%rax), %r9 +leaq 3(%rax), %r10 +leaq 4(%rax), %rbx +leaq 5(%rax), %r11 +leaq 6(%rax), %r12 +leaq 7(%rax), %r13 +leaq 8(%rax), %r14 +movl %eax, 128(%rsp) +movl %r8d, 4+128(%rsp) +movl %r9d, 8+128(%rsp) +movl %r10d, 12+128(%rsp) +movl %ebx, 16+128(%rsp) +movl %r11d, 20+128(%rsp) +movl %r12d, 24+128(%rsp) +movl %r13d, 28+128(%rsp) +shrq $32, %rax +shrq $32, %r8 +shrq $32, %r9 +shrq $32, %r10 +shrq $32, %rbx +shrq $32, %r11 +shrq $32, %r12 +shrq $32, %r13 +movl %eax, 160(%rsp) +movl %r8d, 
4+160(%rsp) +movl %r9d, 8+160(%rsp) +movl %r10d, 12+160(%rsp) +movl %ebx, 16+160(%rsp) +movl %r11d, 20+160(%rsp) +movl %r12d, 24+160(%rsp) +movl %r13d, 28+160(%rsp) +movq %r14, 48(%rsp) +movq 64(%rsp), %rax +vpbroadcastd 0(%rsp), %ymm0 +vpbroadcastd 4+0(%rsp), %ymm1 +vpbroadcastd 8+0(%rsp), %ymm2 +vpbroadcastd 12+0(%rsp), %ymm3 +vpbroadcastd 16(%rsp), %ymm4 +vpbroadcastd 4+16(%rsp), %ymm5 +vpbroadcastd 8+16(%rsp), %ymm6 +vpbroadcastd 12+16(%rsp), %ymm7 +vpbroadcastd 32(%rsp), %ymm8 +vpbroadcastd 4+32(%rsp), %ymm9 +vpbroadcastd 8+32(%rsp), %ymm10 +vpbroadcastd 12+32(%rsp), %ymm11 +vpbroadcastd 8+48(%rsp), %ymm14 +vpbroadcastd 12+48(%rsp), %ymm15 +vmovdqa 128(%rsp), %ymm12 +vmovdqa 160(%rsp), %ymm13 +chacha_blocks_avx2_mainloop1: +vpaddd %ymm0, %ymm4, %ymm0 +vpaddd %ymm1, %ymm5, %ymm1 +vpxor %ymm12, %ymm0, %ymm12 +vpxor %ymm13, %ymm1, %ymm13 +vpaddd %ymm2, %ymm6, %ymm2 +vpaddd %ymm3, %ymm7, %ymm3 +vpxor %ymm14, %ymm2, %ymm14 +vpxor %ymm15, %ymm3, %ymm15 +vpshufb 448(%rsp), %ymm12, %ymm12 +vpshufb 448(%rsp), %ymm13, %ymm13 +vpaddd %ymm8, %ymm12, %ymm8 +vpaddd %ymm9, %ymm13, %ymm9 +vpshufb 448(%rsp), %ymm14, %ymm14 +vpshufb 448(%rsp), %ymm15, %ymm15 +vpaddd %ymm10, %ymm14, %ymm10 +vpaddd %ymm11, %ymm15, %ymm11 +vmovdqa %ymm12, 96(%rsp) +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm5, %ymm9, %ymm5 +vpslld $ 12, %ymm4, %ymm12 +vpsrld $20, %ymm4, %ymm4 +vpxor %ymm4, %ymm12, %ymm4 +vpslld $ 12, %ymm5, %ymm12 +vpsrld $20, %ymm5, %ymm5 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm6, %ymm10, %ymm6 +vpxor %ymm7, %ymm11, %ymm7 +vpslld $ 12, %ymm6, %ymm12 +vpsrld $20, %ymm6, %ymm6 +vpxor %ymm6, %ymm12, %ymm6 +vpslld $ 12, %ymm7, %ymm12 +vpsrld $20, %ymm7, %ymm7 +vpxor %ymm7, %ymm12, %ymm7 +vpaddd %ymm0, %ymm4, %ymm0 +vpaddd %ymm1, %ymm5, %ymm1 +vpxor 96(%rsp), %ymm0, %ymm12 +vpxor %ymm13, %ymm1, %ymm13 +vpaddd %ymm2, %ymm6, %ymm2 +vpaddd %ymm3, %ymm7, %ymm3 +vpxor %ymm14, %ymm2, %ymm14 +vpxor %ymm15, %ymm3, %ymm15 +vpshufb 480(%rsp), %ymm12, %ymm12 +vpshufb 480(%rsp), %ymm13, %ymm13 +vpaddd %ymm8, %ymm12, %ymm8 +vpaddd %ymm9, %ymm13, %ymm9 +vpshufb 480(%rsp), %ymm14, %ymm14 +vpshufb 480(%rsp), %ymm15, %ymm15 +vpaddd %ymm10, %ymm14, %ymm10 +vpaddd %ymm11, %ymm15, %ymm11 +vmovdqa %ymm12, 96(%rsp) +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm5, %ymm9, %ymm5 +vpslld $ 7, %ymm4, %ymm12 +vpsrld $25, %ymm4, %ymm4 +vpxor %ymm4, %ymm12, %ymm4 +vpslld $ 7, %ymm5, %ymm12 +vpsrld $25, %ymm5, %ymm5 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm6, %ymm10, %ymm6 +vpxor %ymm7, %ymm11, %ymm7 +vpslld $ 7, %ymm6, %ymm12 +vpsrld $25, %ymm6, %ymm6 +vpxor %ymm6, %ymm12, %ymm6 +vpslld $ 7, %ymm7, %ymm12 +vpsrld $25, %ymm7, %ymm7 +vpxor %ymm7, %ymm12, %ymm7 +vpaddd %ymm0, %ymm5, %ymm0 +vpaddd %ymm1, %ymm6, %ymm1 +vpxor %ymm15, %ymm0, %ymm15 +vpxor 96(%rsp), %ymm1, %ymm12 +vpaddd %ymm2, %ymm7, %ymm2 +vpaddd %ymm3, %ymm4, %ymm3 +vpxor %ymm13, %ymm2, %ymm13 +vpxor %ymm14, %ymm3, %ymm14 +vpshufb 448(%rsp), %ymm15, %ymm15 +vpshufb 448(%rsp), %ymm12, %ymm12 +vpaddd %ymm10, %ymm15, %ymm10 +vpaddd %ymm11, %ymm12, %ymm11 +vpshufb 448(%rsp), %ymm13, %ymm13 +vpshufb 448(%rsp), %ymm14, %ymm14 +vpaddd %ymm8, %ymm13, %ymm8 +vpaddd %ymm9, %ymm14, %ymm9 +vmovdqa %ymm15, 96(%rsp) +vpxor %ymm5, %ymm10, %ymm5 +vpxor %ymm6, %ymm11, %ymm6 +vpslld $ 12, %ymm5, %ymm15 +vpsrld $20, %ymm5, %ymm5 +vpxor %ymm5, %ymm15, %ymm5 +vpslld $ 12, %ymm6, %ymm15 +vpsrld $20, %ymm6, %ymm6 +vpxor %ymm6, %ymm15, %ymm6 +vpxor %ymm7, %ymm8, %ymm7 +vpxor %ymm4, %ymm9, %ymm4 +vpslld $ 12, %ymm7, %ymm15 +vpsrld $20, %ymm7, %ymm7 +vpxor %ymm7, %ymm15, %ymm7 +vpslld $ 12, %ymm4, %ymm15 +vpsrld $20, 
%ymm4, %ymm4 +vpxor %ymm4, %ymm15, %ymm4 +vpaddd %ymm0, %ymm5, %ymm0 +vpaddd %ymm1, %ymm6, %ymm1 +vpxor 96(%rsp), %ymm0, %ymm15 +vpxor %ymm12, %ymm1, %ymm12 +vpaddd %ymm2, %ymm7, %ymm2 +vpaddd %ymm3, %ymm4, %ymm3 +vpxor %ymm13, %ymm2, %ymm13 +vpxor %ymm14, %ymm3, %ymm14 +vpshufb 480(%rsp), %ymm15, %ymm15 +vpshufb 480(%rsp), %ymm12, %ymm12 +vpaddd %ymm10, %ymm15, %ymm10 +vpaddd %ymm11, %ymm12, %ymm11 +vpshufb 480(%rsp), %ymm13, %ymm13 +vpshufb 480(%rsp), %ymm14, %ymm14 +vpaddd %ymm8, %ymm13, %ymm8 +vpaddd %ymm9, %ymm14, %ymm9 +vmovdqa %ymm15, 96(%rsp) +vpxor %ymm5, %ymm10, %ymm5 +vpxor %ymm6, %ymm11, %ymm6 +vpslld $ 7, %ymm5, %ymm15 +vpsrld $25, %ymm5, %ymm5 +vpxor %ymm5, %ymm15, %ymm5 +vpslld $ 7, %ymm6, %ymm15 +vpsrld $25, %ymm6, %ymm6 +vpxor %ymm6, %ymm15, %ymm6 +vpxor %ymm7, %ymm8, %ymm7 +vpxor %ymm4, %ymm9, %ymm4 +vpslld $ 7, %ymm7, %ymm15 +vpsrld $25, %ymm7, %ymm7 +vpxor %ymm7, %ymm15, %ymm7 +vpslld $ 7, %ymm4, %ymm15 +vpsrld $25, %ymm4, %ymm4 +vpxor %ymm4, %ymm15, %ymm4 +vmovdqa 96(%rsp), %ymm15 +subq $2, %rax +jnz chacha_blocks_avx2_mainloop1 +vmovdqa %ymm8, 192(%rsp) +vmovdqa %ymm9, 224(%rsp) +vmovdqa %ymm10, 256(%rsp) +vmovdqa %ymm11, 288(%rsp) +vmovdqa %ymm12, 320(%rsp) +vmovdqa %ymm13, 352(%rsp) +vmovdqa %ymm14, 384(%rsp) +vmovdqa %ymm15, 416(%rsp) +vpbroadcastd 0(%rsp), %ymm8 +vpbroadcastd 4+0(%rsp), %ymm9 +vpbroadcastd 8+0(%rsp), %ymm10 +vpbroadcastd 12+0(%rsp), %ymm11 +vpbroadcastd 16(%rsp), %ymm12 +vpbroadcastd 4+16(%rsp), %ymm13 +vpbroadcastd 8+16(%rsp), %ymm14 +vpbroadcastd 12+16(%rsp), %ymm15 +vpaddd %ymm8, %ymm0, %ymm0 +vpaddd %ymm9, %ymm1, %ymm1 +vpaddd %ymm10, %ymm2, %ymm2 +vpaddd %ymm11, %ymm3, %ymm3 +vpaddd %ymm12, %ymm4, %ymm4 +vpaddd %ymm13, %ymm5, %ymm5 +vpaddd %ymm14, %ymm6, %ymm6 +vpaddd %ymm15, %ymm7, %ymm7 +vpunpckldq %ymm1, %ymm0, %ymm8 +vpunpckldq %ymm3, %ymm2, %ymm9 +vpunpckhdq %ymm1, %ymm0, %ymm12 +vpunpckhdq %ymm3, %ymm2, %ymm13 +vpunpckldq %ymm5, %ymm4, %ymm10 +vpunpckldq %ymm7, %ymm6, %ymm11 +vpunpckhdq %ymm5, %ymm4, %ymm14 +vpunpckhdq %ymm7, %ymm6, %ymm15 +vpunpcklqdq %ymm9, %ymm8, %ymm0 +vpunpcklqdq %ymm11, %ymm10, %ymm1 +vpunpckhqdq %ymm9, %ymm8, %ymm2 +vpunpckhqdq %ymm11, %ymm10, %ymm3 +vpunpcklqdq %ymm13, %ymm12, %ymm4 +vpunpcklqdq %ymm15, %ymm14, %ymm5 +vpunpckhqdq %ymm13, %ymm12, %ymm6 +vpunpckhqdq %ymm15, %ymm14, %ymm7 +vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 +vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 +vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 +vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 +vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 +vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 +vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 +vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 +andq %rsi, %rsi +jz chacha_blocks_avx2_noinput1 +vpxor 0(%rsi), %ymm8, %ymm8 +vpxor 64(%rsi), %ymm9, %ymm9 +vpxor 128(%rsi), %ymm10, %ymm10 +vpxor 192(%rsi), %ymm11, %ymm11 +vpxor 256(%rsi), %ymm12, %ymm12 +vpxor 320(%rsi), %ymm13, %ymm13 +vpxor 384(%rsi), %ymm14, %ymm14 +vpxor 448(%rsi), %ymm15, %ymm15 +vmovdqu %ymm8, 0(%rdx) +vmovdqu %ymm9, 64(%rdx) +vmovdqu %ymm10, 128(%rdx) +vmovdqu %ymm11, 192(%rdx) +vmovdqu %ymm12, 256(%rdx) +vmovdqu %ymm13, 320(%rdx) +vmovdqu %ymm14, 384(%rdx) +vmovdqu %ymm15, 448(%rdx) +vmovdqa 192(%rsp), %ymm0 +vmovdqa 224(%rsp), %ymm1 +vmovdqa 256(%rsp), %ymm2 +vmovdqa 288(%rsp), %ymm3 +vmovdqa 320(%rsp), %ymm4 +vmovdqa 352(%rsp), %ymm5 +vmovdqa 384(%rsp), %ymm6 +vmovdqa 416(%rsp), %ymm7 +vpbroadcastd 32(%rsp), %ymm8 +vpbroadcastd 4+32(%rsp), %ymm9 +vpbroadcastd 8+32(%rsp), %ymm10 +vpbroadcastd 12+32(%rsp), %ymm11 +vmovdqa 128(%rsp), %ymm12 +vmovdqa 160(%rsp), %ymm13 +vpbroadcastd 
8+48(%rsp), %ymm14 +vpbroadcastd 12+48(%rsp), %ymm15 +vpaddd %ymm8, %ymm0, %ymm0 +vpaddd %ymm9, %ymm1, %ymm1 +vpaddd %ymm10, %ymm2, %ymm2 +vpaddd %ymm11, %ymm3, %ymm3 +vpaddd %ymm12, %ymm4, %ymm4 +vpaddd %ymm13, %ymm5, %ymm5 +vpaddd %ymm14, %ymm6, %ymm6 +vpaddd %ymm15, %ymm7, %ymm7 +vpunpckldq %ymm1, %ymm0, %ymm8 +vpunpckldq %ymm3, %ymm2, %ymm9 +vpunpckhdq %ymm1, %ymm0, %ymm12 +vpunpckhdq %ymm3, %ymm2, %ymm13 +vpunpckldq %ymm5, %ymm4, %ymm10 +vpunpckldq %ymm7, %ymm6, %ymm11 +vpunpckhdq %ymm5, %ymm4, %ymm14 +vpunpckhdq %ymm7, %ymm6, %ymm15 +vpunpcklqdq %ymm9, %ymm8, %ymm0 +vpunpcklqdq %ymm11, %ymm10, %ymm1 +vpunpckhqdq %ymm9, %ymm8, %ymm2 +vpunpckhqdq %ymm11, %ymm10, %ymm3 +vpunpcklqdq %ymm13, %ymm12, %ymm4 +vpunpcklqdq %ymm15, %ymm14, %ymm5 +vpunpckhqdq %ymm13, %ymm12, %ymm6 +vpunpckhqdq %ymm15, %ymm14, %ymm7 +vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 +vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 +vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 +vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 +vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 +vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 +vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 +vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 +vpxor 32(%rsi), %ymm8, %ymm8 +vpxor 96(%rsi), %ymm9, %ymm9 +vpxor 160(%rsi), %ymm10, %ymm10 +vpxor 224(%rsi), %ymm11, %ymm11 +vpxor 288(%rsi), %ymm12, %ymm12 +vpxor 352(%rsi), %ymm13, %ymm13 +vpxor 416(%rsi), %ymm14, %ymm14 +vpxor 480(%rsi), %ymm15, %ymm15 +vmovdqu %ymm8, 32(%rdx) +vmovdqu %ymm9, 96(%rdx) +vmovdqu %ymm10, 160(%rdx) +vmovdqu %ymm11, 224(%rdx) +vmovdqu %ymm12, 288(%rdx) +vmovdqu %ymm13, 352(%rdx) +vmovdqu %ymm14, 416(%rdx) +vmovdqu %ymm15, 480(%rdx) +addq $512, %rsi +jmp chacha_blocks_avx2_mainloop1_cont +chacha_blocks_avx2_noinput1: +vmovdqu %ymm8, 0(%rdx) +vmovdqu %ymm9, 64(%rdx) +vmovdqu %ymm10, 128(%rdx) +vmovdqu %ymm11, 192(%rdx) +vmovdqu %ymm12, 256(%rdx) +vmovdqu %ymm13, 320(%rdx) +vmovdqu %ymm14, 384(%rdx) +vmovdqu %ymm15, 448(%rdx) +vmovdqa 192(%rsp), %ymm0 +vmovdqa 224(%rsp), %ymm1 +vmovdqa 256(%rsp), %ymm2 +vmovdqa 288(%rsp), %ymm3 +vmovdqa 320(%rsp), %ymm4 +vmovdqa 352(%rsp), %ymm5 +vmovdqa 384(%rsp), %ymm6 +vmovdqa 416(%rsp), %ymm7 +vpbroadcastd 32(%rsp), %ymm8 +vpbroadcastd 4+32(%rsp), %ymm9 +vpbroadcastd 8+32(%rsp), %ymm10 +vpbroadcastd 12+32(%rsp), %ymm11 +vmovdqa 128(%rsp), %ymm12 +vmovdqa 160(%rsp), %ymm13 +vpbroadcastd 8+48(%rsp), %ymm14 +vpbroadcastd 12+48(%rsp), %ymm15 +vpaddd %ymm8, %ymm0, %ymm0 +vpaddd %ymm9, %ymm1, %ymm1 +vpaddd %ymm10, %ymm2, %ymm2 +vpaddd %ymm11, %ymm3, %ymm3 +vpaddd %ymm12, %ymm4, %ymm4 +vpaddd %ymm13, %ymm5, %ymm5 +vpaddd %ymm14, %ymm6, %ymm6 +vpaddd %ymm15, %ymm7, %ymm7 +vpunpckldq %ymm1, %ymm0, %ymm8 +vpunpckldq %ymm3, %ymm2, %ymm9 +vpunpckhdq %ymm1, %ymm0, %ymm12 +vpunpckhdq %ymm3, %ymm2, %ymm13 +vpunpckldq %ymm5, %ymm4, %ymm10 +vpunpckldq %ymm7, %ymm6, %ymm11 +vpunpckhdq %ymm5, %ymm4, %ymm14 +vpunpckhdq %ymm7, %ymm6, %ymm15 +vpunpcklqdq %ymm9, %ymm8, %ymm0 +vpunpcklqdq %ymm11, %ymm10, %ymm1 +vpunpckhqdq %ymm9, %ymm8, %ymm2 +vpunpckhqdq %ymm11, %ymm10, %ymm3 +vpunpcklqdq %ymm13, %ymm12, %ymm4 +vpunpcklqdq %ymm15, %ymm14, %ymm5 +vpunpckhqdq %ymm13, %ymm12, %ymm6 +vpunpckhqdq %ymm15, %ymm14, %ymm7 +vperm2i128 $0x20, %ymm1, %ymm0, %ymm8 +vperm2i128 $0x20, %ymm3, %ymm2, %ymm9 +vperm2i128 $0x31, %ymm1, %ymm0, %ymm12 +vperm2i128 $0x31, %ymm3, %ymm2, %ymm13 +vperm2i128 $0x20, %ymm5, %ymm4, %ymm10 +vperm2i128 $0x20, %ymm7, %ymm6, %ymm11 +vperm2i128 $0x31, %ymm5, %ymm4, %ymm14 +vperm2i128 $0x31, %ymm7, %ymm6, %ymm15 +vmovdqu %ymm8, 32(%rdx) +vmovdqu %ymm9, 96(%rdx) +vmovdqu %ymm10, 160(%rdx) +vmovdqu %ymm11, 224(%rdx) 
+vmovdqu %ymm12, 288(%rdx) +vmovdqu %ymm13, 352(%rdx) +vmovdqu %ymm14, 416(%rdx) +vmovdqu %ymm15, 480(%rdx) +chacha_blocks_avx2_mainloop1_cont: +addq $512, %rdx +subq $512, %rcx +cmp $512, %rcx +jae chacha_blocks_avx2_atleast512 +cmp $256, %rcx +jb chacha_blocks_avx2_below256_fixup +chacha_blocks_avx2_atleast256: +movq 48(%rsp), %rax +leaq 1(%rax), %r8 +leaq 2(%rax), %r9 +leaq 3(%rax), %r10 +leaq 4(%rax), %rbx +movl %eax, 128(%rsp) +movl %r8d, 4+128(%rsp) +movl %r9d, 8+128(%rsp) +movl %r10d, 12+128(%rsp) +shrq $32, %rax +shrq $32, %r8 +shrq $32, %r9 +shrq $32, %r10 +movl %eax, 160(%rsp) +movl %r8d, 4+160(%rsp) +movl %r9d, 8+160(%rsp) +movl %r10d, 12+160(%rsp) +movq %rbx, 48(%rsp) +movq 64(%rsp), %rax +vpbroadcastd 0(%rsp), %xmm0 +vpbroadcastd 4+0(%rsp), %xmm1 +vpbroadcastd 8+0(%rsp), %xmm2 +vpbroadcastd 12+0(%rsp), %xmm3 +vpbroadcastd 16(%rsp), %xmm4 +vpbroadcastd 4+16(%rsp), %xmm5 +vpbroadcastd 8+16(%rsp), %xmm6 +vpbroadcastd 12+16(%rsp), %xmm7 +vpbroadcastd 32(%rsp), %xmm8 +vpbroadcastd 4+32(%rsp), %xmm9 +vpbroadcastd 8+32(%rsp), %xmm10 +vpbroadcastd 12+32(%rsp), %xmm11 +vmovdqa 128(%rsp), %xmm12 +vmovdqa 160(%rsp), %xmm13 +vpbroadcastd 8+48(%rsp), %xmm14 +vpbroadcastd 12+48(%rsp), %xmm15 +chacha_blocks_avx2_mainloop2: +vpaddd %xmm0, %xmm4, %xmm0 +vpaddd %xmm1, %xmm5, %xmm1 +vpxor %xmm12, %xmm0, %xmm12 +vpxor %xmm13, %xmm1, %xmm13 +vpaddd %xmm2, %xmm6, %xmm2 +vpaddd %xmm3, %xmm7, %xmm3 +vpxor %xmm14, %xmm2, %xmm14 +vpxor %xmm15, %xmm3, %xmm15 +vpshufb 448(%rsp), %xmm12, %xmm12 +vpshufb 448(%rsp), %xmm13, %xmm13 +vpaddd %xmm8, %xmm12, %xmm8 +vpaddd %xmm9, %xmm13, %xmm9 +vpshufb 448(%rsp), %xmm14, %xmm14 +vpshufb 448(%rsp), %xmm15, %xmm15 +vpaddd %xmm10, %xmm14, %xmm10 +vpaddd %xmm11, %xmm15, %xmm11 +vmovdqa %xmm12, 96(%rsp) +vpxor %xmm4, %xmm8, %xmm4 +vpxor %xmm5, %xmm9, %xmm5 +vpslld $ 12, %xmm4, %xmm12 +vpsrld $20, %xmm4, %xmm4 +vpxor %xmm4, %xmm12, %xmm4 +vpslld $ 12, %xmm5, %xmm12 +vpsrld $20, %xmm5, %xmm5 +vpxor %xmm5, %xmm12, %xmm5 +vpxor %xmm6, %xmm10, %xmm6 +vpxor %xmm7, %xmm11, %xmm7 +vpslld $ 12, %xmm6, %xmm12 +vpsrld $20, %xmm6, %xmm6 +vpxor %xmm6, %xmm12, %xmm6 +vpslld $ 12, %xmm7, %xmm12 +vpsrld $20, %xmm7, %xmm7 +vpxor %xmm7, %xmm12, %xmm7 +vpaddd %xmm0, %xmm4, %xmm0 +vpaddd %xmm1, %xmm5, %xmm1 +vpxor 96(%rsp), %xmm0, %xmm12 +vpxor %xmm13, %xmm1, %xmm13 +vpaddd %xmm2, %xmm6, %xmm2 +vpaddd %xmm3, %xmm7, %xmm3 +vpxor %xmm14, %xmm2, %xmm14 +vpxor %xmm15, %xmm3, %xmm15 +vpshufb 480(%rsp), %xmm12, %xmm12 +vpshufb 480(%rsp), %xmm13, %xmm13 +vpaddd %xmm8, %xmm12, %xmm8 +vpaddd %xmm9, %xmm13, %xmm9 +vpshufb 480(%rsp), %xmm14, %xmm14 +vpshufb 480(%rsp), %xmm15, %xmm15 +vpaddd %xmm10, %xmm14, %xmm10 +vpaddd %xmm11, %xmm15, %xmm11 +vmovdqa %xmm12, 96(%rsp) +vpxor %xmm4, %xmm8, %xmm4 +vpxor %xmm5, %xmm9, %xmm5 +vpslld $ 7, %xmm4, %xmm12 +vpsrld $25, %xmm4, %xmm4 +vpxor %xmm4, %xmm12, %xmm4 +vpslld $ 7, %xmm5, %xmm12 +vpsrld $25, %xmm5, %xmm5 +vpxor %xmm5, %xmm12, %xmm5 +vpxor %xmm6, %xmm10, %xmm6 +vpxor %xmm7, %xmm11, %xmm7 +vpslld $ 7, %xmm6, %xmm12 +vpsrld $25, %xmm6, %xmm6 +vpxor %xmm6, %xmm12, %xmm6 +vpslld $ 7, %xmm7, %xmm12 +vpsrld $25, %xmm7, %xmm7 +vpxor %xmm7, %xmm12, %xmm7 +vpaddd %xmm0, %xmm5, %xmm0 +vpaddd %xmm1, %xmm6, %xmm1 +vpxor %xmm15, %xmm0, %xmm15 +vpxor 96(%rsp), %xmm1, %xmm12 +vpaddd %xmm2, %xmm7, %xmm2 +vpaddd %xmm3, %xmm4, %xmm3 +vpxor %xmm13, %xmm2, %xmm13 +vpxor %xmm14, %xmm3, %xmm14 +vpshufb 448(%rsp), %xmm15, %xmm15 +vpshufb 448(%rsp), %xmm12, %xmm12 +vpaddd %xmm10, %xmm15, %xmm10 +vpaddd %xmm11, %xmm12, %xmm11 +vpshufb 448(%rsp), %xmm13, %xmm13 +vpshufb 
448(%rsp), %xmm14, %xmm14 +vpaddd %xmm8, %xmm13, %xmm8 +vpaddd %xmm9, %xmm14, %xmm9 +vmovdqa %xmm15, 96(%rsp) +vpxor %xmm5, %xmm10, %xmm5 +vpxor %xmm6, %xmm11, %xmm6 +vpslld $ 12, %xmm5, %xmm15 +vpsrld $20, %xmm5, %xmm5 +vpxor %xmm5, %xmm15, %xmm5 +vpslld $ 12, %xmm6, %xmm15 +vpsrld $20, %xmm6, %xmm6 +vpxor %xmm6, %xmm15, %xmm6 +vpxor %xmm7, %xmm8, %xmm7 +vpxor %xmm4, %xmm9, %xmm4 +vpslld $ 12, %xmm7, %xmm15 +vpsrld $20, %xmm7, %xmm7 +vpxor %xmm7, %xmm15, %xmm7 +vpslld $ 12, %xmm4, %xmm15 +vpsrld $20, %xmm4, %xmm4 +vpxor %xmm4, %xmm15, %xmm4 +vpaddd %xmm0, %xmm5, %xmm0 +vpaddd %xmm1, %xmm6, %xmm1 +vpxor 96(%rsp), %xmm0, %xmm15 +vpxor %xmm12, %xmm1, %xmm12 +vpaddd %xmm2, %xmm7, %xmm2 +vpaddd %xmm3, %xmm4, %xmm3 +vpxor %xmm13, %xmm2, %xmm13 +vpxor %xmm14, %xmm3, %xmm14 +vpshufb 480(%rsp), %xmm15, %xmm15 +vpshufb 480(%rsp), %xmm12, %xmm12 +vpaddd %xmm10, %xmm15, %xmm10 +vpaddd %xmm11, %xmm12, %xmm11 +vpshufb 480(%rsp), %xmm13, %xmm13 +vpshufb 480(%rsp), %xmm14, %xmm14 +vpaddd %xmm8, %xmm13, %xmm8 +vpaddd %xmm9, %xmm14, %xmm9 +vmovdqa %xmm15, 96(%rsp) +vpxor %xmm5, %xmm10, %xmm5 +vpxor %xmm6, %xmm11, %xmm6 +vpslld $ 7, %xmm5, %xmm15 +vpsrld $25, %xmm5, %xmm5 +vpxor %xmm5, %xmm15, %xmm5 +vpslld $ 7, %xmm6, %xmm15 +vpsrld $25, %xmm6, %xmm6 +vpxor %xmm6, %xmm15, %xmm6 +vpxor %xmm7, %xmm8, %xmm7 +vpxor %xmm4, %xmm9, %xmm4 +vpslld $ 7, %xmm7, %xmm15 +vpsrld $25, %xmm7, %xmm7 +vpxor %xmm7, %xmm15, %xmm7 +vpslld $ 7, %xmm4, %xmm15 +vpsrld $25, %xmm4, %xmm4 +vpxor %xmm4, %xmm15, %xmm4 +vmovdqa 96(%rsp), %xmm15 +subq $2, %rax +jnz chacha_blocks_avx2_mainloop2 +vmovdqa %xmm8, 192(%rsp) +vmovdqa %xmm9, 208(%rsp) +vmovdqa %xmm10, 224(%rsp) +vmovdqa %xmm11, 240(%rsp) +vmovdqa %xmm12, 256(%rsp) +vmovdqa %xmm13, 272(%rsp) +vmovdqa %xmm14, 288(%rsp) +vmovdqa %xmm15, 304(%rsp) +vpbroadcastd 0(%rsp), %xmm8 +vpbroadcastd 4+0(%rsp), %xmm9 +vpbroadcastd 8+0(%rsp), %xmm10 +vpbroadcastd 12+0(%rsp), %xmm11 +vpbroadcastd 16(%rsp), %xmm12 +vpbroadcastd 4+16(%rsp), %xmm13 +vpbroadcastd 8+16(%rsp), %xmm14 +vpbroadcastd 12+16(%rsp), %xmm15 +vpaddd %xmm8, %xmm0, %xmm0 +vpaddd %xmm9, %xmm1, %xmm1 +vpaddd %xmm10, %xmm2, %xmm2 +vpaddd %xmm11, %xmm3, %xmm3 +vpaddd %xmm12, %xmm4, %xmm4 +vpaddd %xmm13, %xmm5, %xmm5 +vpaddd %xmm14, %xmm6, %xmm6 +vpaddd %xmm15, %xmm7, %xmm7 +vpunpckldq %xmm1, %xmm0, %xmm8 +vpunpckldq %xmm3, %xmm2, %xmm9 +vpunpckhdq %xmm1, %xmm0, %xmm12 +vpunpckhdq %xmm3, %xmm2, %xmm13 +vpunpckldq %xmm5, %xmm4, %xmm10 +vpunpckldq %xmm7, %xmm6, %xmm11 +vpunpckhdq %xmm5, %xmm4, %xmm14 +vpunpckhdq %xmm7, %xmm6, %xmm15 +vpunpcklqdq %xmm9, %xmm8, %xmm0 +vpunpcklqdq %xmm11, %xmm10, %xmm1 +vpunpckhqdq %xmm9, %xmm8, %xmm2 +vpunpckhqdq %xmm11, %xmm10, %xmm3 +vpunpcklqdq %xmm13, %xmm12, %xmm4 +vpunpcklqdq %xmm15, %xmm14, %xmm5 +vpunpckhqdq %xmm13, %xmm12, %xmm6 +vpunpckhqdq %xmm15, %xmm14, %xmm7 +andq %rsi, %rsi +jz chacha_blocks_avx2_noinput2 +vpxor 0(%rsi), %xmm0, %xmm0 +vpxor 16(%rsi), %xmm1, %xmm1 +vpxor 64(%rsi), %xmm2, %xmm2 +vpxor 80(%rsi), %xmm3, %xmm3 +vpxor 128(%rsi), %xmm4, %xmm4 +vpxor 144(%rsi), %xmm5, %xmm5 +vpxor 192(%rsi), %xmm6, %xmm6 +vpxor 208(%rsi), %xmm7, %xmm7 +vmovdqu %xmm0, 0(%rdx) +vmovdqu %xmm1, 16(%rdx) +vmovdqu %xmm2, 64(%rdx) +vmovdqu %xmm3, 80(%rdx) +vmovdqu %xmm4, 128(%rdx) +vmovdqu %xmm5, 144(%rdx) +vmovdqu %xmm6, 192(%rdx) +vmovdqu %xmm7, 208(%rdx) +vmovdqa 192(%rsp), %xmm0 +vmovdqa 208(%rsp), %xmm1 +vmovdqa 224(%rsp), %xmm2 +vmovdqa 240(%rsp), %xmm3 +vmovdqa 256(%rsp), %xmm4 +vmovdqa 272(%rsp), %xmm5 +vmovdqa 288(%rsp), %xmm6 +vmovdqa 304(%rsp), %xmm7 +vpbroadcastd 32(%rsp), %xmm8 
+vpbroadcastd 4+32(%rsp), %xmm9 +vpbroadcastd 8+32(%rsp), %xmm10 +vpbroadcastd 12+32(%rsp), %xmm11 +vmovdqa 128(%rsp), %xmm12 +vmovdqa 160(%rsp), %xmm13 +vpbroadcastd 8+48(%rsp), %xmm14 +vpbroadcastd 12+48(%rsp), %xmm15 +vpaddd %xmm8, %xmm0, %xmm0 +vpaddd %xmm9, %xmm1, %xmm1 +vpaddd %xmm10, %xmm2, %xmm2 +vpaddd %xmm11, %xmm3, %xmm3 +vpaddd %xmm12, %xmm4, %xmm4 +vpaddd %xmm13, %xmm5, %xmm5 +vpaddd %xmm14, %xmm6, %xmm6 +vpaddd %xmm15, %xmm7, %xmm7 +vpunpckldq %xmm1, %xmm0, %xmm8 +vpunpckldq %xmm3, %xmm2, %xmm9 +vpunpckhdq %xmm1, %xmm0, %xmm12 +vpunpckhdq %xmm3, %xmm2, %xmm13 +vpunpckldq %xmm5, %xmm4, %xmm10 +vpunpckldq %xmm7, %xmm6, %xmm11 +vpunpckhdq %xmm5, %xmm4, %xmm14 +vpunpckhdq %xmm7, %xmm6, %xmm15 +vpunpcklqdq %xmm9, %xmm8, %xmm0 +vpunpcklqdq %xmm11, %xmm10, %xmm1 +vpunpckhqdq %xmm9, %xmm8, %xmm2 +vpunpckhqdq %xmm11, %xmm10, %xmm3 +vpunpcklqdq %xmm13, %xmm12, %xmm4 +vpunpcklqdq %xmm15, %xmm14, %xmm5 +vpunpckhqdq %xmm13, %xmm12, %xmm6 +vpunpckhqdq %xmm15, %xmm14, %xmm7 +vpxor 32(%rsi), %xmm0, %xmm0 +vpxor 48(%rsi), %xmm1, %xmm1 +vpxor 96(%rsi), %xmm2, %xmm2 +vpxor 112(%rsi), %xmm3, %xmm3 +vpxor 160(%rsi), %xmm4, %xmm4 +vpxor 176(%rsi), %xmm5, %xmm5 +vpxor 224(%rsi), %xmm6, %xmm6 +vpxor 240(%rsi), %xmm7, %xmm7 +vmovdqu %xmm0, 32(%rdx) +vmovdqu %xmm1, 48(%rdx) +vmovdqu %xmm2, 96(%rdx) +vmovdqu %xmm3, 112(%rdx) +vmovdqu %xmm4, 160(%rdx) +vmovdqu %xmm5, 176(%rdx) +vmovdqu %xmm6, 224(%rdx) +vmovdqu %xmm7, 240(%rdx) +addq $256, %rsi +jmp chacha_blocks_avx2_mainloop2_cont +chacha_blocks_avx2_noinput2: +vmovdqu %xmm0, 0(%rdx) +vmovdqu %xmm1, 16(%rdx) +vmovdqu %xmm2, 64(%rdx) +vmovdqu %xmm3, 80(%rdx) +vmovdqu %xmm4, 128(%rdx) +vmovdqu %xmm5, 144(%rdx) +vmovdqu %xmm6, 192(%rdx) +vmovdqu %xmm7, 208(%rdx) +vmovdqa 192(%rsp), %xmm0 +vmovdqa 208(%rsp), %xmm1 +vmovdqa 224(%rsp), %xmm2 +vmovdqa 240(%rsp), %xmm3 +vmovdqa 256(%rsp), %xmm4 +vmovdqa 272(%rsp), %xmm5 +vmovdqa 288(%rsp), %xmm6 +vmovdqa 304(%rsp), %xmm7 +vpbroadcastd 32(%rsp), %xmm8 +vpbroadcastd 4+32(%rsp), %xmm9 +vpbroadcastd 8+32(%rsp), %xmm10 +vpbroadcastd 12+32(%rsp), %xmm11 +vmovdqa 128(%rsp), %xmm12 +vmovdqa 160(%rsp), %xmm13 +vpbroadcastd 8+48(%rsp), %xmm14 +vpbroadcastd 12+48(%rsp), %xmm15 +vpaddd %xmm8, %xmm0, %xmm0 +vpaddd %xmm9, %xmm1, %xmm1 +vpaddd %xmm10, %xmm2, %xmm2 +vpaddd %xmm11, %xmm3, %xmm3 +vpaddd %xmm12, %xmm4, %xmm4 +vpaddd %xmm13, %xmm5, %xmm5 +vpaddd %xmm14, %xmm6, %xmm6 +vpaddd %xmm15, %xmm7, %xmm7 +vpunpckldq %xmm1, %xmm0, %xmm8 +vpunpckldq %xmm3, %xmm2, %xmm9 +vpunpckhdq %xmm1, %xmm0, %xmm12 +vpunpckhdq %xmm3, %xmm2, %xmm13 +vpunpckldq %xmm5, %xmm4, %xmm10 +vpunpckldq %xmm7, %xmm6, %xmm11 +vpunpckhdq %xmm5, %xmm4, %xmm14 +vpunpckhdq %xmm7, %xmm6, %xmm15 +vpunpcklqdq %xmm9, %xmm8, %xmm0 +vpunpcklqdq %xmm11, %xmm10, %xmm1 +vpunpckhqdq %xmm9, %xmm8, %xmm2 +vpunpckhqdq %xmm11, %xmm10, %xmm3 +vpunpcklqdq %xmm13, %xmm12, %xmm4 +vpunpcklqdq %xmm15, %xmm14, %xmm5 +vpunpckhqdq %xmm13, %xmm12, %xmm6 +vpunpckhqdq %xmm15, %xmm14, %xmm7 +vmovdqu %xmm0, 32(%rdx) +vmovdqu %xmm1, 48(%rdx) +vmovdqu %xmm2, 96(%rdx) +vmovdqu %xmm3, 112(%rdx) +vmovdqu %xmm4, 160(%rdx) +vmovdqu %xmm5, 176(%rdx) +vmovdqu %xmm6, 224(%rdx) +vmovdqu %xmm7, 240(%rdx) +chacha_blocks_avx2_mainloop2_cont: +addq $256, %rdx +subq $256, %rcx +cmp $256, %rcx +jae chacha_blocks_avx2_atleast256 +chacha_blocks_avx2_below256_fixup: +vmovdqa 448(%rsp), %xmm6 +vmovdqa 480(%rsp), %xmm7 +vmovdqa 0(%rsp), %xmm8 +vmovdqa 16(%rsp), %xmm9 +vmovdqa 32(%rsp), %xmm10 +vmovdqa 48(%rsp), %xmm11 +movq $1, %r9 +chacha_blocks_avx2_below256: +vmovq %r9, %xmm5 +andq %rcx, %rcx +jz 
chacha_blocks_avx2_done +cmpq $64, %rcx +jae chacha_blocks_avx2_above63 +movq %rdx, %r9 +andq %rsi, %rsi +jz chacha_blocks_avx2_noinput3 +movq %rcx, %r10 +movq %rsp, %rdx +addq %r10, %rsi +addq %r10, %rdx +negq %r10 +chacha_blocks_avx2_copyinput: +movb (%rsi, %r10), %al +movb %al, (%rdx, %r10) +incq %r10 +jnz chacha_blocks_avx2_copyinput +movq %rsp, %rsi +chacha_blocks_avx2_noinput3: +movq %rsp, %rdx +chacha_blocks_avx2_above63: +vmovdqa %xmm8, %xmm0 +vmovdqa %xmm9, %xmm1 +vmovdqa %xmm10, %xmm2 +vmovdqa %xmm11, %xmm3 +movq 64(%rsp), %rax +chacha_blocks_avx2_mainloop3: +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm6, %xmm3, %xmm3 +vpaddd %xmm2, %xmm3, %xmm2 +vpxor %xmm1, %xmm2, %xmm1 +vpslld $12, %xmm1, %xmm4 +vpsrld $20, %xmm1, %xmm1 +vpxor %xmm1, %xmm4, %xmm1 +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm7, %xmm3, %xmm3 +vpshufd $0x93, %xmm0, %xmm0 +vpaddd %xmm2, %xmm3, %xmm2 +vpshufd $0x4e, %xmm3, %xmm3 +vpxor %xmm1, %xmm2, %xmm1 +vpshufd $0x39, %xmm2, %xmm2 +vpslld $7, %xmm1, %xmm4 +vpsrld $25, %xmm1, %xmm1 +vpxor %xmm1, %xmm4, %xmm1 +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm6, %xmm3, %xmm3 +vpaddd %xmm2, %xmm3, %xmm2 +vpxor %xmm1, %xmm2, %xmm1 +vpslld $12, %xmm1, %xmm4 +vpsrld $20, %xmm1, %xmm1 +vpxor %xmm1, %xmm4, %xmm1 +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm7, %xmm3, %xmm3 +vpshufd $0x39, %xmm0, %xmm0 +vpaddd %xmm2, %xmm3, %xmm2 +vpshufd $0x4e, %xmm3, %xmm3 +vpxor %xmm1, %xmm2, %xmm1 +vpshufd $0x93, %xmm2, %xmm2 +vpslld $7, %xmm1, %xmm4 +vpsrld $25, %xmm1, %xmm1 +vpxor %xmm1, %xmm4, %xmm1 +subq $2, %rax +jnz chacha_blocks_avx2_mainloop3 +vpaddd %xmm0, %xmm8, %xmm0 +vpaddd %xmm1, %xmm9, %xmm1 +vpaddd %xmm2, %xmm10, %xmm2 +vpaddd %xmm3, %xmm11, %xmm3 +andq %rsi, %rsi +jz chacha_blocks_avx2_noinput4 +vpxor 0(%rsi), %xmm0, %xmm0 +vpxor 16(%rsi), %xmm1, %xmm1 +vpxor 32(%rsi), %xmm2, %xmm2 +vpxor 48(%rsi), %xmm3, %xmm3 +addq $64, %rsi +chacha_blocks_avx2_noinput4: +vmovdqu %xmm0, 0(%rdx) +vmovdqu %xmm1, 16(%rdx) +vmovdqu %xmm2, 32(%rdx) +vmovdqu %xmm3, 48(%rdx) +vpaddq %xmm11, %xmm5, %xmm11 +cmpq $64, %rcx +jbe chacha_blocks_avx2_mainloop3_finishup +addq $64, %rdx +subq $64, %rcx +jmp chacha_blocks_avx2_below256 +chacha_blocks_avx2_mainloop3_finishup: +cmpq $64, %rcx +je chacha_blocks_avx2_done +addq %rcx, %r9 +addq %rcx, %rdx +negq %rcx +chacha_blocks_avx2_copyoutput: +movb (%rdx, %rcx), %al +movb %al, (%r9, %rcx) +incq %rcx +jnz chacha_blocks_avx2_copyoutput +chacha_blocks_avx2_done: +vmovdqu %xmm11, 32(%rdi) +movq %rbp, %rsp +popq %r14 +popq %r13 +popq %r12 +popq %rbp +popq %rbx +vzeroupper +ret +FN_END chacha_blocks_avx2 + + +GLOBAL_HIDDEN_FN hchacha_avx2 +hchacha_avx2_local: +LOAD_VAR_PIC chacha_constants, %rax +vmovdqa 0(%rax), %xmm0 +vmovdqa 16(%rax), %xmm6 +vmovdqa 32(%rax), %xmm5 +vmovdqu 0(%rdi), %xmm1 +vmovdqu 16(%rdi), %xmm2 +vmovdqu 0(%rsi), %xmm3 +hhacha_mainloop_avx2: +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm6, %xmm3, %xmm3 +vpaddd %xmm2, %xmm3, %xmm2 +vpxor %xmm1, %xmm2, %xmm1 +vpslld $12, %xmm1, %xmm4 +vpsrld $20, %xmm1, %xmm1 +vpxor %xmm1, %xmm4, %xmm1 +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm5, %xmm3, %xmm3 +vpaddd %xmm2, %xmm3, %xmm2 +vpxor %xmm1, %xmm2, %xmm1 +vpslld $7, %xmm1, %xmm4 +vpsrld $25, %xmm1, %xmm1 +vpshufd $0x93, %xmm0, %xmm0 +vpxor %xmm1, %xmm4, %xmm1 +vpshufd $0x4e, %xmm3, %xmm3 +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm6, %xmm3, %xmm3 +vpshufd $0x39, %xmm2, %xmm2 +vpaddd %xmm2, 
%xmm3, %xmm2 +vpxor %xmm1, %xmm2, %xmm1 +vpslld $12, %xmm1, %xmm4 +vpsrld $20, %xmm1, %xmm1 +vpxor %xmm1, %xmm4, %xmm1 +vpaddd %xmm0, %xmm1, %xmm0 +vpxor %xmm3, %xmm0, %xmm3 +vpshufb %xmm5, %xmm3, %xmm3 +vpaddd %xmm2, %xmm3, %xmm2 +vpxor %xmm1, %xmm2, %xmm1 +vpshufd $0x39, %xmm0, %xmm0 +vpslld $7, %xmm1, %xmm4 +vpshufd $0x4e, %xmm3, %xmm3 +vpsrld $25, %xmm1, %xmm1 +vpshufd $0x93, %xmm2, %xmm2 +vpxor %xmm1, %xmm4, %xmm1 +subl $2, %ecx +jne hhacha_mainloop_avx2 +vmovdqu %xmm0, (%rdx) +vmovdqu %xmm3, 16(%rdx) +ret +FN_END hchacha_avx2 + +GLOBAL_HIDDEN_FN_EXT chacha_avx2, 6, 16 +pushq %rbp +movq %rsp, %rbp +subq $64, %rsp +andq $~63, %rsp +vmovdqu 0(%rdi), %xmm0 +vmovdqu 16(%rdi), %xmm1 +vmovdqa %xmm0, 0(%rsp) +vmovdqa %xmm1, 16(%rsp) +xorq %rdi, %rdi +movq %rdi, 32(%rsp) +movq 0(%rsi), %rsi +movq %rsi, 40(%rsp) +movq %r9, 48(%rsp) +movq %rsp, %rdi +movq %rdx, %rsi +movq %rcx, %rdx +movq %r8, %rcx +call chacha_blocks_avx2_local +vpxor %xmm0, %xmm0, %xmm0 +vmovdqa %xmm0, 0(%rsp) +vmovdqa %xmm0, 16(%rsp) +vmovdqa %xmm0, 32(%rsp) +movq %rbp, %rsp +popq %rbp +ret +FN_END chacha_avx2 + +GLOBAL_HIDDEN_FN_EXT xchacha_avx2, 6, 16 +pushq %rbp +pushq %rbx +movq %rsp, %rbp +subq $64, %rsp +andq $~63, %rsp +movq %rsp, %rbx +xorq %rax, %rax +movq %rax, 32(%rbx) +movq 16(%rsi), %rax +movq %rax, 40(%rbx) +movq %r9, 48(%rbx) +pushq %rdx +pushq %rcx +pushq %r8 +movq %rbx, %rdx +movq %r9, %rcx +call hchacha_avx2_local +movq %rbx, %rdi +popq %rcx +popq %rdx +popq %rsi +call chacha_blocks_avx2_local +vpxor %xmm0, %xmm0, %xmm0 +vmovdqa %xmm0, 0(%rbx) +vmovdqa %xmm0, 16(%rbx) +vmovdqa %xmm0, 32(%rbx) +movq %rbp, %rsp +popq %rbx +popq %rbp +ret +FN_END xchacha_avx2 diff --git a/src/libcryptobox/chacha20/chacha.c b/src/libcryptobox/chacha20/chacha.c new file mode 100644 index 0000000..0b471c8 --- /dev/null +++ b/src/libcryptobox/chacha20/chacha.c @@ -0,0 +1,262 @@ +/* Copyright (c) 2015, Vsevolod Stakhov + * Copyright (c) 2015, Andrew Moon + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "cryptobox.h" +#include "chacha.h" +#include "platform_config.h" + +extern unsigned cpu_config; + +typedef struct chacha_impl_t { + unsigned long cpu_flags; + const char *desc; + void (*chacha)(const chacha_key *key, const chacha_iv *iv, + const unsigned char *in, unsigned char *out, size_t inlen, + size_t rounds); + void (*xchacha)(const chacha_key *key, const chacha_iv24 *iv, + const unsigned char *in, unsigned char *out, size_t inlen, + size_t rounds); + void (*chacha_blocks)(chacha_state_internal *state, + const unsigned char *in, unsigned char *out, size_t bytes); + void (*hchacha)(const unsigned char key[32], const unsigned char iv[16], + unsigned char out[32], size_t rounds); +} chacha_impl_t; + +#define CHACHA_DECLARE(ext) \ + void chacha_##ext(const chacha_key *key, const chacha_iv *iv, const unsigned char *in, unsigned char *out, size_t inlen, size_t rounds); \ + void xchacha_##ext(const chacha_key *key, const chacha_iv24 *iv, const unsigned char *in, unsigned char *out, size_t inlen, size_t rounds); \ + void chacha_blocks_##ext(chacha_state_internal *state, const unsigned char *in, unsigned char *out, size_t bytes); \ + void hchacha_##ext(const unsigned char key[32], const unsigned char iv[16], unsigned char out[32], size_t rounds); +#define CHACHA_IMPL(cpuflags, desc, ext) \ + { \ + (cpuflags), desc, chacha_##ext, xchacha_##ext, chacha_blocks_##ext, hchacha_##ext \ + } + +#if defined(HAVE_AVX2) && defined(__x86_64__) +CHACHA_DECLARE(avx2) +#define CHACHA_AVX2 CHACHA_IMPL(CPUID_AVX2, "avx2", avx2) +#endif +#if defined(HAVE_AVX) && defined(__x86_64__) +CHACHA_DECLARE(avx) +#define CHACHA_AVX CHACHA_IMPL(CPUID_AVX, "avx", avx) +#endif +#if defined(HAVE_SSE2) && defined(__x86_64__) +CHACHA_DECLARE(sse2) +#define CHACHA_SSE2 CHACHA_IMPL(CPUID_SSE2, "sse2", sse2) +#endif + +CHACHA_DECLARE(ref) +#define CHACHA_GENERIC CHACHA_IMPL(0, "generic", ref) + +static const chacha_impl_t chacha_list[] = { + CHACHA_GENERIC, +#if defined(CHACHA_AVX2) && defined(__x86_64__) + CHACHA_AVX2, +#endif +#if defined(CHACHA_AVX) && defined(__x86_64__) + CHACHA_AVX, +#endif +#if defined(CHACHA_SSE2) && defined(__x86_64__) + CHACHA_SSE2 +#endif +}; + +static const chacha_impl_t *chacha_impl = &chacha_list[0]; + +static int +chacha_is_aligned(const void *p) +{ + return ((size_t) p & (sizeof(size_t) - 1)) == 0; +} + +const char * +chacha_load(void) +{ + guint i; + + if (cpu_config != 0) { + for (i = 0; i < G_N_ELEMENTS(chacha_list); i++) { + if (chacha_list[i].cpu_flags & cpu_config) { + chacha_impl = &chacha_list[i]; + break; + } + } + } + + return chacha_impl->desc; +} + +void chacha_init(chacha_state *S, const chacha_key *key, + const chacha_iv *iv, size_t rounds) +{ + chacha_state_internal *state = (chacha_state_internal *) S; + memcpy(state->s + 0, key, 32); + memset(state->s + 32, 0, 8); + memcpy(state->s + 40, iv, 8); + state->rounds = rounds; + state->leftover = 0; +} + +/* processes inlen bytes (can do partial blocks), handling input/output alignment */ +static void +chacha_consume(chacha_state_internal *state, + const unsigned char *in, unsigned char *out, size_t inlen) +{ + unsigned char buffer[16 * CHACHA_BLOCKBYTES]; + int in_aligned, out_aligned; + + /* it's ok to call with 0 bytes */ + if (!inlen) + return; + + /* if everything is aligned, handle directly */ + in_aligned = chacha_is_aligned(in); + out_aligned = chacha_is_aligned(out); + if (in_aligned && out_aligned) { + chacha_impl->chacha_blocks(state, in, out, inlen); + return; + } + + /* copy the 
unaligned data to an aligned buffer and process in chunks */ + while (inlen) { + const size_t bytes = (inlen > sizeof(buffer)) ? sizeof(buffer) : inlen; + const unsigned char *src = in; + unsigned char *dst = (out_aligned) ? out : buffer; + if (!in_aligned) { + memcpy(buffer, in, bytes); + src = buffer; + } + chacha_impl->chacha_blocks(state, src, dst, bytes); + if (!out_aligned) + memcpy(out, buffer, bytes); + if (in) + in += bytes; + out += bytes; + inlen -= bytes; + } +} + +/* hchacha */ +void hchacha(const unsigned char key[32], + const unsigned char iv[16], unsigned char out[32], size_t rounds) +{ + chacha_impl->hchacha(key, iv, out, rounds); +} + +/* update, returns number of bytes written to out */ +size_t +chacha_update(chacha_state *S, const unsigned char *in, unsigned char *out, + size_t inlen) +{ + chacha_state_internal *state = (chacha_state_internal *) S; + unsigned char *out_start = out; + size_t bytes; + + /* enough for at least one block? */ + while ((state->leftover + inlen) >= CHACHA_BLOCKBYTES) { + /* handle the previous data */ + if (state->leftover) { + bytes = (CHACHA_BLOCKBYTES - state->leftover); + if (in) { + memcpy(state->buffer + state->leftover, in, bytes); + in += bytes; + } + chacha_consume(state, (in) ? state->buffer : NULL, out, + CHACHA_BLOCKBYTES); + inlen -= bytes; + out += CHACHA_BLOCKBYTES; + state->leftover = 0; + } + + /* handle the direct data */ + bytes = (inlen & ~(CHACHA_BLOCKBYTES - 1)); + if (bytes) { + chacha_consume(state, in, out, bytes); + inlen -= bytes; + if (in) + in += bytes; + out += bytes; + } + } + + /* handle leftover data */ + if (inlen) { + if (in) + memcpy(state->buffer + state->leftover, in, inlen); + else + memset(state->buffer + state->leftover, 0, inlen); + state->leftover += inlen; + } + + return out - out_start; +} + +/* finalize, write out any leftover data */ +size_t +chacha_final(chacha_state *S, unsigned char *out) +{ + chacha_state_internal *state = (chacha_state_internal *) S; + size_t leftover = state->leftover; + if (leftover) { + if (chacha_is_aligned(out)) { + chacha_impl->chacha_blocks(state, state->buffer, out, leftover); + } + else { + chacha_impl->chacha_blocks(state, state->buffer, state->buffer, + leftover); + memcpy(out, state->buffer, leftover); + } + } + rspamd_explicit_memzero(S, sizeof(chacha_state)); + return leftover; +} + +/* one-shot, input/output assumed to be word aligned */ +void chacha(const chacha_key *key, const chacha_iv *iv, + const unsigned char *in, unsigned char *out, size_t inlen, + size_t rounds) +{ + chacha_impl->chacha(key, iv, in, out, inlen, rounds); +} + +/* + xchacha, chacha with a 192 bit nonce + */ + +void xchacha_init(chacha_state *S, const chacha_key *key, + const chacha_iv24 *iv, size_t rounds) +{ + chacha_key subkey; + hchacha(key->b, iv->b, subkey.b, rounds); + chacha_init(S, &subkey, (chacha_iv *) (iv->b + 16), rounds); +} + +/* one-shot, input/output assumed to be word aligned */ +void xchacha(const chacha_key *key, const chacha_iv24 *iv, + const unsigned char *in, unsigned char *out, size_t inlen, + size_t rounds) +{ + chacha_impl->xchacha(key, iv, in, out, inlen, rounds); +} diff --git a/src/libcryptobox/chacha20/chacha.h b/src/libcryptobox/chacha20/chacha.h new file mode 100644 index 0000000..d05088a --- /dev/null +++ b/src/libcryptobox/chacha20/chacha.h @@ -0,0 +1,87 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015 Andrew Moon, Vsevolod Stakhov + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and 
associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + + +#ifndef CHACHA_H_ +#define CHACHA_H_ + + +#define CHACHA_BLOCKBYTES 64 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct chacha_state_internal_t { + unsigned char s[48]; + size_t rounds; + size_t leftover; + unsigned char buffer[CHACHA_BLOCKBYTES]; +} chacha_state_internal; + +typedef struct chacha_state_t { + unsigned char opaque[128]; +} chacha_state; + +typedef struct chacha_key_t { + unsigned char b[32]; +} chacha_key; + +typedef struct chacha_iv_t { + unsigned char b[8]; +} chacha_iv; + +typedef struct chacha_iv24_t { + unsigned char b[24]; +} chacha_iv24; + +void hchacha(const unsigned char key[32], const unsigned char iv[16], + unsigned char out[32], size_t rounds); + +void chacha_init(chacha_state *S, const chacha_key *key, const chacha_iv *iv, + size_t rounds); + +void xchacha_init(chacha_state *S, const chacha_key *key, + const chacha_iv24 *iv, size_t rounds); + +size_t chacha_update(chacha_state *S, const unsigned char *in, + unsigned char *out, size_t inlen); + +size_t chacha_final(chacha_state *S, unsigned char *out); + +void chacha(const chacha_key *key, const chacha_iv *iv, + const unsigned char *in, unsigned char *out, size_t inlen, + size_t rounds); + +void xchacha(const chacha_key *key, const chacha_iv24 *iv, + const unsigned char *in, unsigned char *out, size_t inlen, + size_t rounds); + +const char *chacha_load(void); + +#ifdef __cplusplus +} +#endif + +#endif /* CHACHA_H_ */ diff --git a/src/libcryptobox/chacha20/constants.S b/src/libcryptobox/chacha20/constants.S new file mode 100644 index 0000000..ff109a3 --- /dev/null +++ b/src/libcryptobox/chacha20/constants.S @@ -0,0 +1,6 @@ +SECTION_RODATA +.p2align 4,,15 +chacha_constants: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 /* "expand 32-byte k" */ +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */ +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */ diff --git a/src/libcryptobox/chacha20/ref.c b/src/libcryptobox/chacha20/ref.c new file mode 100644 index 0000000..ee646db --- /dev/null +++ b/src/libcryptobox/chacha20/ref.c @@ -0,0 +1,272 @@ +#include "config.h" +#include "chacha.h" +#include "cryptobox.h" + +#if defined(HAVE_INT32) +typedef uint32_t chacha_int32; +#else +typedef guint32 chacha_int32; +#endif + +/* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */ +static chacha_int32 +U8TO32(const unsigned char *p) +{ + return (((chacha_int32) (p[0])) | + ((chacha_int32) (p[1]) << 8) | + ((chacha_int32) (p[2]) << 16) | + ((chacha_int32) (p[3]) << 24)); +} 
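
The two .byte tables in constants.S above are pshufb masks that implement the quarter-round's 16-bit and 8-bit left rotations as byte permutations, one shuffle instead of a shift/shift/xor triple per lane. A scalar C sketch of that equivalence, assuming a little-endian host as on x86; shuffle_lane() and the 4-byte index arrays are illustrative helpers, not part of the source:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    static uint32_t rotl32(uint32_t x, int k)
    {
        return (x << k) | (x >> (32 - k));
    }

    /* apply one 4-byte lane of a pshufb-style index table to a 32-bit value */
    static uint32_t shuffle_lane(uint32_t x, const unsigned char idx[4])
    {
        unsigned char in[4], out[4];
        uint32_t r;
        memcpy(in, &x, 4);               /* little-endian byte view */
        for (int i = 0; i < 4; i++)
            out[i] = in[idx[i]];
        memcpy(&r, out, 4);
        return r;
    }

    int main(void)
    {
        const unsigned char rot16[4] = {2, 3, 0, 1}; /* first lane of the rotate-16 table */
        const unsigned char rot8[4] = {3, 0, 1, 2};  /* first lane of the rotate-8 table */
        uint32_t x = 0x12345678;
        assert(shuffle_lane(x, rot16) == rotl32(x, 16));
        assert(shuffle_lane(x, rot8) == rotl32(x, 8));
        return 0;
    }

The remaining rotations by 12 and 7 are not byte-aligned, which is why the assembly files perform them with vpslld/vpsrld/vpxor sequences rather than vpshufb.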
+
+/* store a 32 bit unsigned integer as four 8 bit unsigned integers in little endian */
+static void
+U32TO8(unsigned char *p, chacha_int32 v)
+{
+	p[0] = (v) & 0xff;
+	p[1] = (v >> 8) & 0xff;
+	p[2] = (v >> 16) & 0xff;
+	p[3] = (v >> 24) & 0xff;
+}
+
+/* 32 bit left rotate */
+static chacha_int32
+ROTL32(chacha_int32 x, int k)
+{
+	return ((x << k) | (x >> (32 - k))) & 0xffffffff;
+}
+
+/* "expand 32-byte k", as 4 little endian 32-bit unsigned integers */
+static const chacha_int32 chacha_constants[4] = {
+	0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};
+
+void chacha_blocks_ref(chacha_state_internal *state, const unsigned char *in, unsigned char *out, size_t bytes)
+{
+	chacha_int32 x[16], j[12];
+	chacha_int32 t;
+	unsigned char *ctarget = out, tmp[64];
+	size_t i, r;
+
+	if (!bytes) return;
+
+	j[0] = U8TO32(state->s + 0);
+	j[1] = U8TO32(state->s + 4);
+	j[2] = U8TO32(state->s + 8);
+	j[3] = U8TO32(state->s + 12);
+	j[4] = U8TO32(state->s + 16);
+	j[5] = U8TO32(state->s + 20);
+	j[6] = U8TO32(state->s + 24);
+	j[7] = U8TO32(state->s + 28);
+	j[8] = U8TO32(state->s + 32);
+	j[9] = U8TO32(state->s + 36);
+	j[10] = U8TO32(state->s + 40);
+	j[11] = U8TO32(state->s + 44);
+
+	r = state->rounds;
+
+	for (;;) {
+		if (bytes < 64) {
+			if (in) {
+				for (i = 0; i < bytes; i++) tmp[i] = in[i];
+				in = tmp;
+			}
+			ctarget = out;
+			out = tmp;
+		}
+
+		x[0] = chacha_constants[0];
+		x[1] = chacha_constants[1];
+		x[2] = chacha_constants[2];
+		x[3] = chacha_constants[3];
+		x[4] = j[0];
+		x[5] = j[1];
+		x[6] = j[2];
+		x[7] = j[3];
+		x[8] = j[4];
+		x[9] = j[5];
+		x[10] = j[6];
+		x[11] = j[7];
+		x[12] = j[8];
+		x[13] = j[9];
+		x[14] = j[10];
+		x[15] = j[11];
+
+#define quarter(a, b, c, d) \
+	a += b; \
+	t = d ^ a; \
+	d = ROTL32(t, 16); \
+	c += d; \
+	t = b ^ c; \
+	b = ROTL32(t, 12); \
+	a += b; \
+	t = d ^ a; \
+	d = ROTL32(t, 8); \
+	c += d; \
+	t = b ^ c; \
+	b = ROTL32(t, 7);
+
+#define doubleround() \
+	quarter(x[0], x[4], x[8], x[12]) \
+	quarter(x[1], x[5], x[9], x[13]) \
+	quarter(x[2], x[6], x[10], x[14]) \
+	quarter(x[3], x[7], x[11], x[15]) \
+	quarter(x[0], x[5], x[10], x[15]) \
+	quarter(x[1], x[6], x[11], x[12]) \
+	quarter(x[2], x[7], x[8], x[13]) \
+	quarter(x[3], x[4], x[9], x[14])
+
+		i = r;
+		do {
+			doubleround()
+			i -= 2;
+		} while (i);
+
+		x[0] += chacha_constants[0];
+		x[1] += chacha_constants[1];
+		x[2] += chacha_constants[2];
+		x[3] += chacha_constants[3];
+		x[4] += j[0];
+		x[5] += j[1];
+		x[6] += j[2];
+		x[7] += j[3];
+		x[8] += j[4];
+		x[9] += j[5];
+		x[10] += j[6];
+		x[11] += j[7];
+		x[12] += j[8];
+		x[13] += j[9];
+		x[14] += j[10];
+		x[15] += j[11];
+
+		if (in) {
+			U32TO8(out + 0, x[0] ^ U8TO32(in + 0));
+			U32TO8(out + 4, x[1] ^ U8TO32(in + 4));
+			U32TO8(out + 8, x[2] ^ U8TO32(in + 8));
+			U32TO8(out + 12, x[3] ^ U8TO32(in + 12));
+			U32TO8(out + 16, x[4] ^ U8TO32(in + 16));
+			U32TO8(out + 20, x[5] ^ U8TO32(in + 20));
+			U32TO8(out + 24, x[6] ^ U8TO32(in + 24));
+			U32TO8(out + 28, x[7] ^ U8TO32(in + 28));
+			U32TO8(out + 32, x[8] ^ U8TO32(in + 32));
+			U32TO8(out + 36, x[9] ^ U8TO32(in + 36));
+			U32TO8(out + 40, x[10] ^ U8TO32(in + 40));
+			U32TO8(out + 44, x[11] ^ U8TO32(in + 44));
+			U32TO8(out + 48, x[12] ^ U8TO32(in + 48));
+			U32TO8(out + 52, x[13] ^ U8TO32(in + 52));
+			U32TO8(out + 56, x[14] ^ U8TO32(in + 56));
+			U32TO8(out + 60, x[15] ^ U8TO32(in + 60));
+			in += 64;
+		}
+		else {
+			U32TO8(out + 0, x[0]);
+			U32TO8(out + 4, x[1]);
+			U32TO8(out + 8, x[2]);
+			U32TO8(out + 12, x[3]);
+			U32TO8(out + 16, x[4]);
+			U32TO8(out + 20, x[5]);
+			U32TO8(out + 24, x[6]);
+			U32TO8(out + 28, x[7]);
+			U32TO8(out + 32, x[8]);
+			U32TO8(out + 36, x[9]);
+			U32TO8(out + 40, x[10]);
+			U32TO8(out + 44, x[11]);
+			U32TO8(out + 48, x[12]);
+			U32TO8(out + 52, x[13]);
+			U32TO8(out + 56, x[14]);
+			U32TO8(out + 60, x[15]);
+		}
+
+		/* increment the 64 bit counter, split into two 32 bit halves */
+		j[8]++;
+		if (!j[8])
+			j[9]++;
+
+		if (bytes <= 64) {
+			if (bytes < 64)
+				for (i = 0; i < bytes; i++) ctarget[i] = out[i];
+
+			/* store the counter back to the state */
+			U32TO8(state->s + 32, j[8]);
+			U32TO8(state->s + 36, j[9]);
+			goto cleanup;
+		}
+		bytes -= 64;
+		out += 64;
+	}
+
+cleanup:
+	rspamd_explicit_memzero(j, sizeof(j));
+}
+
+void hchacha_ref(const unsigned char key[32], const unsigned char iv[16], unsigned char out[32], size_t rounds)
+{
+	chacha_int32 x[16];
+	chacha_int32 t;
+
+	x[0] = chacha_constants[0];
+	x[1] = chacha_constants[1];
+	x[2] = chacha_constants[2];
+	x[3] = chacha_constants[3];
+	x[4] = U8TO32(key + 0);
+	x[5] = U8TO32(key + 4);
+	x[6] = U8TO32(key + 8);
+	x[7] = U8TO32(key + 12);
+	x[8] = U8TO32(key + 16);
+	x[9] = U8TO32(key + 20);
+	x[10] = U8TO32(key + 24);
+	x[11] = U8TO32(key + 28);
+	x[12] = U8TO32(iv + 0);
+	x[13] = U8TO32(iv + 4);
+	x[14] = U8TO32(iv + 8);
+	x[15] = U8TO32(iv + 12);
+
+	do {
+		doubleround()
+		rounds -= 2;
+	} while (rounds);
+
+	/* indices for the chacha constant */
+	U32TO8(out + 0, x[0]);
+	U32TO8(out + 4, x[1]);
+	U32TO8(out + 8, x[2]);
+	U32TO8(out + 12, x[3]);
+
+	/* indices for the iv */
+	U32TO8(out + 16, x[12]);
+	U32TO8(out + 20, x[13]);
+	U32TO8(out + 24, x[14]);
+	U32TO8(out + 28, x[15]);
+}
+
+void chacha_clear_state_ref(chacha_state_internal *state)
+{
+	rspamd_explicit_memzero(state, 48);
+}
+
+void chacha_ref(const chacha_key *key, const chacha_iv *iv, const unsigned char *in, unsigned char *out, size_t inlen, size_t rounds)
+{
+	chacha_state_internal state;
+	size_t i;
+	for (i = 0; i < 32; i++)
+		state.s[i + 0] = key->b[i];
+	for (i = 0; i < 8; i++)
+		state.s[i + 32] = 0;
+	for (i = 0; i < 8; i++)
+		state.s[i + 40] = iv->b[i];
+	state.rounds = rounds;
+	chacha_blocks_ref(&state, in, out, inlen);
+	chacha_clear_state_ref(&state);
+}
+
+void xchacha_ref(const chacha_key *key, const chacha_iv24 *iv, const unsigned char *in, unsigned char *out, size_t inlen, size_t rounds)
+{
+	chacha_state_internal state;
+	size_t i;
+	hchacha_ref(key->b, iv->b, &state.s[0], rounds);
+	for (i = 0; i < 8; i++)
+		state.s[i + 32] = 0;
+	for (i = 0; i < 8; i++)
+		state.s[i + 40] = iv->b[i + 16];
+	state.rounds = rounds;
+	chacha_blocks_ref(&state, in, out, inlen);
+	chacha_clear_state_ref(&state);
+}
diff --git a/src/libcryptobox/chacha20/sse2.S b/src/libcryptobox/chacha20/sse2.S
new file mode 100644
index 0000000..a91d095
--- /dev/null
+++ b/src/libcryptobox/chacha20/sse2.S
@@ -0,0 +1,734 @@
+#include "../macro.S"
+#include "constants.S"
+SECTION_TEXT
+
+GLOBAL_HIDDEN_FN chacha_blocks_sse2
+chacha_blocks_sse2_local:
+pushq %rbx
+pushq %rbp
+movq %rsp, %rbp
+andq $~63, %rsp
+subq $512, %rsp
+movq $0x3320646e61707865, %rax
+movq $0x6b20657479622d32, %r8
+movd %rax, %xmm8
+movd %r8, %xmm14
+punpcklqdq %xmm14, %xmm8
+movdqu 0(%rdi), %xmm9
+movdqu 16(%rdi), %xmm10
+movdqu 32(%rdi), %xmm11
+movq 48(%rdi), %rax
+movq $1, %r9
+movdqa %xmm8, 0(%rsp)
+movdqa %xmm9, 16(%rsp)
+movdqa %xmm10, 32(%rsp)
+movdqa %xmm11, 48(%rsp)
+movq %rax, 64(%rsp)
+cmpq $256, %rcx
+jb chacha_blocks_sse2_below256
+pshufd $0x00, %xmm8, %xmm0
+pshufd $0x55, %xmm8, %xmm1
+pshufd $0xaa, %xmm8, %xmm2
+pshufd $0xff, %xmm8, %xmm3
+movdqa %xmm0, 128(%rsp)
+movdqa %xmm1, 144(%rsp)
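Editor's note: the pshufd broadcasts here set up the 256-byte fast path: each xmm register holds the same state word taken from four consecutive blocks, so every paddd/pxor in the main loop advances four blocks at once. A hedged C sketch of one such vectorized quarter round follows (SSE2 intrinsics; rotl32x4 and quarterround_x4 are illustrative names, not from this diff):

#include <emmintrin.h>

/* Rotate each 32-bit lane left by n; SSE2 has no rotate instruction,
   so the assembly above uses exactly this shift-pair pattern. */
static __m128i
rotl32x4(__m128i v, int n)
{
	return _mm_or_si128(_mm_slli_epi32(v, n), _mm_srli_epi32(v, 32 - n));
}

/* One ChaCha quarter round applied to four blocks in parallel;
   a, b, c, d each carry one state word from four different blocks. */
static void
quarterround_x4(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
{
	*a = _mm_add_epi32(*a, *b); *d = rotl32x4(_mm_xor_si128(*d, *a), 16);
	*c = _mm_add_epi32(*c, *d); *b = rotl32x4(_mm_xor_si128(*b, *c), 12);
	*a = _mm_add_epi32(*a, *b); *d = rotl32x4(_mm_xor_si128(*d, *a), 8);
	*c = _mm_add_epi32(*c, *d); *b = rotl32x4(_mm_xor_si128(*b, *c), 7);
}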
+movdqa %xmm2, 160(%rsp) +movdqa %xmm3, 176(%rsp) +pshufd $0x00, %xmm9, %xmm0 +pshufd $0x55, %xmm9, %xmm1 +pshufd $0xaa, %xmm9, %xmm2 +pshufd $0xff, %xmm9, %xmm3 +movdqa %xmm0, 192(%rsp) +movdqa %xmm1, 208(%rsp) +movdqa %xmm2, 224(%rsp) +movdqa %xmm3, 240(%rsp) +pshufd $0x00, %xmm10, %xmm0 +pshufd $0x55, %xmm10, %xmm1 +pshufd $0xaa, %xmm10, %xmm2 +pshufd $0xff, %xmm10, %xmm3 +movdqa %xmm0, 256(%rsp) +movdqa %xmm1, 272(%rsp) +movdqa %xmm2, 288(%rsp) +movdqa %xmm3, 304(%rsp) +pshufd $0xaa, %xmm11, %xmm0 +pshufd $0xff, %xmm11, %xmm1 +movdqa %xmm0, 352(%rsp) +movdqa %xmm1, 368(%rsp) +jmp chacha_blocks_sse2_atleast256 +.p2align 6,,63 +chacha_blocks_sse2_atleast256: +movq 48(%rsp), %rax +leaq 1(%rax), %r8 +leaq 2(%rax), %r9 +leaq 3(%rax), %r10 +leaq 4(%rax), %rbx +movl %eax, 320(%rsp) +movl %r8d, 4+320(%rsp) +movl %r9d, 8+320(%rsp) +movl %r10d, 12+320(%rsp) +shrq $32, %rax +shrq $32, %r8 +shrq $32, %r9 +shrq $32, %r10 +movl %eax, 336(%rsp) +movl %r8d, 4+336(%rsp) +movl %r9d, 8+336(%rsp) +movl %r10d, 12+336(%rsp) +movq %rbx, 48(%rsp) +movq 64(%rsp), %rax +movdqa 128(%rsp), %xmm0 +movdqa 144(%rsp), %xmm1 +movdqa 160(%rsp), %xmm2 +movdqa 176(%rsp), %xmm3 +movdqa 192(%rsp), %xmm4 +movdqa 208(%rsp), %xmm5 +movdqa 224(%rsp), %xmm6 +movdqa 240(%rsp), %xmm7 +movdqa 256(%rsp), %xmm8 +movdqa 272(%rsp), %xmm9 +movdqa 288(%rsp), %xmm10 +movdqa 304(%rsp), %xmm11 +movdqa 320(%rsp), %xmm12 +movdqa 336(%rsp), %xmm13 +movdqa 352(%rsp), %xmm14 +movdqa 368(%rsp), %xmm15 +chacha_blocks_sse2_mainloop1: +paddd %xmm4, %xmm0 +paddd %xmm5, %xmm1 +pxor %xmm0, %xmm12 +pxor %xmm1, %xmm13 +paddd %xmm6, %xmm2 +paddd %xmm7, %xmm3 +movdqa %xmm6, 96(%rsp) +pxor %xmm2, %xmm14 +pxor %xmm3, %xmm15 +pshuflw $0xb1,%xmm12,%xmm12 +pshufhw $0xb1,%xmm12,%xmm12 +pshuflw $0xb1,%xmm13,%xmm13 +pshufhw $0xb1,%xmm13,%xmm13 +pshuflw $0xb1,%xmm14,%xmm14 +pshufhw $0xb1,%xmm14,%xmm14 +pshuflw $0xb1,%xmm15,%xmm15 +pshufhw $0xb1,%xmm15,%xmm15 +paddd %xmm12, %xmm8 +paddd %xmm13, %xmm9 +paddd %xmm14, %xmm10 +paddd %xmm15, %xmm11 +movdqa %xmm12, 112(%rsp) +pxor %xmm8, %xmm4 +pxor %xmm9, %xmm5 +movdqa 96(%rsp), %xmm6 +movdqa %xmm4, %xmm12 +pslld $ 12, %xmm4 +psrld $20, %xmm12 +pxor %xmm12, %xmm4 +movdqa %xmm5, %xmm12 +pslld $ 12, %xmm5 +psrld $20, %xmm12 +pxor %xmm12, %xmm5 +pxor %xmm10, %xmm6 +pxor %xmm11, %xmm7 +movdqa %xmm6, %xmm12 +pslld $ 12, %xmm6 +psrld $20, %xmm12 +pxor %xmm12, %xmm6 +movdqa %xmm7, %xmm12 +pslld $ 12, %xmm7 +psrld $20, %xmm12 +pxor %xmm12, %xmm7 +movdqa 112(%rsp), %xmm12 +paddd %xmm4, %xmm0 +paddd %xmm5, %xmm1 +pxor %xmm0, %xmm12 +pxor %xmm1, %xmm13 +paddd %xmm6, %xmm2 +paddd %xmm7, %xmm3 +movdqa %xmm6, 96(%rsp) +pxor %xmm2, %xmm14 +pxor %xmm3, %xmm15 +movdqa %xmm12, %xmm6 +pslld $ 8, %xmm12 +psrld $24, %xmm6 +pxor %xmm6, %xmm12 +movdqa %xmm13, %xmm6 +pslld $ 8, %xmm13 +psrld $24, %xmm6 +pxor %xmm6, %xmm13 +paddd %xmm12, %xmm8 +paddd %xmm13, %xmm9 +movdqa %xmm14, %xmm6 +pslld $ 8, %xmm14 +psrld $24, %xmm6 +pxor %xmm6, %xmm14 +movdqa %xmm15, %xmm6 +pslld $ 8, %xmm15 +psrld $24, %xmm6 +pxor %xmm6, %xmm15 +paddd %xmm14, %xmm10 +paddd %xmm15, %xmm11 +movdqa %xmm12, 112(%rsp) +pxor %xmm8, %xmm4 +pxor %xmm9, %xmm5 +movdqa 96(%rsp), %xmm6 +movdqa %xmm4, %xmm12 +pslld $ 7, %xmm4 +psrld $25, %xmm12 +pxor %xmm12, %xmm4 +movdqa %xmm5, %xmm12 +pslld $ 7, %xmm5 +psrld $25, %xmm12 +pxor %xmm12, %xmm5 +pxor %xmm10, %xmm6 +pxor %xmm11, %xmm7 +movdqa %xmm6, %xmm12 +pslld $ 7, %xmm6 +psrld $25, %xmm12 +pxor %xmm12, %xmm6 +movdqa %xmm7, %xmm12 +pslld $ 7, %xmm7 +psrld $25, %xmm12 +pxor %xmm12, %xmm7 +movdqa 112(%rsp), %xmm12 +paddd %xmm5, %xmm0 
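/* Note on the rotations in the four-block loop above: the rotate by 16 is
   implemented as a pshuflw/pshufhw pair with mask 0xb1, which swaps the two
   16-bit halves of every 32-bit lane (rotl by 16 == 16-bit word swap).  The
   rotates by 12, 8 and 7 have no SSE2 shuffle equivalent, so each is a
   pslld/psrld pair whose disjoint halves are combined with pxor via a
   scratch register.  With all sixteen xmm registers live, %xmm12 and one of
   %xmm6/%xmm7 are spilled to 112(%rsp) and 96(%rsp) around each half round. */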
+paddd %xmm6, %xmm1 +pxor %xmm0, %xmm15 +pxor %xmm1, %xmm12 +paddd %xmm7, %xmm2 +paddd %xmm4, %xmm3 +movdqa %xmm7, 96(%rsp) +pxor %xmm2, %xmm13 +pxor %xmm3, %xmm14 +pshuflw $0xb1,%xmm15,%xmm15 +pshufhw $0xb1,%xmm15,%xmm15 +pshuflw $0xb1,%xmm12,%xmm12 +pshufhw $0xb1,%xmm12,%xmm12 +pshuflw $0xb1,%xmm13,%xmm13 +pshufhw $0xb1,%xmm13,%xmm13 +pshuflw $0xb1,%xmm14,%xmm14 +pshufhw $0xb1,%xmm14,%xmm14 +paddd %xmm15, %xmm10 +paddd %xmm12, %xmm11 +paddd %xmm13, %xmm8 +paddd %xmm14, %xmm9 +movdqa %xmm15, 112(%rsp) +pxor %xmm10, %xmm5 +pxor %xmm11, %xmm6 +movdqa 96(%rsp), %xmm7 +movdqa %xmm5, %xmm15 +pslld $ 12, %xmm5 +psrld $20, %xmm15 +pxor %xmm15, %xmm5 +movdqa %xmm6, %xmm15 +pslld $ 12, %xmm6 +psrld $20, %xmm15 +pxor %xmm15, %xmm6 +pxor %xmm8, %xmm7 +pxor %xmm9, %xmm4 +movdqa %xmm7, %xmm15 +pslld $ 12, %xmm7 +psrld $20, %xmm15 +pxor %xmm15, %xmm7 +movdqa %xmm4, %xmm15 +pslld $ 12, %xmm4 +psrld $20, %xmm15 +pxor %xmm15, %xmm4 +movdqa 112(%rsp), %xmm15 +paddd %xmm5, %xmm0 +paddd %xmm6, %xmm1 +pxor %xmm0, %xmm15 +pxor %xmm1, %xmm12 +paddd %xmm7, %xmm2 +paddd %xmm4, %xmm3 +movdqa %xmm7, 96(%rsp) +pxor %xmm2, %xmm13 +pxor %xmm3, %xmm14 +movdqa %xmm15, %xmm7 +pslld $ 8, %xmm15 +psrld $24, %xmm7 +pxor %xmm7, %xmm15 +movdqa %xmm12, %xmm7 +pslld $ 8, %xmm12 +psrld $24, %xmm7 +pxor %xmm7, %xmm12 +paddd %xmm15, %xmm10 +paddd %xmm12, %xmm11 +movdqa %xmm13, %xmm7 +pslld $ 8, %xmm13 +psrld $24, %xmm7 +pxor %xmm7, %xmm13 +movdqa %xmm14, %xmm7 +pslld $ 8, %xmm14 +psrld $24, %xmm7 +pxor %xmm7, %xmm14 +paddd %xmm13, %xmm8 +paddd %xmm14, %xmm9 +movdqa %xmm15, 112(%rsp) +pxor %xmm10, %xmm5 +pxor %xmm11, %xmm6 +movdqa 96(%rsp), %xmm7 +movdqa %xmm5, %xmm15 +pslld $ 7, %xmm5 +psrld $25, %xmm15 +pxor %xmm15, %xmm5 +movdqa %xmm6, %xmm15 +pslld $ 7, %xmm6 +psrld $25, %xmm15 +pxor %xmm15, %xmm6 +pxor %xmm8, %xmm7 +pxor %xmm9, %xmm4 +movdqa %xmm7, %xmm15 +pslld $ 7, %xmm7 +psrld $25, %xmm15 +pxor %xmm15, %xmm7 +movdqa %xmm4, %xmm15 +pslld $ 7, %xmm4 +psrld $25, %xmm15 +pxor %xmm15, %xmm4 +movdqa 112(%rsp), %xmm15 +subq $2, %rax +jnz chacha_blocks_sse2_mainloop1 +paddd 128(%rsp), %xmm0 +paddd 144(%rsp), %xmm1 +paddd 160(%rsp), %xmm2 +paddd 176(%rsp), %xmm3 +paddd 192(%rsp), %xmm4 +paddd 208(%rsp), %xmm5 +paddd 224(%rsp), %xmm6 +paddd 240(%rsp), %xmm7 +paddd 256(%rsp), %xmm8 +paddd 272(%rsp), %xmm9 +paddd 288(%rsp), %xmm10 +paddd 304(%rsp), %xmm11 +paddd 320(%rsp), %xmm12 +paddd 336(%rsp), %xmm13 +paddd 352(%rsp), %xmm14 +paddd 368(%rsp), %xmm15 +movdqa %xmm8, 384(%rsp) +movdqa %xmm9, 400(%rsp) +movdqa %xmm10, 416(%rsp) +movdqa %xmm11, 432(%rsp) +movdqa %xmm12, 448(%rsp) +movdqa %xmm13, 464(%rsp) +movdqa %xmm14, 480(%rsp) +movdqa %xmm15, 496(%rsp) +movdqa %xmm0, %xmm8 +movdqa %xmm2, %xmm9 +movdqa %xmm4, %xmm10 +movdqa %xmm6, %xmm11 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +punpckhdq %xmm5, %xmm4 +punpckhdq %xmm7, %xmm6 +punpckldq %xmm1, %xmm8 +punpckldq %xmm3, %xmm9 +punpckldq %xmm5, %xmm10 +punpckldq %xmm7, %xmm11 +movdqa %xmm0, %xmm1 +movdqa %xmm4, %xmm3 +movdqa %xmm8, %xmm5 +movdqa %xmm10, %xmm7 +punpckhqdq %xmm2, %xmm0 +punpckhqdq %xmm6, %xmm4 +punpckhqdq %xmm9, %xmm8 +punpckhqdq %xmm11, %xmm10 +punpcklqdq %xmm2, %xmm1 +punpcklqdq %xmm6, %xmm3 +punpcklqdq %xmm9, %xmm5 +punpcklqdq %xmm11, %xmm7 +andq %rsi, %rsi +jz chacha_blocks_sse2_noinput1 +movdqu 0(%rsi), %xmm2 +movdqu 16(%rsi), %xmm6 +movdqu 64(%rsi), %xmm9 +movdqu 80(%rsi), %xmm11 +movdqu 128(%rsi), %xmm12 +movdqu 144(%rsi), %xmm13 +movdqu 192(%rsi), %xmm14 +movdqu 208(%rsi), %xmm15 +pxor %xmm2, %xmm5 +pxor %xmm6, %xmm7 +pxor %xmm9, %xmm8 +pxor %xmm11, %xmm10 
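/* After the round loop, the paddd block above adds back the saved input
   state (the ChaCha feed-forward).  The punpckldq/punpckhdq followed by
   punpcklqdq/punpckhqdq sequences are a 4x4 matrix transpose: the loop
   leaves each register holding one state word across four blocks, and the
   transpose regroups them so each register again holds 16 contiguous bytes
   of one block's keystream, ready to be XORed with the input, or stored
   directly when the input pointer (%rsi) is NULL. */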
+pxor %xmm12, %xmm1 +pxor %xmm13, %xmm3 +pxor %xmm14, %xmm0 +pxor %xmm15, %xmm4 +movdqu %xmm5, 0(%rdx) +movdqu %xmm7, 16(%rdx) +movdqu %xmm8, 64(%rdx) +movdqu %xmm10, 80(%rdx) +movdqu %xmm1, 128(%rdx) +movdqu %xmm3, 144(%rdx) +movdqu %xmm0, 192(%rdx) +movdqu %xmm4, 208(%rdx) +movdqa 384(%rsp), %xmm0 +movdqa 400(%rsp), %xmm1 +movdqa 416(%rsp), %xmm2 +movdqa 432(%rsp), %xmm3 +movdqa 448(%rsp), %xmm4 +movdqa 464(%rsp), %xmm5 +movdqa 480(%rsp), %xmm6 +movdqa 496(%rsp), %xmm7 +movdqa %xmm0, %xmm8 +movdqa %xmm2, %xmm9 +movdqa %xmm4, %xmm10 +movdqa %xmm6, %xmm11 +punpckldq %xmm1, %xmm8 +punpckldq %xmm3, %xmm9 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +punpckldq %xmm5, %xmm10 +punpckldq %xmm7, %xmm11 +punpckhdq %xmm5, %xmm4 +punpckhdq %xmm7, %xmm6 +movdqa %xmm8, %xmm1 +movdqa %xmm0, %xmm3 +movdqa %xmm10, %xmm5 +movdqa %xmm4, %xmm7 +punpcklqdq %xmm9, %xmm1 +punpcklqdq %xmm11, %xmm5 +punpckhqdq %xmm9, %xmm8 +punpckhqdq %xmm11, %xmm10 +punpcklqdq %xmm2, %xmm3 +punpcklqdq %xmm6, %xmm7 +punpckhqdq %xmm2, %xmm0 +punpckhqdq %xmm6, %xmm4 +movdqu 32(%rsi), %xmm2 +movdqu 48(%rsi), %xmm6 +movdqu 96(%rsi), %xmm9 +movdqu 112(%rsi), %xmm11 +movdqu 160(%rsi), %xmm12 +movdqu 176(%rsi), %xmm13 +movdqu 224(%rsi), %xmm14 +movdqu 240(%rsi), %xmm15 +pxor %xmm2, %xmm1 +pxor %xmm6, %xmm5 +pxor %xmm9, %xmm8 +pxor %xmm11, %xmm10 +pxor %xmm12, %xmm3 +pxor %xmm13, %xmm7 +pxor %xmm14, %xmm0 +pxor %xmm15, %xmm4 +movdqu %xmm1, 32(%rdx) +movdqu %xmm5, 48(%rdx) +movdqu %xmm8, 96(%rdx) +movdqu %xmm10, 112(%rdx) +movdqu %xmm3, 160(%rdx) +movdqu %xmm7, 176(%rdx) +movdqu %xmm0, 224(%rdx) +movdqu %xmm4, 240(%rdx) +addq $256, %rsi +jmp chacha_blocks_sse2_mainloop_cont +chacha_blocks_sse2_noinput1: +movdqu %xmm5, 0(%rdx) +movdqu %xmm7, 16(%rdx) +movdqu %xmm8, 64(%rdx) +movdqu %xmm10, 80(%rdx) +movdqu %xmm1, 128(%rdx) +movdqu %xmm3, 144(%rdx) +movdqu %xmm0, 192(%rdx) +movdqu %xmm4, 208(%rdx) +movdqa 384(%rsp), %xmm0 +movdqa 400(%rsp), %xmm1 +movdqa 416(%rsp), %xmm2 +movdqa 432(%rsp), %xmm3 +movdqa 448(%rsp), %xmm4 +movdqa 464(%rsp), %xmm5 +movdqa 480(%rsp), %xmm6 +movdqa 496(%rsp), %xmm7 +movdqa %xmm0, %xmm8 +movdqa %xmm2, %xmm9 +movdqa %xmm4, %xmm10 +movdqa %xmm6, %xmm11 +punpckldq %xmm1, %xmm8 +punpckldq %xmm3, %xmm9 +punpckhdq %xmm1, %xmm0 +punpckhdq %xmm3, %xmm2 +punpckldq %xmm5, %xmm10 +punpckldq %xmm7, %xmm11 +punpckhdq %xmm5, %xmm4 +punpckhdq %xmm7, %xmm6 +movdqa %xmm8, %xmm1 +movdqa %xmm0, %xmm3 +movdqa %xmm10, %xmm5 +movdqa %xmm4, %xmm7 +punpcklqdq %xmm9, %xmm1 +punpcklqdq %xmm11, %xmm5 +punpckhqdq %xmm9, %xmm8 +punpckhqdq %xmm11, %xmm10 +punpcklqdq %xmm2, %xmm3 +punpcklqdq %xmm6, %xmm7 +punpckhqdq %xmm2, %xmm0 +punpckhqdq %xmm6, %xmm4 +movdqu %xmm1, 32(%rdx) +movdqu %xmm5, 48(%rdx) +movdqu %xmm8, 96(%rdx) +movdqu %xmm10, 112(%rdx) +movdqu %xmm3, 160(%rdx) +movdqu %xmm7, 176(%rdx) +movdqu %xmm0, 224(%rdx) +movdqu %xmm4, 240(%rdx) +chacha_blocks_sse2_mainloop_cont: +addq $256, %rdx +subq $256, %rcx +cmp $256, %rcx +jae chacha_blocks_sse2_atleast256 +movdqa 0(%rsp), %xmm8 +movdqa 16(%rsp), %xmm9 +movdqa 32(%rsp), %xmm10 +movdqa 48(%rsp), %xmm11 +movq $1, %r9 +chacha_blocks_sse2_below256: +movq %r9, %xmm5 +andq %rcx, %rcx +jz chacha_blocks_sse2_done +cmpq $64, %rcx +jae chacha_blocks_sse2_above63 +movq %rdx, %r9 +andq %rsi, %rsi +jz chacha_blocks_sse2_noinput2 +movq %rcx, %r10 +movq %rsp, %rdx +addq %r10, %rsi +addq %r10, %rdx +negq %r10 +chacha_blocks_sse2_copyinput: +movb (%rsi, %r10), %al +movb %al, (%rdx, %r10) +incq %r10 +jnz chacha_blocks_sse2_copyinput +movq %rsp, %rsi +chacha_blocks_sse2_noinput2: +movq %rsp, %rdx 
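/* Fallback path for the final 1..255 bytes: blocks are generated one at a
   time in chacha_blocks_sse2_mainloop2.  An input shorter than 64 bytes is
   first copied into the 64-byte-aligned stack scratch area (the frame was
   aligned with andq $~63, %rsp), encrypted as a full block there, and
   chacha_blocks_sse2_copyoutput later writes back only the bytes that were
   actually requested. */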
+chacha_blocks_sse2_above63: +movdqa %xmm8, %xmm0 +movdqa %xmm9, %xmm1 +movdqa %xmm10, %xmm2 +movdqa %xmm11, %xmm3 +movq 64(%rsp), %rax +chacha_blocks_sse2_mainloop2: +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm3,%xmm3 +paddd %xmm3, %xmm2 +pxor %xmm2, %xmm1 +movdqa %xmm1,%xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +pxor %xmm4, %xmm1 +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +movdqa %xmm3,%xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +pshufd $0x93,%xmm0,%xmm0 +pxor %xmm4, %xmm3 +paddd %xmm3, %xmm2 +pshufd $0x4e,%xmm3,%xmm3 +pxor %xmm2, %xmm1 +pshufd $0x39,%xmm2,%xmm2 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +pxor %xmm4, %xmm1 +subq $2, %rax +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm3,%xmm3 +paddd %xmm3, %xmm2 +pxor %xmm2, %xmm1 +movdqa %xmm1,%xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +pxor %xmm4, %xmm1 +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +movdqa %xmm3,%xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +pshufd $0x39,%xmm0,%xmm0 +pxor %xmm4, %xmm3 +paddd %xmm3, %xmm2 +pshufd $0x4e,%xmm3,%xmm3 +pxor %xmm2, %xmm1 +pshufd $0x93,%xmm2,%xmm2 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +pxor %xmm4, %xmm1 +jnz chacha_blocks_sse2_mainloop2 +paddd %xmm8, %xmm0 +paddd %xmm9, %xmm1 +paddd %xmm10, %xmm2 +paddd %xmm11, %xmm3 +andq %rsi, %rsi +jz chacha_blocks_sse2_noinput3 +movdqu 0(%rsi), %xmm12 +movdqu 16(%rsi), %xmm13 +movdqu 32(%rsi), %xmm14 +movdqu 48(%rsi), %xmm15 +pxor %xmm12, %xmm0 +pxor %xmm13, %xmm1 +pxor %xmm14, %xmm2 +pxor %xmm15, %xmm3 +addq $64, %rsi +chacha_blocks_sse2_noinput3: +movdqu %xmm0, 0(%rdx) +movdqu %xmm1, 16(%rdx) +movdqu %xmm2, 32(%rdx) +movdqu %xmm3, 48(%rdx) +paddq %xmm5, %xmm11 +cmpq $64, %rcx +jbe chacha_blocks_sse2_mainloop2_finishup +addq $64, %rdx +subq $64, %rcx +jmp chacha_blocks_sse2_below256 +chacha_blocks_sse2_mainloop2_finishup: +cmpq $64, %rcx +je chacha_blocks_sse2_done +addq %rcx, %r9 +addq %rcx, %rdx +negq %rcx +chacha_blocks_sse2_copyoutput: +movb (%rdx, %rcx), %al +movb %al, (%r9, %rcx) +incq %rcx +jnz chacha_blocks_sse2_copyoutput +chacha_blocks_sse2_done: +movdqu %xmm11, 32(%rdi) +movq %rbp, %rsp +popq %rbp +popq %rbx +ret +FN_END chacha_blocks_sse2 + +GLOBAL_HIDDEN_FN hchacha_sse2 +hchacha_sse2_local: +movq $0x3320646e61707865, %rax +movq $0x6b20657479622d32, %r8 +movd %rax, %xmm0 +movd %r8, %xmm4 +punpcklqdq %xmm4, %xmm0 +movdqu 0(%rdi), %xmm1 +movdqu 16(%rdi), %xmm2 +movdqu 0(%rsi), %xmm3 +hchacha_sse2_mainloop: +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm3,%xmm3 +paddd %xmm3, %xmm2 +pxor %xmm2, %xmm1 +movdqa %xmm1,%xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +pxor %xmm4, %xmm1 +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +movdqa %xmm3,%xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +pshufd $0x93,%xmm0,%xmm0 +pxor %xmm4, %xmm3 +paddd %xmm3, %xmm2 +pshufd $0x4e,%xmm3,%xmm3 +pxor %xmm2, %xmm1 +pshufd $0x39,%xmm2,%xmm2 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +pxor %xmm4, %xmm1 +subq $2, %rcx +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +pshuflw $0xb1,%xmm3,%xmm3 +pshufhw $0xb1,%xmm3,%xmm3 +paddd %xmm3, %xmm2 +pxor %xmm2, %xmm1 +movdqa %xmm1,%xmm4 +pslld $12, %xmm1 +psrld $20, %xmm4 +pxor %xmm4, %xmm1 +paddd %xmm1, %xmm0 +pxor %xmm0, %xmm3 +movdqa %xmm3,%xmm4 +pslld $8, %xmm3 +psrld $24, %xmm4 +pshufd $0x39,%xmm0,%xmm0 +pxor %xmm4, %xmm3 +paddd %xmm3, %xmm2 +pshufd $0x4e,%xmm3,%xmm3 +pxor %xmm2, %xmm1 +pshufd $0x93,%xmm2,%xmm2 +movdqa %xmm1,%xmm4 +pslld $7, %xmm1 +psrld $25, %xmm4 +pxor %xmm4, %xmm1 +ja hchacha_sse2_mainloop +movdqu 
%xmm0, 0(%rdx)
+movdqu %xmm3, 16(%rdx)
+ret
+FN_END hchacha_sse2
+
+GLOBAL_HIDDEN_FN_EXT chacha_sse2, 6, 16
+pushq %rbp
+movq %rsp, %rbp
+subq $64, %rsp
+andq $~63, %rsp
+movdqu 0(%rdi), %xmm0
+movdqu 16(%rdi), %xmm1
+movdqa %xmm0, 0(%rsp)
+movdqa %xmm1, 16(%rsp)
+xorq %rdi, %rdi
+movq %rdi, 32(%rsp)
+movq 0(%rsi), %rsi
+movq %rsi, 40(%rsp)
+movq %r9, 48(%rsp)
+movq %rsp, %rdi
+movq %rdx, %rsi
+movq %rcx, %rdx
+movq %r8, %rcx
+call chacha_blocks_sse2_local
+pxor %xmm0, %xmm0
+movdqa %xmm0, 0(%rsp)
+movdqa %xmm0, 16(%rsp)
+movdqa %xmm0, 32(%rsp)
+movq %rbp, %rsp
+popq %rbp
+ret
+FN_END chacha_sse2
+
+GLOBAL_HIDDEN_FN_EXT xchacha_sse2, 6, 16
+pushq %rbp
+pushq %rbx
+movq %rsp, %rbp
+subq $64, %rsp
+andq $~63, %rsp
+movq %rsp, %rbx
+xorq %rax, %rax
+movq %rax, 32(%rbx)
+movq 16(%rsi), %rax
+movq %rax, 40(%rbx)
+movq %r9, 48(%rbx)
+pushq %rdx
+pushq %rcx
+pushq %r8
+movq %rbx, %rdx
+movq %r9, %rcx
+call hchacha_sse2_local
+movq %rbx, %rdi
+popq %rcx
+popq %rdx
+popq %rsi
+call chacha_blocks_sse2_local
+pxor %xmm0, %xmm0
+movdqa %xmm0, 0(%rbx)
+movdqa %xmm0, 16(%rbx)
+movdqa %xmm0, 32(%rbx)
+movq %rbp, %rsp
+popq %rbx
+popq %rbp
+ret
+FN_END xchacha_sse2
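Editor's note: the immediates 0x3320646e61707865 and 0x6b20657479622d32 loaded by chacha_blocks_sse2 and hchacha_sse2 are just the sigma string "expand 32-byte k" read as two little-endian 64-bit words, matching the .long values in constants.S. A small standalone check (not part of the diff):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	const unsigned char sigma[16] = "expand 32-byte k";
	unsigned long long lo, hi;

	memcpy(&lo, sigma, 8);     /* "expand 3" */
	memcpy(&hi, sigma + 8, 8); /* "2-byte k" */
	/* prints 3320646e61707865 6b20657479622d32 on little-endian hosts */
	printf("%016llx %016llx\n", lo, hi);
	return 0;
}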