path: root/src/libcryptobox/chacha20
Diffstat
-rw-r--r--  src/libcryptobox/chacha20/avx.S         614
-rw-r--r--  src/libcryptobox/chacha20/avx2.S       1018
-rw-r--r--  src/libcryptobox/chacha20/chacha.c      262
-rw-r--r--  src/libcryptobox/chacha20/chacha.h       87
-rw-r--r--  src/libcryptobox/chacha20/constants.S     6
-rw-r--r--  src/libcryptobox/chacha20/ref.c         272
-rw-r--r--  src/libcryptobox/chacha20/sse2.S        734
7 files changed, 2993 insertions, 0 deletions
diff --git a/src/libcryptobox/chacha20/avx.S b/src/libcryptobox/chacha20/avx.S
new file mode 100644
index 0000000..7689b84
--- /dev/null
+++ b/src/libcryptobox/chacha20/avx.S
@@ -0,0 +1,614 @@
+#include "../macro.S"
+#include "constants.S"
+SECTION_TEXT
+
+GLOBAL_HIDDEN_FN chacha_blocks_avx
+chacha_blocks_avx_local:
+pushq %rbx
+pushq %rbp
+movq %rsp, %rbp
+andq $~63, %rsp
+subq $512, %rsp
+LOAD_VAR_PIC chacha_constants, %rax
+vmovdqa 0(%rax), %xmm8
+vmovdqa 16(%rax), %xmm6
+vmovdqa 32(%rax), %xmm7
+vmovdqu 0(%rdi), %xmm9
+vmovdqu 16(%rdi), %xmm10
+vmovdqu 32(%rdi), %xmm11
+movq 48(%rdi), %rax
+movq $1, %r9
+vmovdqa %xmm8, 0(%rsp)
+vmovdqa %xmm9, 16(%rsp)
+vmovdqa %xmm10, 32(%rsp)
+vmovdqa %xmm11, 48(%rsp)
+vmovdqa %xmm6, 80(%rsp)
+vmovdqa %xmm7, 96(%rsp)
+movq %rax, 64(%rsp)
+cmpq $256, %rcx
+jb chacha_blocks_avx_below256
+vpshufd $0x00, %xmm8, %xmm0
+vpshufd $0x55, %xmm8, %xmm1
+vpshufd $0xaa, %xmm8, %xmm2
+vpshufd $0xff, %xmm8, %xmm3
+vmovdqa %xmm0, 128(%rsp)
+vmovdqa %xmm1, 144(%rsp)
+vmovdqa %xmm2, 160(%rsp)
+vmovdqa %xmm3, 176(%rsp)
+vpshufd $0x00, %xmm9, %xmm0
+vpshufd $0x55, %xmm9, %xmm1
+vpshufd $0xaa, %xmm9, %xmm2
+vpshufd $0xff, %xmm9, %xmm3
+vmovdqa %xmm0, 192(%rsp)
+vmovdqa %xmm1, 208(%rsp)
+vmovdqa %xmm2, 224(%rsp)
+vmovdqa %xmm3, 240(%rsp)
+vpshufd $0x00, %xmm10, %xmm0
+vpshufd $0x55, %xmm10, %xmm1
+vpshufd $0xaa, %xmm10, %xmm2
+vpshufd $0xff, %xmm10, %xmm3
+vmovdqa %xmm0, 256(%rsp)
+vmovdqa %xmm1, 272(%rsp)
+vmovdqa %xmm2, 288(%rsp)
+vmovdqa %xmm3, 304(%rsp)
+vpshufd $0xaa, %xmm11, %xmm0
+vpshufd $0xff, %xmm11, %xmm1
+vmovdqa %xmm0, 352(%rsp)
+vmovdqa %xmm1, 368(%rsp)
+jmp chacha_blocks_avx_atleast256
+.p2align 6,,63
+nop
+nop
+nop
+nop
+nop
+chacha_blocks_avx_atleast256:
+movq 48(%rsp), %rax
+leaq 1(%rax), %r8
+leaq 2(%rax), %r9
+leaq 3(%rax), %r10
+leaq 4(%rax), %rbx
+movl %eax, 320(%rsp)
+movl %r8d, 4+320(%rsp)
+movl %r9d, 8+320(%rsp)
+movl %r10d, 12+320(%rsp)
+shrq $32, %rax
+shrq $32, %r8
+shrq $32, %r9
+shrq $32, %r10
+movl %eax, 336(%rsp)
+movl %r8d, 4+336(%rsp)
+movl %r9d, 8+336(%rsp)
+movl %r10d, 12+336(%rsp)
+movq %rbx, 48(%rsp)
+movq 64(%rsp), %rax
+vmovdqa 128(%rsp), %xmm0
+vmovdqa 144(%rsp), %xmm1
+vmovdqa 160(%rsp), %xmm2
+vmovdqa 176(%rsp), %xmm3
+vmovdqa 192(%rsp), %xmm4
+vmovdqa 208(%rsp), %xmm5
+vmovdqa 224(%rsp), %xmm6
+vmovdqa 240(%rsp), %xmm7
+vmovdqa 256(%rsp), %xmm8
+vmovdqa 272(%rsp), %xmm9
+vmovdqa 288(%rsp), %xmm10
+vmovdqa 304(%rsp), %xmm11
+vmovdqa 320(%rsp), %xmm12
+vmovdqa 336(%rsp), %xmm13
+vmovdqa 352(%rsp), %xmm14
+vmovdqa 368(%rsp), %xmm15
+chacha_blocks_avx_mainloop1:
+vpaddd %xmm0, %xmm4, %xmm0
+vpaddd %xmm1, %xmm5, %xmm1
+vpxor %xmm12, %xmm0, %xmm12
+vpxor %xmm13, %xmm1, %xmm13
+vpaddd %xmm2, %xmm6, %xmm2
+vpaddd %xmm3, %xmm7, %xmm3
+vpxor %xmm14, %xmm2, %xmm14
+vpxor %xmm15, %xmm3, %xmm15
+vpshufb 80(%rsp), %xmm12, %xmm12
+vpshufb 80(%rsp), %xmm13, %xmm13
+vpaddd %xmm8, %xmm12, %xmm8
+vpaddd %xmm9, %xmm13, %xmm9
+vpshufb 80(%rsp), %xmm14, %xmm14
+vpshufb 80(%rsp), %xmm15, %xmm15
+vpaddd %xmm10, %xmm14, %xmm10
+vpaddd %xmm11, %xmm15, %xmm11
+vmovdqa %xmm12, 112(%rsp)
+vpxor %xmm4, %xmm8, %xmm4
+vpxor %xmm5, %xmm9, %xmm5
+vpslld $ 12, %xmm4, %xmm12
+vpsrld $20, %xmm4, %xmm4
+vpxor %xmm4, %xmm12, %xmm4
+vpslld $ 12, %xmm5, %xmm12
+vpsrld $20, %xmm5, %xmm5
+vpxor %xmm5, %xmm12, %xmm5
+vpxor %xmm6, %xmm10, %xmm6
+vpxor %xmm7, %xmm11, %xmm7
+vpslld $ 12, %xmm6, %xmm12
+vpsrld $20, %xmm6, %xmm6
+vpxor %xmm6, %xmm12, %xmm6
+vpslld $ 12, %xmm7, %xmm12
+vpsrld $20, %xmm7, %xmm7
+vpxor %xmm7, %xmm12, %xmm7
+vpaddd %xmm0, %xmm4, %xmm0
+vpaddd %xmm1, %xmm5, %xmm1
+vpxor 112(%rsp), %xmm0, %xmm12
+vpxor %xmm13, %xmm1, %xmm13
+vpaddd %xmm2, %xmm6, %xmm2
+vpaddd %xmm3, %xmm7, %xmm3
+vpxor %xmm14, %xmm2, %xmm14
+vpxor %xmm15, %xmm3, %xmm15
+vpshufb 96(%rsp), %xmm12, %xmm12
+vpshufb 96(%rsp), %xmm13, %xmm13
+vpaddd %xmm8, %xmm12, %xmm8
+vpaddd %xmm9, %xmm13, %xmm9
+vpshufb 96(%rsp), %xmm14, %xmm14
+vpshufb 96(%rsp), %xmm15, %xmm15
+vpaddd %xmm10, %xmm14, %xmm10
+vpaddd %xmm11, %xmm15, %xmm11
+vmovdqa %xmm12, 112(%rsp)
+vpxor %xmm4, %xmm8, %xmm4
+vpxor %xmm5, %xmm9, %xmm5
+vpslld $ 7, %xmm4, %xmm12
+vpsrld $25, %xmm4, %xmm4
+vpxor %xmm4, %xmm12, %xmm4
+vpslld $ 7, %xmm5, %xmm12
+vpsrld $25, %xmm5, %xmm5
+vpxor %xmm5, %xmm12, %xmm5
+vpxor %xmm6, %xmm10, %xmm6
+vpxor %xmm7, %xmm11, %xmm7
+vpslld $ 7, %xmm6, %xmm12
+vpsrld $25, %xmm6, %xmm6
+vpxor %xmm6, %xmm12, %xmm6
+vpslld $ 7, %xmm7, %xmm12
+vpsrld $25, %xmm7, %xmm7
+vpxor %xmm7, %xmm12, %xmm7
+vpaddd %xmm0, %xmm5, %xmm0
+vpaddd %xmm1, %xmm6, %xmm1
+vpxor %xmm15, %xmm0, %xmm15
+vpxor 112(%rsp), %xmm1, %xmm12
+vpaddd %xmm2, %xmm7, %xmm2
+vpaddd %xmm3, %xmm4, %xmm3
+vpxor %xmm13, %xmm2, %xmm13
+vpxor %xmm14, %xmm3, %xmm14
+vpshufb 80(%rsp), %xmm15, %xmm15
+vpshufb 80(%rsp), %xmm12, %xmm12
+vpaddd %xmm10, %xmm15, %xmm10
+vpaddd %xmm11, %xmm12, %xmm11
+vpshufb 80(%rsp), %xmm13, %xmm13
+vpshufb 80(%rsp), %xmm14, %xmm14
+vpaddd %xmm8, %xmm13, %xmm8
+vpaddd %xmm9, %xmm14, %xmm9
+vmovdqa %xmm15, 112(%rsp)
+vpxor %xmm5, %xmm10, %xmm5
+vpxor %xmm6, %xmm11, %xmm6
+vpslld $ 12, %xmm5, %xmm15
+vpsrld $20, %xmm5, %xmm5
+vpxor %xmm5, %xmm15, %xmm5
+vpslld $ 12, %xmm6, %xmm15
+vpsrld $20, %xmm6, %xmm6
+vpxor %xmm6, %xmm15, %xmm6
+vpxor %xmm7, %xmm8, %xmm7
+vpxor %xmm4, %xmm9, %xmm4
+vpslld $ 12, %xmm7, %xmm15
+vpsrld $20, %xmm7, %xmm7
+vpxor %xmm7, %xmm15, %xmm7
+vpslld $ 12, %xmm4, %xmm15
+vpsrld $20, %xmm4, %xmm4
+vpxor %xmm4, %xmm15, %xmm4
+vpaddd %xmm0, %xmm5, %xmm0
+vpaddd %xmm1, %xmm6, %xmm1
+vpxor 112(%rsp), %xmm0, %xmm15
+vpxor %xmm12, %xmm1, %xmm12
+vpaddd %xmm2, %xmm7, %xmm2
+vpaddd %xmm3, %xmm4, %xmm3
+vpxor %xmm13, %xmm2, %xmm13
+vpxor %xmm14, %xmm3, %xmm14
+vpshufb 96(%rsp), %xmm15, %xmm15
+vpshufb 96(%rsp), %xmm12, %xmm12
+vpaddd %xmm10, %xmm15, %xmm10
+vpaddd %xmm11, %xmm12, %xmm11
+vpshufb 96(%rsp), %xmm13, %xmm13
+vpshufb 96(%rsp), %xmm14, %xmm14
+vpaddd %xmm8, %xmm13, %xmm8
+vpaddd %xmm9, %xmm14, %xmm9
+vmovdqa %xmm15, 112(%rsp)
+vpxor %xmm5, %xmm10, %xmm5
+vpxor %xmm6, %xmm11, %xmm6
+vpslld $ 7, %xmm5, %xmm15
+vpsrld $25, %xmm5, %xmm5
+vpxor %xmm5, %xmm15, %xmm5
+vpslld $ 7, %xmm6, %xmm15
+vpsrld $25, %xmm6, %xmm6
+vpxor %xmm6, %xmm15, %xmm6
+vpxor %xmm7, %xmm8, %xmm7
+vpxor %xmm4, %xmm9, %xmm4
+vpslld $ 7, %xmm7, %xmm15
+vpsrld $25, %xmm7, %xmm7
+vpxor %xmm7, %xmm15, %xmm7
+vpslld $ 7, %xmm4, %xmm15
+vpsrld $25, %xmm4, %xmm4
+vpxor %xmm4, %xmm15, %xmm4
+vmovdqa 112(%rsp), %xmm15
+subq $2, %rax
+jnz chacha_blocks_avx_mainloop1
+vpaddd 128(%rsp), %xmm0, %xmm0
+vpaddd 144(%rsp), %xmm1, %xmm1
+vpaddd 160(%rsp), %xmm2, %xmm2
+vpaddd 176(%rsp), %xmm3, %xmm3
+vpaddd 192(%rsp), %xmm4, %xmm4
+vpaddd 208(%rsp), %xmm5, %xmm5
+vpaddd 224(%rsp), %xmm6, %xmm6
+vpaddd 240(%rsp), %xmm7, %xmm7
+vpaddd 256(%rsp), %xmm8, %xmm8
+vpaddd 272(%rsp), %xmm9, %xmm9
+vpaddd 288(%rsp), %xmm10, %xmm10
+vpaddd 304(%rsp), %xmm11, %xmm11
+vpaddd 320(%rsp), %xmm12, %xmm12
+vpaddd 336(%rsp), %xmm13, %xmm13
+vpaddd 352(%rsp), %xmm14, %xmm14
+vpaddd 368(%rsp), %xmm15, %xmm15
+vmovdqa %xmm8, 384(%rsp)
+vmovdqa %xmm9, 400(%rsp)
+vmovdqa %xmm10, 416(%rsp)
+vmovdqa %xmm11, 432(%rsp)
+vmovdqa %xmm12, 448(%rsp)
+vmovdqa %xmm13, 464(%rsp)
+vmovdqa %xmm14, 480(%rsp)
+vmovdqa %xmm15, 496(%rsp)
+vpunpckldq %xmm1, %xmm0, %xmm8
+vpunpckldq %xmm3, %xmm2, %xmm9
+vpunpckhdq %xmm1, %xmm0, %xmm12
+vpunpckhdq %xmm3, %xmm2, %xmm13
+vpunpckldq %xmm5, %xmm4, %xmm10
+vpunpckldq %xmm7, %xmm6, %xmm11
+vpunpckhdq %xmm5, %xmm4, %xmm14
+vpunpckhdq %xmm7, %xmm6, %xmm15
+vpunpcklqdq %xmm9, %xmm8, %xmm0
+vpunpcklqdq %xmm11, %xmm10, %xmm1
+vpunpckhqdq %xmm9, %xmm8, %xmm2
+vpunpckhqdq %xmm11, %xmm10, %xmm3
+vpunpcklqdq %xmm13, %xmm12, %xmm4
+vpunpcklqdq %xmm15, %xmm14, %xmm5
+vpunpckhqdq %xmm13, %xmm12, %xmm6
+vpunpckhqdq %xmm15, %xmm14, %xmm7
+andq %rsi, %rsi
+jz chacha_blocks_avx_noinput1
+vpxor 0(%rsi), %xmm0, %xmm0
+vpxor 16(%rsi), %xmm1, %xmm1
+vpxor 64(%rsi), %xmm2, %xmm2
+vpxor 80(%rsi), %xmm3, %xmm3
+vpxor 128(%rsi), %xmm4, %xmm4
+vpxor 144(%rsi), %xmm5, %xmm5
+vpxor 192(%rsi), %xmm6, %xmm6
+vpxor 208(%rsi), %xmm7, %xmm7
+vmovdqu %xmm0, 0(%rdx)
+vmovdqu %xmm1, 16(%rdx)
+vmovdqu %xmm2, 64(%rdx)
+vmovdqu %xmm3, 80(%rdx)
+vmovdqu %xmm4, 128(%rdx)
+vmovdqu %xmm5, 144(%rdx)
+vmovdqu %xmm6, 192(%rdx)
+vmovdqu %xmm7, 208(%rdx)
+vmovdqa 384(%rsp), %xmm0
+vmovdqa 400(%rsp), %xmm1
+vmovdqa 416(%rsp), %xmm2
+vmovdqa 432(%rsp), %xmm3
+vmovdqa 448(%rsp), %xmm4
+vmovdqa 464(%rsp), %xmm5
+vmovdqa 480(%rsp), %xmm6
+vmovdqa 496(%rsp), %xmm7
+vpunpckldq %xmm1, %xmm0, %xmm8
+vpunpckldq %xmm3, %xmm2, %xmm9
+vpunpckhdq %xmm1, %xmm0, %xmm12
+vpunpckhdq %xmm3, %xmm2, %xmm13
+vpunpckldq %xmm5, %xmm4, %xmm10
+vpunpckldq %xmm7, %xmm6, %xmm11
+vpunpckhdq %xmm5, %xmm4, %xmm14
+vpunpckhdq %xmm7, %xmm6, %xmm15
+vpunpcklqdq %xmm9, %xmm8, %xmm0
+vpunpcklqdq %xmm11, %xmm10, %xmm1
+vpunpckhqdq %xmm9, %xmm8, %xmm2
+vpunpckhqdq %xmm11, %xmm10, %xmm3
+vpunpcklqdq %xmm13, %xmm12, %xmm4
+vpunpcklqdq %xmm15, %xmm14, %xmm5
+vpunpckhqdq %xmm13, %xmm12, %xmm6
+vpunpckhqdq %xmm15, %xmm14, %xmm7
+vpxor 32(%rsi), %xmm0, %xmm0
+vpxor 48(%rsi), %xmm1, %xmm1
+vpxor 96(%rsi), %xmm2, %xmm2
+vpxor 112(%rsi), %xmm3, %xmm3
+vpxor 160(%rsi), %xmm4, %xmm4
+vpxor 176(%rsi), %xmm5, %xmm5
+vpxor 224(%rsi), %xmm6, %xmm6
+vpxor 240(%rsi), %xmm7, %xmm7
+vmovdqu %xmm0, 32(%rdx)
+vmovdqu %xmm1, 48(%rdx)
+vmovdqu %xmm2, 96(%rdx)
+vmovdqu %xmm3, 112(%rdx)
+vmovdqu %xmm4, 160(%rdx)
+vmovdqu %xmm5, 176(%rdx)
+vmovdqu %xmm6, 224(%rdx)
+vmovdqu %xmm7, 240(%rdx)
+addq $256, %rsi
+jmp chacha_blocks_avx_mainloop_cont
+chacha_blocks_avx_noinput1:
+vmovdqu %xmm0, 0(%rdx)
+vmovdqu %xmm1, 16(%rdx)
+vmovdqu %xmm2, 64(%rdx)
+vmovdqu %xmm3, 80(%rdx)
+vmovdqu %xmm4, 128(%rdx)
+vmovdqu %xmm5, 144(%rdx)
+vmovdqu %xmm6, 192(%rdx)
+vmovdqu %xmm7, 208(%rdx)
+vmovdqa 384(%rsp), %xmm0
+vmovdqa 400(%rsp), %xmm1
+vmovdqa 416(%rsp), %xmm2
+vmovdqa 432(%rsp), %xmm3
+vmovdqa 448(%rsp), %xmm4
+vmovdqa 464(%rsp), %xmm5
+vmovdqa 480(%rsp), %xmm6
+vmovdqa 496(%rsp), %xmm7
+vpunpckldq %xmm1, %xmm0, %xmm8
+vpunpckldq %xmm3, %xmm2, %xmm9
+vpunpckhdq %xmm1, %xmm0, %xmm12
+vpunpckhdq %xmm3, %xmm2, %xmm13
+vpunpckldq %xmm5, %xmm4, %xmm10
+vpunpckldq %xmm7, %xmm6, %xmm11
+vpunpckhdq %xmm5, %xmm4, %xmm14
+vpunpckhdq %xmm7, %xmm6, %xmm15
+vpunpcklqdq %xmm9, %xmm8, %xmm0
+vpunpcklqdq %xmm11, %xmm10, %xmm1
+vpunpckhqdq %xmm9, %xmm8, %xmm2
+vpunpckhqdq %xmm11, %xmm10, %xmm3
+vpunpcklqdq %xmm13, %xmm12, %xmm4
+vpunpcklqdq %xmm15, %xmm14, %xmm5
+vpunpckhqdq %xmm13, %xmm12, %xmm6
+vpunpckhqdq %xmm15, %xmm14, %xmm7
+vmovdqu %xmm0, 32(%rdx)
+vmovdqu %xmm1, 48(%rdx)
+vmovdqu %xmm2, 96(%rdx)
+vmovdqu %xmm3, 112(%rdx)
+vmovdqu %xmm4, 160(%rdx)
+vmovdqu %xmm5, 176(%rdx)
+vmovdqu %xmm6, 224(%rdx)
+vmovdqu %xmm7, 240(%rdx)
+chacha_blocks_avx_mainloop_cont:
+addq $256, %rdx
+subq $256, %rcx
+cmp $256, %rcx
+jae chacha_blocks_avx_atleast256
+vmovdqa 80(%rsp), %xmm6
+vmovdqa 96(%rsp), %xmm7
+vmovdqa 0(%rsp), %xmm8
+vmovdqa 16(%rsp), %xmm9
+vmovdqa 32(%rsp), %xmm10
+vmovdqa 48(%rsp), %xmm11
+movq $1, %r9
+chacha_blocks_avx_below256:
+vmovq %r9, %xmm5
+andq %rcx, %rcx
+jz chacha_blocks_avx_done
+cmpq $64, %rcx
+jae chacha_blocks_avx_above63
+movq %rdx, %r9
+andq %rsi, %rsi
+jz chacha_blocks_avx_noinput2
+movq %rcx, %r10
+movq %rsp, %rdx
+addq %r10, %rsi
+addq %r10, %rdx
+negq %r10
+chacha_blocks_avx_copyinput:
+movb (%rsi, %r10), %al
+movb %al, (%rdx, %r10)
+incq %r10
+jnz chacha_blocks_avx_copyinput
+movq %rsp, %rsi
+chacha_blocks_avx_noinput2:
+movq %rsp, %rdx
+chacha_blocks_avx_above63:
+vmovdqa %xmm8, %xmm0
+vmovdqa %xmm9, %xmm1
+vmovdqa %xmm10, %xmm2
+vmovdqa %xmm11, %xmm3
+movq 64(%rsp), %rax
+chacha_blocks_avx_mainloop2:
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm6, %xmm3, %xmm3
+vpaddd %xmm2, %xmm3, %xmm2
+vpxor %xmm1, %xmm2, %xmm1
+vpslld $12, %xmm1, %xmm4
+vpsrld $20, %xmm1, %xmm1
+vpxor %xmm1, %xmm4, %xmm1
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm7, %xmm3, %xmm3
+vpshufd $0x93, %xmm0, %xmm0
+vpaddd %xmm2, %xmm3, %xmm2
+vpshufd $0x4e, %xmm3, %xmm3
+vpxor %xmm1, %xmm2, %xmm1
+vpshufd $0x39, %xmm2, %xmm2
+vpslld $7, %xmm1, %xmm4
+vpsrld $25, %xmm1, %xmm1
+vpxor %xmm1, %xmm4, %xmm1
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm6, %xmm3, %xmm3
+vpaddd %xmm2, %xmm3, %xmm2
+vpxor %xmm1, %xmm2, %xmm1
+vpslld $12, %xmm1, %xmm4
+vpsrld $20, %xmm1, %xmm1
+vpxor %xmm1, %xmm4, %xmm1
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm7, %xmm3, %xmm3
+vpshufd $0x39, %xmm0, %xmm0
+vpaddd %xmm2, %xmm3, %xmm2
+vpshufd $0x4e, %xmm3, %xmm3
+vpxor %xmm1, %xmm2, %xmm1
+vpshufd $0x93, %xmm2, %xmm2
+vpslld $7, %xmm1, %xmm4
+vpsrld $25, %xmm1, %xmm1
+vpxor %xmm1, %xmm4, %xmm1
+subq $2, %rax
+jnz chacha_blocks_avx_mainloop2
+vpaddd %xmm0, %xmm8, %xmm0
+vpaddd %xmm1, %xmm9, %xmm1
+vpaddd %xmm2, %xmm10, %xmm2
+vpaddd %xmm3, %xmm11, %xmm3
+andq %rsi, %rsi
+jz chacha_blocks_avx_noinput3
+vpxor 0(%rsi), %xmm0, %xmm0
+vpxor 16(%rsi), %xmm1, %xmm1
+vpxor 32(%rsi), %xmm2, %xmm2
+vpxor 48(%rsi), %xmm3, %xmm3
+addq $64, %rsi
+chacha_blocks_avx_noinput3:
+vmovdqu %xmm0, 0(%rdx)
+vmovdqu %xmm1, 16(%rdx)
+vmovdqu %xmm2, 32(%rdx)
+vmovdqu %xmm3, 48(%rdx)
+vpaddq %xmm11, %xmm5, %xmm11
+cmpq $64, %rcx
+jbe chacha_blocks_avx_mainloop2_finishup
+addq $64, %rdx
+subq $64, %rcx
+jmp chacha_blocks_avx_below256
+chacha_blocks_avx_mainloop2_finishup:
+cmpq $64, %rcx
+je chacha_blocks_avx_done
+addq %rcx, %r9
+addq %rcx, %rdx
+negq %rcx
+chacha_blocks_avx_copyoutput:
+movb (%rdx, %rcx), %al
+movb %al, (%r9, %rcx)
+incq %rcx
+jnz chacha_blocks_avx_copyoutput
+chacha_blocks_avx_done:
+vmovdqu %xmm11, 32(%rdi)
+movq %rbp, %rsp
+popq %rbp
+popq %rbx
+ret
+FN_END chacha_blocks_avx
+
+GLOBAL_HIDDEN_FN hchacha_avx
+hchacha_avx_local:
+LOAD_VAR_PIC chacha_constants, %rax
+vmovdqa 0(%rax), %xmm0
+vmovdqa 16(%rax), %xmm6
+vmovdqa 32(%rax), %xmm5
+vmovdqu 0(%rdi), %xmm1
+vmovdqu 16(%rdi), %xmm2
+vmovdqu 0(%rsi), %xmm3
+hhacha_mainloop_avx:
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm6, %xmm3, %xmm3
+vpaddd %xmm2, %xmm3, %xmm2
+vpxor %xmm1, %xmm2, %xmm1
+vpslld $12, %xmm1, %xmm4
+vpsrld $20, %xmm1, %xmm1
+vpxor %xmm1, %xmm4, %xmm1
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm5, %xmm3, %xmm3
+vpaddd %xmm2, %xmm3, %xmm2
+vpxor %xmm1, %xmm2, %xmm1
+vpslld $7, %xmm1, %xmm4
+vpsrld $25, %xmm1, %xmm1
+vpshufd $0x93, %xmm0, %xmm0
+vpxor %xmm1, %xmm4, %xmm1
+vpshufd $0x4e, %xmm3, %xmm3
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm6, %xmm3, %xmm3
+vpshufd $0x39, %xmm2, %xmm2
+vpaddd %xmm2, %xmm3, %xmm2
+vpxor %xmm1, %xmm2, %xmm1
+vpslld $12, %xmm1, %xmm4
+vpsrld $20, %xmm1, %xmm1
+vpxor %xmm1, %xmm4, %xmm1
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm5, %xmm3, %xmm3
+vpaddd %xmm2, %xmm3, %xmm2
+vpxor %xmm1, %xmm2, %xmm1
+vpshufd $0x39, %xmm0, %xmm0
+vpslld $7, %xmm1, %xmm4
+vpshufd $0x4e, %xmm3, %xmm3
+vpsrld $25, %xmm1, %xmm1
+vpshufd $0x93, %xmm2, %xmm2
+vpxor %xmm1, %xmm4, %xmm1
+subl $2, %ecx
+jne hhacha_mainloop_avx
+vmovdqu %xmm0, (%rdx)
+vmovdqu %xmm3, 16(%rdx)
+ret
+FN_END hchacha_avx
+
+GLOBAL_HIDDEN_FN_EXT chacha_avx, 6, 16
+pushq %rbp
+movq %rsp, %rbp
+subq $64, %rsp
+andq $~63, %rsp
+vmovdqu 0(%rdi), %xmm0
+vmovdqu 16(%rdi), %xmm1
+vmovdqa %xmm0, 0(%rsp)
+vmovdqa %xmm1, 16(%rsp)
+xorq %rdi, %rdi
+movq %rdi, 32(%rsp)
+movq 0(%rsi), %rsi
+movq %rsi, 40(%rsp)
+movq %r9, 48(%rsp)
+movq %rsp, %rdi
+movq %rdx, %rsi
+movq %rcx, %rdx
+movq %r8, %rcx
+call chacha_blocks_avx_local
+vpxor %xmm0, %xmm0, %xmm0
+vmovdqa %xmm0, 0(%rsp)
+vmovdqa %xmm0, 16(%rsp)
+vmovdqa %xmm0, 32(%rsp)
+movq %rbp, %rsp
+popq %rbp
+ret
+FN_END chacha_avx
+
+GLOBAL_HIDDEN_FN_EXT xchacha_avx, 6, 16
+pushq %rbp
+pushq %rbx
+movq %rsp, %rbp
+subq $64, %rsp
+andq $~63, %rsp
+movq %rsp, %rbx
+xorq %rax, %rax
+movq %rax, 32(%rbx)
+movq 16(%rsi), %rax
+movq %rax, 40(%rbx)
+movq %r9, 48(%rbx)
+pushq %rdx
+pushq %rcx
+pushq %r8
+movq %rbx, %rdx
+movq %r9, %rcx
+call hchacha_avx_local
+movq %rbx, %rdi
+popq %rcx
+popq %rdx
+popq %rsi
+call chacha_blocks_avx_local
+vpxor %xmm0, %xmm0, %xmm0
+vmovdqa %xmm0, 0(%rbx)
+vmovdqa %xmm0, 16(%rbx)
+vmovdqa %xmm0, 32(%rbx)
+movq %rbp, %rsp
+popq %rbx
+popq %rbp
+ret
+FN_END xchacha_avx
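
As a reading aid for the register usage in the routines above (and in the avx2.S variants that follow): the entry points use the System V AMD64 calling convention, so arguments arrive in %rdi, %rsi, %rdx, %rcx, %r8, %r9. The prototypes below merely restate the declarations that CHACHA_DECLARE() generates in chacha.c later in this commit (given chacha.h for the types); they are not new API.

/* argument-to-register mapping assumed by the assembly above (SysV AMD64) */
void chacha_blocks_avx(chacha_state_internal *state,  /* %rdi */
                       const unsigned char *in,       /* %rsi, NULL = write raw keystream */
                       unsigned char *out,            /* %rdx */
                       size_t bytes);                 /* %rcx */
void hchacha_avx(const unsigned char key[32],         /* %rdi */
                 const unsigned char iv[16],          /* %rsi */
                 unsigned char out[32],               /* %rdx */
                 size_t rounds);                      /* %rcx */
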
diff --git a/src/libcryptobox/chacha20/avx2.S b/src/libcryptobox/chacha20/avx2.S
new file mode 100644
index 0000000..efd0f54
--- /dev/null
+++ b/src/libcryptobox/chacha20/avx2.S
@@ -0,0 +1,1018 @@
+#include "../macro.S"
+#include "constants.S"
+SECTION_TEXT
+
+GLOBAL_HIDDEN_FN chacha_blocks_avx2
+chacha_blocks_avx2_local:
+pushq %rbx
+pushq %rbp
+pushq %r12
+pushq %r13
+pushq %r14
+movq %rsp, %rbp
+andq $~63, %rsp
+subq $512, %rsp
+LOAD_VAR_PIC chacha_constants, %rax
+vmovdqa 0(%rax), %xmm8
+vmovdqa 16(%rax), %xmm6
+vmovdqa 32(%rax), %xmm7
+vmovdqu 0(%rdi), %xmm9
+vmovdqu 16(%rdi), %xmm10
+vmovdqu 32(%rdi), %xmm11
+movq 48(%rdi), %rax
+movq $1, %r9
+vmovdqa %xmm8, 0(%rsp)
+vmovdqa %xmm9, 16(%rsp)
+vmovdqa %xmm10, 32(%rsp)
+vmovdqa %xmm11, 48(%rsp)
+movq %rax, 64(%rsp)
+vmovdqa %xmm6, 448(%rsp)
+vmovdqa %xmm6, 464(%rsp)
+vmovdqa %xmm7, 480(%rsp)
+vmovdqa %xmm7, 496(%rsp)
+cmpq $512, %rcx
+jae chacha_blocks_avx2_atleast512
+cmp $256, %rcx
+jae chacha_blocks_avx2_atleast256
+jmp chacha_blocks_avx2_below256
+.p2align 6,,63
+chacha_blocks_avx2_atleast512:
+movq 48(%rsp), %rax
+leaq 1(%rax), %r8
+leaq 2(%rax), %r9
+leaq 3(%rax), %r10
+leaq 4(%rax), %rbx
+leaq 5(%rax), %r11
+leaq 6(%rax), %r12
+leaq 7(%rax), %r13
+leaq 8(%rax), %r14
+movl %eax, 128(%rsp)
+movl %r8d, 4+128(%rsp)
+movl %r9d, 8+128(%rsp)
+movl %r10d, 12+128(%rsp)
+movl %ebx, 16+128(%rsp)
+movl %r11d, 20+128(%rsp)
+movl %r12d, 24+128(%rsp)
+movl %r13d, 28+128(%rsp)
+shrq $32, %rax
+shrq $32, %r8
+shrq $32, %r9
+shrq $32, %r10
+shrq $32, %rbx
+shrq $32, %r11
+shrq $32, %r12
+shrq $32, %r13
+movl %eax, 160(%rsp)
+movl %r8d, 4+160(%rsp)
+movl %r9d, 8+160(%rsp)
+movl %r10d, 12+160(%rsp)
+movl %ebx, 16+160(%rsp)
+movl %r11d, 20+160(%rsp)
+movl %r12d, 24+160(%rsp)
+movl %r13d, 28+160(%rsp)
+movq %r14, 48(%rsp)
+movq 64(%rsp), %rax
+vpbroadcastd 0(%rsp), %ymm0
+vpbroadcastd 4+0(%rsp), %ymm1
+vpbroadcastd 8+0(%rsp), %ymm2
+vpbroadcastd 12+0(%rsp), %ymm3
+vpbroadcastd 16(%rsp), %ymm4
+vpbroadcastd 4+16(%rsp), %ymm5
+vpbroadcastd 8+16(%rsp), %ymm6
+vpbroadcastd 12+16(%rsp), %ymm7
+vpbroadcastd 32(%rsp), %ymm8
+vpbroadcastd 4+32(%rsp), %ymm9
+vpbroadcastd 8+32(%rsp), %ymm10
+vpbroadcastd 12+32(%rsp), %ymm11
+vpbroadcastd 8+48(%rsp), %ymm14
+vpbroadcastd 12+48(%rsp), %ymm15
+vmovdqa 128(%rsp), %ymm12
+vmovdqa 160(%rsp), %ymm13
+chacha_blocks_avx2_mainloop1:
+vpaddd %ymm0, %ymm4, %ymm0
+vpaddd %ymm1, %ymm5, %ymm1
+vpxor %ymm12, %ymm0, %ymm12
+vpxor %ymm13, %ymm1, %ymm13
+vpaddd %ymm2, %ymm6, %ymm2
+vpaddd %ymm3, %ymm7, %ymm3
+vpxor %ymm14, %ymm2, %ymm14
+vpxor %ymm15, %ymm3, %ymm15
+vpshufb 448(%rsp), %ymm12, %ymm12
+vpshufb 448(%rsp), %ymm13, %ymm13
+vpaddd %ymm8, %ymm12, %ymm8
+vpaddd %ymm9, %ymm13, %ymm9
+vpshufb 448(%rsp), %ymm14, %ymm14
+vpshufb 448(%rsp), %ymm15, %ymm15
+vpaddd %ymm10, %ymm14, %ymm10
+vpaddd %ymm11, %ymm15, %ymm11
+vmovdqa %ymm12, 96(%rsp)
+vpxor %ymm4, %ymm8, %ymm4
+vpxor %ymm5, %ymm9, %ymm5
+vpslld $ 12, %ymm4, %ymm12
+vpsrld $20, %ymm4, %ymm4
+vpxor %ymm4, %ymm12, %ymm4
+vpslld $ 12, %ymm5, %ymm12
+vpsrld $20, %ymm5, %ymm5
+vpxor %ymm5, %ymm12, %ymm5
+vpxor %ymm6, %ymm10, %ymm6
+vpxor %ymm7, %ymm11, %ymm7
+vpslld $ 12, %ymm6, %ymm12
+vpsrld $20, %ymm6, %ymm6
+vpxor %ymm6, %ymm12, %ymm6
+vpslld $ 12, %ymm7, %ymm12
+vpsrld $20, %ymm7, %ymm7
+vpxor %ymm7, %ymm12, %ymm7
+vpaddd %ymm0, %ymm4, %ymm0
+vpaddd %ymm1, %ymm5, %ymm1
+vpxor 96(%rsp), %ymm0, %ymm12
+vpxor %ymm13, %ymm1, %ymm13
+vpaddd %ymm2, %ymm6, %ymm2
+vpaddd %ymm3, %ymm7, %ymm3
+vpxor %ymm14, %ymm2, %ymm14
+vpxor %ymm15, %ymm3, %ymm15
+vpshufb 480(%rsp), %ymm12, %ymm12
+vpshufb 480(%rsp), %ymm13, %ymm13
+vpaddd %ymm8, %ymm12, %ymm8
+vpaddd %ymm9, %ymm13, %ymm9
+vpshufb 480(%rsp), %ymm14, %ymm14
+vpshufb 480(%rsp), %ymm15, %ymm15
+vpaddd %ymm10, %ymm14, %ymm10
+vpaddd %ymm11, %ymm15, %ymm11
+vmovdqa %ymm12, 96(%rsp)
+vpxor %ymm4, %ymm8, %ymm4
+vpxor %ymm5, %ymm9, %ymm5
+vpslld $ 7, %ymm4, %ymm12
+vpsrld $25, %ymm4, %ymm4
+vpxor %ymm4, %ymm12, %ymm4
+vpslld $ 7, %ymm5, %ymm12
+vpsrld $25, %ymm5, %ymm5
+vpxor %ymm5, %ymm12, %ymm5
+vpxor %ymm6, %ymm10, %ymm6
+vpxor %ymm7, %ymm11, %ymm7
+vpslld $ 7, %ymm6, %ymm12
+vpsrld $25, %ymm6, %ymm6
+vpxor %ymm6, %ymm12, %ymm6
+vpslld $ 7, %ymm7, %ymm12
+vpsrld $25, %ymm7, %ymm7
+vpxor %ymm7, %ymm12, %ymm7
+vpaddd %ymm0, %ymm5, %ymm0
+vpaddd %ymm1, %ymm6, %ymm1
+vpxor %ymm15, %ymm0, %ymm15
+vpxor 96(%rsp), %ymm1, %ymm12
+vpaddd %ymm2, %ymm7, %ymm2
+vpaddd %ymm3, %ymm4, %ymm3
+vpxor %ymm13, %ymm2, %ymm13
+vpxor %ymm14, %ymm3, %ymm14
+vpshufb 448(%rsp), %ymm15, %ymm15
+vpshufb 448(%rsp), %ymm12, %ymm12
+vpaddd %ymm10, %ymm15, %ymm10
+vpaddd %ymm11, %ymm12, %ymm11
+vpshufb 448(%rsp), %ymm13, %ymm13
+vpshufb 448(%rsp), %ymm14, %ymm14
+vpaddd %ymm8, %ymm13, %ymm8
+vpaddd %ymm9, %ymm14, %ymm9
+vmovdqa %ymm15, 96(%rsp)
+vpxor %ymm5, %ymm10, %ymm5
+vpxor %ymm6, %ymm11, %ymm6
+vpslld $ 12, %ymm5, %ymm15
+vpsrld $20, %ymm5, %ymm5
+vpxor %ymm5, %ymm15, %ymm5
+vpslld $ 12, %ymm6, %ymm15
+vpsrld $20, %ymm6, %ymm6
+vpxor %ymm6, %ymm15, %ymm6
+vpxor %ymm7, %ymm8, %ymm7
+vpxor %ymm4, %ymm9, %ymm4
+vpslld $ 12, %ymm7, %ymm15
+vpsrld $20, %ymm7, %ymm7
+vpxor %ymm7, %ymm15, %ymm7
+vpslld $ 12, %ymm4, %ymm15
+vpsrld $20, %ymm4, %ymm4
+vpxor %ymm4, %ymm15, %ymm4
+vpaddd %ymm0, %ymm5, %ymm0
+vpaddd %ymm1, %ymm6, %ymm1
+vpxor 96(%rsp), %ymm0, %ymm15
+vpxor %ymm12, %ymm1, %ymm12
+vpaddd %ymm2, %ymm7, %ymm2
+vpaddd %ymm3, %ymm4, %ymm3
+vpxor %ymm13, %ymm2, %ymm13
+vpxor %ymm14, %ymm3, %ymm14
+vpshufb 480(%rsp), %ymm15, %ymm15
+vpshufb 480(%rsp), %ymm12, %ymm12
+vpaddd %ymm10, %ymm15, %ymm10
+vpaddd %ymm11, %ymm12, %ymm11
+vpshufb 480(%rsp), %ymm13, %ymm13
+vpshufb 480(%rsp), %ymm14, %ymm14
+vpaddd %ymm8, %ymm13, %ymm8
+vpaddd %ymm9, %ymm14, %ymm9
+vmovdqa %ymm15, 96(%rsp)
+vpxor %ymm5, %ymm10, %ymm5
+vpxor %ymm6, %ymm11, %ymm6
+vpslld $ 7, %ymm5, %ymm15
+vpsrld $25, %ymm5, %ymm5
+vpxor %ymm5, %ymm15, %ymm5
+vpslld $ 7, %ymm6, %ymm15
+vpsrld $25, %ymm6, %ymm6
+vpxor %ymm6, %ymm15, %ymm6
+vpxor %ymm7, %ymm8, %ymm7
+vpxor %ymm4, %ymm9, %ymm4
+vpslld $ 7, %ymm7, %ymm15
+vpsrld $25, %ymm7, %ymm7
+vpxor %ymm7, %ymm15, %ymm7
+vpslld $ 7, %ymm4, %ymm15
+vpsrld $25, %ymm4, %ymm4
+vpxor %ymm4, %ymm15, %ymm4
+vmovdqa 96(%rsp), %ymm15
+subq $2, %rax
+jnz chacha_blocks_avx2_mainloop1
+vmovdqa %ymm8, 192(%rsp)
+vmovdqa %ymm9, 224(%rsp)
+vmovdqa %ymm10, 256(%rsp)
+vmovdqa %ymm11, 288(%rsp)
+vmovdqa %ymm12, 320(%rsp)
+vmovdqa %ymm13, 352(%rsp)
+vmovdqa %ymm14, 384(%rsp)
+vmovdqa %ymm15, 416(%rsp)
+vpbroadcastd 0(%rsp), %ymm8
+vpbroadcastd 4+0(%rsp), %ymm9
+vpbroadcastd 8+0(%rsp), %ymm10
+vpbroadcastd 12+0(%rsp), %ymm11
+vpbroadcastd 16(%rsp), %ymm12
+vpbroadcastd 4+16(%rsp), %ymm13
+vpbroadcastd 8+16(%rsp), %ymm14
+vpbroadcastd 12+16(%rsp), %ymm15
+vpaddd %ymm8, %ymm0, %ymm0
+vpaddd %ymm9, %ymm1, %ymm1
+vpaddd %ymm10, %ymm2, %ymm2
+vpaddd %ymm11, %ymm3, %ymm3
+vpaddd %ymm12, %ymm4, %ymm4
+vpaddd %ymm13, %ymm5, %ymm5
+vpaddd %ymm14, %ymm6, %ymm6
+vpaddd %ymm15, %ymm7, %ymm7
+vpunpckldq %ymm1, %ymm0, %ymm8
+vpunpckldq %ymm3, %ymm2, %ymm9
+vpunpckhdq %ymm1, %ymm0, %ymm12
+vpunpckhdq %ymm3, %ymm2, %ymm13
+vpunpckldq %ymm5, %ymm4, %ymm10
+vpunpckldq %ymm7, %ymm6, %ymm11
+vpunpckhdq %ymm5, %ymm4, %ymm14
+vpunpckhdq %ymm7, %ymm6, %ymm15
+vpunpcklqdq %ymm9, %ymm8, %ymm0
+vpunpcklqdq %ymm11, %ymm10, %ymm1
+vpunpckhqdq %ymm9, %ymm8, %ymm2
+vpunpckhqdq %ymm11, %ymm10, %ymm3
+vpunpcklqdq %ymm13, %ymm12, %ymm4
+vpunpcklqdq %ymm15, %ymm14, %ymm5
+vpunpckhqdq %ymm13, %ymm12, %ymm6
+vpunpckhqdq %ymm15, %ymm14, %ymm7
+vperm2i128 $0x20, %ymm1, %ymm0, %ymm8
+vperm2i128 $0x20, %ymm3, %ymm2, %ymm9
+vperm2i128 $0x31, %ymm1, %ymm0, %ymm12
+vperm2i128 $0x31, %ymm3, %ymm2, %ymm13
+vperm2i128 $0x20, %ymm5, %ymm4, %ymm10
+vperm2i128 $0x20, %ymm7, %ymm6, %ymm11
+vperm2i128 $0x31, %ymm5, %ymm4, %ymm14
+vperm2i128 $0x31, %ymm7, %ymm6, %ymm15
+andq %rsi, %rsi
+jz chacha_blocks_avx2_noinput1
+vpxor 0(%rsi), %ymm8, %ymm8
+vpxor 64(%rsi), %ymm9, %ymm9
+vpxor 128(%rsi), %ymm10, %ymm10
+vpxor 192(%rsi), %ymm11, %ymm11
+vpxor 256(%rsi), %ymm12, %ymm12
+vpxor 320(%rsi), %ymm13, %ymm13
+vpxor 384(%rsi), %ymm14, %ymm14
+vpxor 448(%rsi), %ymm15, %ymm15
+vmovdqu %ymm8, 0(%rdx)
+vmovdqu %ymm9, 64(%rdx)
+vmovdqu %ymm10, 128(%rdx)
+vmovdqu %ymm11, 192(%rdx)
+vmovdqu %ymm12, 256(%rdx)
+vmovdqu %ymm13, 320(%rdx)
+vmovdqu %ymm14, 384(%rdx)
+vmovdqu %ymm15, 448(%rdx)
+vmovdqa 192(%rsp), %ymm0
+vmovdqa 224(%rsp), %ymm1
+vmovdqa 256(%rsp), %ymm2
+vmovdqa 288(%rsp), %ymm3
+vmovdqa 320(%rsp), %ymm4
+vmovdqa 352(%rsp), %ymm5
+vmovdqa 384(%rsp), %ymm6
+vmovdqa 416(%rsp), %ymm7
+vpbroadcastd 32(%rsp), %ymm8
+vpbroadcastd 4+32(%rsp), %ymm9
+vpbroadcastd 8+32(%rsp), %ymm10
+vpbroadcastd 12+32(%rsp), %ymm11
+vmovdqa 128(%rsp), %ymm12
+vmovdqa 160(%rsp), %ymm13
+vpbroadcastd 8+48(%rsp), %ymm14
+vpbroadcastd 12+48(%rsp), %ymm15
+vpaddd %ymm8, %ymm0, %ymm0
+vpaddd %ymm9, %ymm1, %ymm1
+vpaddd %ymm10, %ymm2, %ymm2
+vpaddd %ymm11, %ymm3, %ymm3
+vpaddd %ymm12, %ymm4, %ymm4
+vpaddd %ymm13, %ymm5, %ymm5
+vpaddd %ymm14, %ymm6, %ymm6
+vpaddd %ymm15, %ymm7, %ymm7
+vpunpckldq %ymm1, %ymm0, %ymm8
+vpunpckldq %ymm3, %ymm2, %ymm9
+vpunpckhdq %ymm1, %ymm0, %ymm12
+vpunpckhdq %ymm3, %ymm2, %ymm13
+vpunpckldq %ymm5, %ymm4, %ymm10
+vpunpckldq %ymm7, %ymm6, %ymm11
+vpunpckhdq %ymm5, %ymm4, %ymm14
+vpunpckhdq %ymm7, %ymm6, %ymm15
+vpunpcklqdq %ymm9, %ymm8, %ymm0
+vpunpcklqdq %ymm11, %ymm10, %ymm1
+vpunpckhqdq %ymm9, %ymm8, %ymm2
+vpunpckhqdq %ymm11, %ymm10, %ymm3
+vpunpcklqdq %ymm13, %ymm12, %ymm4
+vpunpcklqdq %ymm15, %ymm14, %ymm5
+vpunpckhqdq %ymm13, %ymm12, %ymm6
+vpunpckhqdq %ymm15, %ymm14, %ymm7
+vperm2i128 $0x20, %ymm1, %ymm0, %ymm8
+vperm2i128 $0x20, %ymm3, %ymm2, %ymm9
+vperm2i128 $0x31, %ymm1, %ymm0, %ymm12
+vperm2i128 $0x31, %ymm3, %ymm2, %ymm13
+vperm2i128 $0x20, %ymm5, %ymm4, %ymm10
+vperm2i128 $0x20, %ymm7, %ymm6, %ymm11
+vperm2i128 $0x31, %ymm5, %ymm4, %ymm14
+vperm2i128 $0x31, %ymm7, %ymm6, %ymm15
+vpxor 32(%rsi), %ymm8, %ymm8
+vpxor 96(%rsi), %ymm9, %ymm9
+vpxor 160(%rsi), %ymm10, %ymm10
+vpxor 224(%rsi), %ymm11, %ymm11
+vpxor 288(%rsi), %ymm12, %ymm12
+vpxor 352(%rsi), %ymm13, %ymm13
+vpxor 416(%rsi), %ymm14, %ymm14
+vpxor 480(%rsi), %ymm15, %ymm15
+vmovdqu %ymm8, 32(%rdx)
+vmovdqu %ymm9, 96(%rdx)
+vmovdqu %ymm10, 160(%rdx)
+vmovdqu %ymm11, 224(%rdx)
+vmovdqu %ymm12, 288(%rdx)
+vmovdqu %ymm13, 352(%rdx)
+vmovdqu %ymm14, 416(%rdx)
+vmovdqu %ymm15, 480(%rdx)
+addq $512, %rsi
+jmp chacha_blocks_avx2_mainloop1_cont
+chacha_blocks_avx2_noinput1:
+vmovdqu %ymm8, 0(%rdx)
+vmovdqu %ymm9, 64(%rdx)
+vmovdqu %ymm10, 128(%rdx)
+vmovdqu %ymm11, 192(%rdx)
+vmovdqu %ymm12, 256(%rdx)
+vmovdqu %ymm13, 320(%rdx)
+vmovdqu %ymm14, 384(%rdx)
+vmovdqu %ymm15, 448(%rdx)
+vmovdqa 192(%rsp), %ymm0
+vmovdqa 224(%rsp), %ymm1
+vmovdqa 256(%rsp), %ymm2
+vmovdqa 288(%rsp), %ymm3
+vmovdqa 320(%rsp), %ymm4
+vmovdqa 352(%rsp), %ymm5
+vmovdqa 384(%rsp), %ymm6
+vmovdqa 416(%rsp), %ymm7
+vpbroadcastd 32(%rsp), %ymm8
+vpbroadcastd 4+32(%rsp), %ymm9
+vpbroadcastd 8+32(%rsp), %ymm10
+vpbroadcastd 12+32(%rsp), %ymm11
+vmovdqa 128(%rsp), %ymm12
+vmovdqa 160(%rsp), %ymm13
+vpbroadcastd 8+48(%rsp), %ymm14
+vpbroadcastd 12+48(%rsp), %ymm15
+vpaddd %ymm8, %ymm0, %ymm0
+vpaddd %ymm9, %ymm1, %ymm1
+vpaddd %ymm10, %ymm2, %ymm2
+vpaddd %ymm11, %ymm3, %ymm3
+vpaddd %ymm12, %ymm4, %ymm4
+vpaddd %ymm13, %ymm5, %ymm5
+vpaddd %ymm14, %ymm6, %ymm6
+vpaddd %ymm15, %ymm7, %ymm7
+vpunpckldq %ymm1, %ymm0, %ymm8
+vpunpckldq %ymm3, %ymm2, %ymm9
+vpunpckhdq %ymm1, %ymm0, %ymm12
+vpunpckhdq %ymm3, %ymm2, %ymm13
+vpunpckldq %ymm5, %ymm4, %ymm10
+vpunpckldq %ymm7, %ymm6, %ymm11
+vpunpckhdq %ymm5, %ymm4, %ymm14
+vpunpckhdq %ymm7, %ymm6, %ymm15
+vpunpcklqdq %ymm9, %ymm8, %ymm0
+vpunpcklqdq %ymm11, %ymm10, %ymm1
+vpunpckhqdq %ymm9, %ymm8, %ymm2
+vpunpckhqdq %ymm11, %ymm10, %ymm3
+vpunpcklqdq %ymm13, %ymm12, %ymm4
+vpunpcklqdq %ymm15, %ymm14, %ymm5
+vpunpckhqdq %ymm13, %ymm12, %ymm6
+vpunpckhqdq %ymm15, %ymm14, %ymm7
+vperm2i128 $0x20, %ymm1, %ymm0, %ymm8
+vperm2i128 $0x20, %ymm3, %ymm2, %ymm9
+vperm2i128 $0x31, %ymm1, %ymm0, %ymm12
+vperm2i128 $0x31, %ymm3, %ymm2, %ymm13
+vperm2i128 $0x20, %ymm5, %ymm4, %ymm10
+vperm2i128 $0x20, %ymm7, %ymm6, %ymm11
+vperm2i128 $0x31, %ymm5, %ymm4, %ymm14
+vperm2i128 $0x31, %ymm7, %ymm6, %ymm15
+vmovdqu %ymm8, 32(%rdx)
+vmovdqu %ymm9, 96(%rdx)
+vmovdqu %ymm10, 160(%rdx)
+vmovdqu %ymm11, 224(%rdx)
+vmovdqu %ymm12, 288(%rdx)
+vmovdqu %ymm13, 352(%rdx)
+vmovdqu %ymm14, 416(%rdx)
+vmovdqu %ymm15, 480(%rdx)
+chacha_blocks_avx2_mainloop1_cont:
+addq $512, %rdx
+subq $512, %rcx
+cmp $512, %rcx
+jae chacha_blocks_avx2_atleast512
+cmp $256, %rcx
+jb chacha_blocks_avx2_below256_fixup
+chacha_blocks_avx2_atleast256:
+movq 48(%rsp), %rax
+leaq 1(%rax), %r8
+leaq 2(%rax), %r9
+leaq 3(%rax), %r10
+leaq 4(%rax), %rbx
+movl %eax, 128(%rsp)
+movl %r8d, 4+128(%rsp)
+movl %r9d, 8+128(%rsp)
+movl %r10d, 12+128(%rsp)
+shrq $32, %rax
+shrq $32, %r8
+shrq $32, %r9
+shrq $32, %r10
+movl %eax, 160(%rsp)
+movl %r8d, 4+160(%rsp)
+movl %r9d, 8+160(%rsp)
+movl %r10d, 12+160(%rsp)
+movq %rbx, 48(%rsp)
+movq 64(%rsp), %rax
+vpbroadcastd 0(%rsp), %xmm0
+vpbroadcastd 4+0(%rsp), %xmm1
+vpbroadcastd 8+0(%rsp), %xmm2
+vpbroadcastd 12+0(%rsp), %xmm3
+vpbroadcastd 16(%rsp), %xmm4
+vpbroadcastd 4+16(%rsp), %xmm5
+vpbroadcastd 8+16(%rsp), %xmm6
+vpbroadcastd 12+16(%rsp), %xmm7
+vpbroadcastd 32(%rsp), %xmm8
+vpbroadcastd 4+32(%rsp), %xmm9
+vpbroadcastd 8+32(%rsp), %xmm10
+vpbroadcastd 12+32(%rsp), %xmm11
+vmovdqa 128(%rsp), %xmm12
+vmovdqa 160(%rsp), %xmm13
+vpbroadcastd 8+48(%rsp), %xmm14
+vpbroadcastd 12+48(%rsp), %xmm15
+chacha_blocks_avx2_mainloop2:
+vpaddd %xmm0, %xmm4, %xmm0
+vpaddd %xmm1, %xmm5, %xmm1
+vpxor %xmm12, %xmm0, %xmm12
+vpxor %xmm13, %xmm1, %xmm13
+vpaddd %xmm2, %xmm6, %xmm2
+vpaddd %xmm3, %xmm7, %xmm3
+vpxor %xmm14, %xmm2, %xmm14
+vpxor %xmm15, %xmm3, %xmm15
+vpshufb 448(%rsp), %xmm12, %xmm12
+vpshufb 448(%rsp), %xmm13, %xmm13
+vpaddd %xmm8, %xmm12, %xmm8
+vpaddd %xmm9, %xmm13, %xmm9
+vpshufb 448(%rsp), %xmm14, %xmm14
+vpshufb 448(%rsp), %xmm15, %xmm15
+vpaddd %xmm10, %xmm14, %xmm10
+vpaddd %xmm11, %xmm15, %xmm11
+vmovdqa %xmm12, 96(%rsp)
+vpxor %xmm4, %xmm8, %xmm4
+vpxor %xmm5, %xmm9, %xmm5
+vpslld $ 12, %xmm4, %xmm12
+vpsrld $20, %xmm4, %xmm4
+vpxor %xmm4, %xmm12, %xmm4
+vpslld $ 12, %xmm5, %xmm12
+vpsrld $20, %xmm5, %xmm5
+vpxor %xmm5, %xmm12, %xmm5
+vpxor %xmm6, %xmm10, %xmm6
+vpxor %xmm7, %xmm11, %xmm7
+vpslld $ 12, %xmm6, %xmm12
+vpsrld $20, %xmm6, %xmm6
+vpxor %xmm6, %xmm12, %xmm6
+vpslld $ 12, %xmm7, %xmm12
+vpsrld $20, %xmm7, %xmm7
+vpxor %xmm7, %xmm12, %xmm7
+vpaddd %xmm0, %xmm4, %xmm0
+vpaddd %xmm1, %xmm5, %xmm1
+vpxor 96(%rsp), %xmm0, %xmm12
+vpxor %xmm13, %xmm1, %xmm13
+vpaddd %xmm2, %xmm6, %xmm2
+vpaddd %xmm3, %xmm7, %xmm3
+vpxor %xmm14, %xmm2, %xmm14
+vpxor %xmm15, %xmm3, %xmm15
+vpshufb 480(%rsp), %xmm12, %xmm12
+vpshufb 480(%rsp), %xmm13, %xmm13
+vpaddd %xmm8, %xmm12, %xmm8
+vpaddd %xmm9, %xmm13, %xmm9
+vpshufb 480(%rsp), %xmm14, %xmm14
+vpshufb 480(%rsp), %xmm15, %xmm15
+vpaddd %xmm10, %xmm14, %xmm10
+vpaddd %xmm11, %xmm15, %xmm11
+vmovdqa %xmm12, 96(%rsp)
+vpxor %xmm4, %xmm8, %xmm4
+vpxor %xmm5, %xmm9, %xmm5
+vpslld $ 7, %xmm4, %xmm12
+vpsrld $25, %xmm4, %xmm4
+vpxor %xmm4, %xmm12, %xmm4
+vpslld $ 7, %xmm5, %xmm12
+vpsrld $25, %xmm5, %xmm5
+vpxor %xmm5, %xmm12, %xmm5
+vpxor %xmm6, %xmm10, %xmm6
+vpxor %xmm7, %xmm11, %xmm7
+vpslld $ 7, %xmm6, %xmm12
+vpsrld $25, %xmm6, %xmm6
+vpxor %xmm6, %xmm12, %xmm6
+vpslld $ 7, %xmm7, %xmm12
+vpsrld $25, %xmm7, %xmm7
+vpxor %xmm7, %xmm12, %xmm7
+vpaddd %xmm0, %xmm5, %xmm0
+vpaddd %xmm1, %xmm6, %xmm1
+vpxor %xmm15, %xmm0, %xmm15
+vpxor 96(%rsp), %xmm1, %xmm12
+vpaddd %xmm2, %xmm7, %xmm2
+vpaddd %xmm3, %xmm4, %xmm3
+vpxor %xmm13, %xmm2, %xmm13
+vpxor %xmm14, %xmm3, %xmm14
+vpshufb 448(%rsp), %xmm15, %xmm15
+vpshufb 448(%rsp), %xmm12, %xmm12
+vpaddd %xmm10, %xmm15, %xmm10
+vpaddd %xmm11, %xmm12, %xmm11
+vpshufb 448(%rsp), %xmm13, %xmm13
+vpshufb 448(%rsp), %xmm14, %xmm14
+vpaddd %xmm8, %xmm13, %xmm8
+vpaddd %xmm9, %xmm14, %xmm9
+vmovdqa %xmm15, 96(%rsp)
+vpxor %xmm5, %xmm10, %xmm5
+vpxor %xmm6, %xmm11, %xmm6
+vpslld $ 12, %xmm5, %xmm15
+vpsrld $20, %xmm5, %xmm5
+vpxor %xmm5, %xmm15, %xmm5
+vpslld $ 12, %xmm6, %xmm15
+vpsrld $20, %xmm6, %xmm6
+vpxor %xmm6, %xmm15, %xmm6
+vpxor %xmm7, %xmm8, %xmm7
+vpxor %xmm4, %xmm9, %xmm4
+vpslld $ 12, %xmm7, %xmm15
+vpsrld $20, %xmm7, %xmm7
+vpxor %xmm7, %xmm15, %xmm7
+vpslld $ 12, %xmm4, %xmm15
+vpsrld $20, %xmm4, %xmm4
+vpxor %xmm4, %xmm15, %xmm4
+vpaddd %xmm0, %xmm5, %xmm0
+vpaddd %xmm1, %xmm6, %xmm1
+vpxor 96(%rsp), %xmm0, %xmm15
+vpxor %xmm12, %xmm1, %xmm12
+vpaddd %xmm2, %xmm7, %xmm2
+vpaddd %xmm3, %xmm4, %xmm3
+vpxor %xmm13, %xmm2, %xmm13
+vpxor %xmm14, %xmm3, %xmm14
+vpshufb 480(%rsp), %xmm15, %xmm15
+vpshufb 480(%rsp), %xmm12, %xmm12
+vpaddd %xmm10, %xmm15, %xmm10
+vpaddd %xmm11, %xmm12, %xmm11
+vpshufb 480(%rsp), %xmm13, %xmm13
+vpshufb 480(%rsp), %xmm14, %xmm14
+vpaddd %xmm8, %xmm13, %xmm8
+vpaddd %xmm9, %xmm14, %xmm9
+vmovdqa %xmm15, 96(%rsp)
+vpxor %xmm5, %xmm10, %xmm5
+vpxor %xmm6, %xmm11, %xmm6
+vpslld $ 7, %xmm5, %xmm15
+vpsrld $25, %xmm5, %xmm5
+vpxor %xmm5, %xmm15, %xmm5
+vpslld $ 7, %xmm6, %xmm15
+vpsrld $25, %xmm6, %xmm6
+vpxor %xmm6, %xmm15, %xmm6
+vpxor %xmm7, %xmm8, %xmm7
+vpxor %xmm4, %xmm9, %xmm4
+vpslld $ 7, %xmm7, %xmm15
+vpsrld $25, %xmm7, %xmm7
+vpxor %xmm7, %xmm15, %xmm7
+vpslld $ 7, %xmm4, %xmm15
+vpsrld $25, %xmm4, %xmm4
+vpxor %xmm4, %xmm15, %xmm4
+vmovdqa 96(%rsp), %xmm15
+subq $2, %rax
+jnz chacha_blocks_avx2_mainloop2
+vmovdqa %xmm8, 192(%rsp)
+vmovdqa %xmm9, 208(%rsp)
+vmovdqa %xmm10, 224(%rsp)
+vmovdqa %xmm11, 240(%rsp)
+vmovdqa %xmm12, 256(%rsp)
+vmovdqa %xmm13, 272(%rsp)
+vmovdqa %xmm14, 288(%rsp)
+vmovdqa %xmm15, 304(%rsp)
+vpbroadcastd 0(%rsp), %xmm8
+vpbroadcastd 4+0(%rsp), %xmm9
+vpbroadcastd 8+0(%rsp), %xmm10
+vpbroadcastd 12+0(%rsp), %xmm11
+vpbroadcastd 16(%rsp), %xmm12
+vpbroadcastd 4+16(%rsp), %xmm13
+vpbroadcastd 8+16(%rsp), %xmm14
+vpbroadcastd 12+16(%rsp), %xmm15
+vpaddd %xmm8, %xmm0, %xmm0
+vpaddd %xmm9, %xmm1, %xmm1
+vpaddd %xmm10, %xmm2, %xmm2
+vpaddd %xmm11, %xmm3, %xmm3
+vpaddd %xmm12, %xmm4, %xmm4
+vpaddd %xmm13, %xmm5, %xmm5
+vpaddd %xmm14, %xmm6, %xmm6
+vpaddd %xmm15, %xmm7, %xmm7
+vpunpckldq %xmm1, %xmm0, %xmm8
+vpunpckldq %xmm3, %xmm2, %xmm9
+vpunpckhdq %xmm1, %xmm0, %xmm12
+vpunpckhdq %xmm3, %xmm2, %xmm13
+vpunpckldq %xmm5, %xmm4, %xmm10
+vpunpckldq %xmm7, %xmm6, %xmm11
+vpunpckhdq %xmm5, %xmm4, %xmm14
+vpunpckhdq %xmm7, %xmm6, %xmm15
+vpunpcklqdq %xmm9, %xmm8, %xmm0
+vpunpcklqdq %xmm11, %xmm10, %xmm1
+vpunpckhqdq %xmm9, %xmm8, %xmm2
+vpunpckhqdq %xmm11, %xmm10, %xmm3
+vpunpcklqdq %xmm13, %xmm12, %xmm4
+vpunpcklqdq %xmm15, %xmm14, %xmm5
+vpunpckhqdq %xmm13, %xmm12, %xmm6
+vpunpckhqdq %xmm15, %xmm14, %xmm7
+andq %rsi, %rsi
+jz chacha_blocks_avx2_noinput2
+vpxor 0(%rsi), %xmm0, %xmm0
+vpxor 16(%rsi), %xmm1, %xmm1
+vpxor 64(%rsi), %xmm2, %xmm2
+vpxor 80(%rsi), %xmm3, %xmm3
+vpxor 128(%rsi), %xmm4, %xmm4
+vpxor 144(%rsi), %xmm5, %xmm5
+vpxor 192(%rsi), %xmm6, %xmm6
+vpxor 208(%rsi), %xmm7, %xmm7
+vmovdqu %xmm0, 0(%rdx)
+vmovdqu %xmm1, 16(%rdx)
+vmovdqu %xmm2, 64(%rdx)
+vmovdqu %xmm3, 80(%rdx)
+vmovdqu %xmm4, 128(%rdx)
+vmovdqu %xmm5, 144(%rdx)
+vmovdqu %xmm6, 192(%rdx)
+vmovdqu %xmm7, 208(%rdx)
+vmovdqa 192(%rsp), %xmm0
+vmovdqa 208(%rsp), %xmm1
+vmovdqa 224(%rsp), %xmm2
+vmovdqa 240(%rsp), %xmm3
+vmovdqa 256(%rsp), %xmm4
+vmovdqa 272(%rsp), %xmm5
+vmovdqa 288(%rsp), %xmm6
+vmovdqa 304(%rsp), %xmm7
+vpbroadcastd 32(%rsp), %xmm8
+vpbroadcastd 4+32(%rsp), %xmm9
+vpbroadcastd 8+32(%rsp), %xmm10
+vpbroadcastd 12+32(%rsp), %xmm11
+vmovdqa 128(%rsp), %xmm12
+vmovdqa 160(%rsp), %xmm13
+vpbroadcastd 8+48(%rsp), %xmm14
+vpbroadcastd 12+48(%rsp), %xmm15
+vpaddd %xmm8, %xmm0, %xmm0
+vpaddd %xmm9, %xmm1, %xmm1
+vpaddd %xmm10, %xmm2, %xmm2
+vpaddd %xmm11, %xmm3, %xmm3
+vpaddd %xmm12, %xmm4, %xmm4
+vpaddd %xmm13, %xmm5, %xmm5
+vpaddd %xmm14, %xmm6, %xmm6
+vpaddd %xmm15, %xmm7, %xmm7
+vpunpckldq %xmm1, %xmm0, %xmm8
+vpunpckldq %xmm3, %xmm2, %xmm9
+vpunpckhdq %xmm1, %xmm0, %xmm12
+vpunpckhdq %xmm3, %xmm2, %xmm13
+vpunpckldq %xmm5, %xmm4, %xmm10
+vpunpckldq %xmm7, %xmm6, %xmm11
+vpunpckhdq %xmm5, %xmm4, %xmm14
+vpunpckhdq %xmm7, %xmm6, %xmm15
+vpunpcklqdq %xmm9, %xmm8, %xmm0
+vpunpcklqdq %xmm11, %xmm10, %xmm1
+vpunpckhqdq %xmm9, %xmm8, %xmm2
+vpunpckhqdq %xmm11, %xmm10, %xmm3
+vpunpcklqdq %xmm13, %xmm12, %xmm4
+vpunpcklqdq %xmm15, %xmm14, %xmm5
+vpunpckhqdq %xmm13, %xmm12, %xmm6
+vpunpckhqdq %xmm15, %xmm14, %xmm7
+vpxor 32(%rsi), %xmm0, %xmm0
+vpxor 48(%rsi), %xmm1, %xmm1
+vpxor 96(%rsi), %xmm2, %xmm2
+vpxor 112(%rsi), %xmm3, %xmm3
+vpxor 160(%rsi), %xmm4, %xmm4
+vpxor 176(%rsi), %xmm5, %xmm5
+vpxor 224(%rsi), %xmm6, %xmm6
+vpxor 240(%rsi), %xmm7, %xmm7
+vmovdqu %xmm0, 32(%rdx)
+vmovdqu %xmm1, 48(%rdx)
+vmovdqu %xmm2, 96(%rdx)
+vmovdqu %xmm3, 112(%rdx)
+vmovdqu %xmm4, 160(%rdx)
+vmovdqu %xmm5, 176(%rdx)
+vmovdqu %xmm6, 224(%rdx)
+vmovdqu %xmm7, 240(%rdx)
+addq $256, %rsi
+jmp chacha_blocks_avx2_mainloop2_cont
+chacha_blocks_avx2_noinput2:
+vmovdqu %xmm0, 0(%rdx)
+vmovdqu %xmm1, 16(%rdx)
+vmovdqu %xmm2, 64(%rdx)
+vmovdqu %xmm3, 80(%rdx)
+vmovdqu %xmm4, 128(%rdx)
+vmovdqu %xmm5, 144(%rdx)
+vmovdqu %xmm6, 192(%rdx)
+vmovdqu %xmm7, 208(%rdx)
+vmovdqa 192(%rsp), %xmm0
+vmovdqa 208(%rsp), %xmm1
+vmovdqa 224(%rsp), %xmm2
+vmovdqa 240(%rsp), %xmm3
+vmovdqa 256(%rsp), %xmm4
+vmovdqa 272(%rsp), %xmm5
+vmovdqa 288(%rsp), %xmm6
+vmovdqa 304(%rsp), %xmm7
+vpbroadcastd 32(%rsp), %xmm8
+vpbroadcastd 4+32(%rsp), %xmm9
+vpbroadcastd 8+32(%rsp), %xmm10
+vpbroadcastd 12+32(%rsp), %xmm11
+vmovdqa 128(%rsp), %xmm12
+vmovdqa 160(%rsp), %xmm13
+vpbroadcastd 8+48(%rsp), %xmm14
+vpbroadcastd 12+48(%rsp), %xmm15
+vpaddd %xmm8, %xmm0, %xmm0
+vpaddd %xmm9, %xmm1, %xmm1
+vpaddd %xmm10, %xmm2, %xmm2
+vpaddd %xmm11, %xmm3, %xmm3
+vpaddd %xmm12, %xmm4, %xmm4
+vpaddd %xmm13, %xmm5, %xmm5
+vpaddd %xmm14, %xmm6, %xmm6
+vpaddd %xmm15, %xmm7, %xmm7
+vpunpckldq %xmm1, %xmm0, %xmm8
+vpunpckldq %xmm3, %xmm2, %xmm9
+vpunpckhdq %xmm1, %xmm0, %xmm12
+vpunpckhdq %xmm3, %xmm2, %xmm13
+vpunpckldq %xmm5, %xmm4, %xmm10
+vpunpckldq %xmm7, %xmm6, %xmm11
+vpunpckhdq %xmm5, %xmm4, %xmm14
+vpunpckhdq %xmm7, %xmm6, %xmm15
+vpunpcklqdq %xmm9, %xmm8, %xmm0
+vpunpcklqdq %xmm11, %xmm10, %xmm1
+vpunpckhqdq %xmm9, %xmm8, %xmm2
+vpunpckhqdq %xmm11, %xmm10, %xmm3
+vpunpcklqdq %xmm13, %xmm12, %xmm4
+vpunpcklqdq %xmm15, %xmm14, %xmm5
+vpunpckhqdq %xmm13, %xmm12, %xmm6
+vpunpckhqdq %xmm15, %xmm14, %xmm7
+vmovdqu %xmm0, 32(%rdx)
+vmovdqu %xmm1, 48(%rdx)
+vmovdqu %xmm2, 96(%rdx)
+vmovdqu %xmm3, 112(%rdx)
+vmovdqu %xmm4, 160(%rdx)
+vmovdqu %xmm5, 176(%rdx)
+vmovdqu %xmm6, 224(%rdx)
+vmovdqu %xmm7, 240(%rdx)
+chacha_blocks_avx2_mainloop2_cont:
+addq $256, %rdx
+subq $256, %rcx
+cmp $256, %rcx
+jae chacha_blocks_avx2_atleast256
+chacha_blocks_avx2_below256_fixup:
+vmovdqa 448(%rsp), %xmm6
+vmovdqa 480(%rsp), %xmm7
+vmovdqa 0(%rsp), %xmm8
+vmovdqa 16(%rsp), %xmm9
+vmovdqa 32(%rsp), %xmm10
+vmovdqa 48(%rsp), %xmm11
+movq $1, %r9
+chacha_blocks_avx2_below256:
+vmovq %r9, %xmm5
+andq %rcx, %rcx
+jz chacha_blocks_avx2_done
+cmpq $64, %rcx
+jae chacha_blocks_avx2_above63
+movq %rdx, %r9
+andq %rsi, %rsi
+jz chacha_blocks_avx2_noinput3
+movq %rcx, %r10
+movq %rsp, %rdx
+addq %r10, %rsi
+addq %r10, %rdx
+negq %r10
+chacha_blocks_avx2_copyinput:
+movb (%rsi, %r10), %al
+movb %al, (%rdx, %r10)
+incq %r10
+jnz chacha_blocks_avx2_copyinput
+movq %rsp, %rsi
+chacha_blocks_avx2_noinput3:
+movq %rsp, %rdx
+chacha_blocks_avx2_above63:
+vmovdqa %xmm8, %xmm0
+vmovdqa %xmm9, %xmm1
+vmovdqa %xmm10, %xmm2
+vmovdqa %xmm11, %xmm3
+movq 64(%rsp), %rax
+chacha_blocks_avx2_mainloop3:
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm6, %xmm3, %xmm3
+vpaddd %xmm2, %xmm3, %xmm2
+vpxor %xmm1, %xmm2, %xmm1
+vpslld $12, %xmm1, %xmm4
+vpsrld $20, %xmm1, %xmm1
+vpxor %xmm1, %xmm4, %xmm1
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm7, %xmm3, %xmm3
+vpshufd $0x93, %xmm0, %xmm0
+vpaddd %xmm2, %xmm3, %xmm2
+vpshufd $0x4e, %xmm3, %xmm3
+vpxor %xmm1, %xmm2, %xmm1
+vpshufd $0x39, %xmm2, %xmm2
+vpslld $7, %xmm1, %xmm4
+vpsrld $25, %xmm1, %xmm1
+vpxor %xmm1, %xmm4, %xmm1
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm6, %xmm3, %xmm3
+vpaddd %xmm2, %xmm3, %xmm2
+vpxor %xmm1, %xmm2, %xmm1
+vpslld $12, %xmm1, %xmm4
+vpsrld $20, %xmm1, %xmm1
+vpxor %xmm1, %xmm4, %xmm1
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm7, %xmm3, %xmm3
+vpshufd $0x39, %xmm0, %xmm0
+vpaddd %xmm2, %xmm3, %xmm2
+vpshufd $0x4e, %xmm3, %xmm3
+vpxor %xmm1, %xmm2, %xmm1
+vpshufd $0x93, %xmm2, %xmm2
+vpslld $7, %xmm1, %xmm4
+vpsrld $25, %xmm1, %xmm1
+vpxor %xmm1, %xmm4, %xmm1
+subq $2, %rax
+jnz chacha_blocks_avx2_mainloop3
+vpaddd %xmm0, %xmm8, %xmm0
+vpaddd %xmm1, %xmm9, %xmm1
+vpaddd %xmm2, %xmm10, %xmm2
+vpaddd %xmm3, %xmm11, %xmm3
+andq %rsi, %rsi
+jz chacha_blocks_avx2_noinput4
+vpxor 0(%rsi), %xmm0, %xmm0
+vpxor 16(%rsi), %xmm1, %xmm1
+vpxor 32(%rsi), %xmm2, %xmm2
+vpxor 48(%rsi), %xmm3, %xmm3
+addq $64, %rsi
+chacha_blocks_avx2_noinput4:
+vmovdqu %xmm0, 0(%rdx)
+vmovdqu %xmm1, 16(%rdx)
+vmovdqu %xmm2, 32(%rdx)
+vmovdqu %xmm3, 48(%rdx)
+vpaddq %xmm11, %xmm5, %xmm11
+cmpq $64, %rcx
+jbe chacha_blocks_avx2_mainloop3_finishup
+addq $64, %rdx
+subq $64, %rcx
+jmp chacha_blocks_avx2_below256
+chacha_blocks_avx2_mainloop3_finishup:
+cmpq $64, %rcx
+je chacha_blocks_avx2_done
+addq %rcx, %r9
+addq %rcx, %rdx
+negq %rcx
+chacha_blocks_avx2_copyoutput:
+movb (%rdx, %rcx), %al
+movb %al, (%r9, %rcx)
+incq %rcx
+jnz chacha_blocks_avx2_copyoutput
+chacha_blocks_avx2_done:
+vmovdqu %xmm11, 32(%rdi)
+movq %rbp, %rsp
+popq %r14
+popq %r13
+popq %r12
+popq %rbp
+popq %rbx
+vzeroupper
+ret
+FN_END chacha_blocks_avx2
+
+
+GLOBAL_HIDDEN_FN hchacha_avx2
+hchacha_avx2_local:
+LOAD_VAR_PIC chacha_constants, %rax
+vmovdqa 0(%rax), %xmm0
+vmovdqa 16(%rax), %xmm6
+vmovdqa 32(%rax), %xmm5
+vmovdqu 0(%rdi), %xmm1
+vmovdqu 16(%rdi), %xmm2
+vmovdqu 0(%rsi), %xmm3
+hhacha_mainloop_avx2:
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm6, %xmm3, %xmm3
+vpaddd %xmm2, %xmm3, %xmm2
+vpxor %xmm1, %xmm2, %xmm1
+vpslld $12, %xmm1, %xmm4
+vpsrld $20, %xmm1, %xmm1
+vpxor %xmm1, %xmm4, %xmm1
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm5, %xmm3, %xmm3
+vpaddd %xmm2, %xmm3, %xmm2
+vpxor %xmm1, %xmm2, %xmm1
+vpslld $7, %xmm1, %xmm4
+vpsrld $25, %xmm1, %xmm1
+vpshufd $0x93, %xmm0, %xmm0
+vpxor %xmm1, %xmm4, %xmm1
+vpshufd $0x4e, %xmm3, %xmm3
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm6, %xmm3, %xmm3
+vpshufd $0x39, %xmm2, %xmm2
+vpaddd %xmm2, %xmm3, %xmm2
+vpxor %xmm1, %xmm2, %xmm1
+vpslld $12, %xmm1, %xmm4
+vpsrld $20, %xmm1, %xmm1
+vpxor %xmm1, %xmm4, %xmm1
+vpaddd %xmm0, %xmm1, %xmm0
+vpxor %xmm3, %xmm0, %xmm3
+vpshufb %xmm5, %xmm3, %xmm3
+vpaddd %xmm2, %xmm3, %xmm2
+vpxor %xmm1, %xmm2, %xmm1
+vpshufd $0x39, %xmm0, %xmm0
+vpslld $7, %xmm1, %xmm4
+vpshufd $0x4e, %xmm3, %xmm3
+vpsrld $25, %xmm1, %xmm1
+vpshufd $0x93, %xmm2, %xmm2
+vpxor %xmm1, %xmm4, %xmm1
+subl $2, %ecx
+jne hhacha_mainloop_avx2
+vmovdqu %xmm0, (%rdx)
+vmovdqu %xmm3, 16(%rdx)
+ret
+FN_END hchacha_avx2
+
+GLOBAL_HIDDEN_FN_EXT chacha_avx2, 6, 16
+pushq %rbp
+movq %rsp, %rbp
+subq $64, %rsp
+andq $~63, %rsp
+vmovdqu 0(%rdi), %xmm0
+vmovdqu 16(%rdi), %xmm1
+vmovdqa %xmm0, 0(%rsp)
+vmovdqa %xmm1, 16(%rsp)
+xorq %rdi, %rdi
+movq %rdi, 32(%rsp)
+movq 0(%rsi), %rsi
+movq %rsi, 40(%rsp)
+movq %r9, 48(%rsp)
+movq %rsp, %rdi
+movq %rdx, %rsi
+movq %rcx, %rdx
+movq %r8, %rcx
+call chacha_blocks_avx2_local
+vpxor %xmm0, %xmm0, %xmm0
+vmovdqa %xmm0, 0(%rsp)
+vmovdqa %xmm0, 16(%rsp)
+vmovdqa %xmm0, 32(%rsp)
+movq %rbp, %rsp
+popq %rbp
+ret
+FN_END chacha_avx2
+
+GLOBAL_HIDDEN_FN_EXT xchacha_avx2, 6, 16
+pushq %rbp
+pushq %rbx
+movq %rsp, %rbp
+subq $64, %rsp
+andq $~63, %rsp
+movq %rsp, %rbx
+xorq %rax, %rax
+movq %rax, 32(%rbx)
+movq 16(%rsi), %rax
+movq %rax, 40(%rbx)
+movq %r9, 48(%rbx)
+pushq %rdx
+pushq %rcx
+pushq %r8
+movq %rbx, %rdx
+movq %r9, %rcx
+call hchacha_avx2_local
+movq %rbx, %rdi
+popq %rcx
+popq %rdx
+popq %rsi
+call chacha_blocks_avx2_local
+vpxor %xmm0, %xmm0, %xmm0
+vmovdqa %xmm0, 0(%rbx)
+vmovdqa %xmm0, 16(%rbx)
+vmovdqa %xmm0, 32(%rbx)
+movq %rbp, %rsp
+popq %rbx
+popq %rbp
+ret
+FN_END xchacha_avx2
diff --git a/src/libcryptobox/chacha20/chacha.c b/src/libcryptobox/chacha20/chacha.c
new file mode 100644
index 0000000..0b471c8
--- /dev/null
+++ b/src/libcryptobox/chacha20/chacha.c
@@ -0,0 +1,262 @@
+/* Copyright (c) 2015, Vsevolod Stakhov
+ * Copyright (c) 2015, Andrew Moon
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "cryptobox.h"
+#include "chacha.h"
+#include "platform_config.h"
+
+extern unsigned cpu_config;
+
+typedef struct chacha_impl_t {
+ unsigned long cpu_flags;
+ const char *desc;
+ void (*chacha)(const chacha_key *key, const chacha_iv *iv,
+ const unsigned char *in, unsigned char *out, size_t inlen,
+ size_t rounds);
+ void (*xchacha)(const chacha_key *key, const chacha_iv24 *iv,
+ const unsigned char *in, unsigned char *out, size_t inlen,
+ size_t rounds);
+ void (*chacha_blocks)(chacha_state_internal *state,
+ const unsigned char *in, unsigned char *out, size_t bytes);
+ void (*hchacha)(const unsigned char key[32], const unsigned char iv[16],
+ unsigned char out[32], size_t rounds);
+} chacha_impl_t;
+
+#define CHACHA_DECLARE(ext) \
+ void chacha_##ext(const chacha_key *key, const chacha_iv *iv, const unsigned char *in, unsigned char *out, size_t inlen, size_t rounds); \
+ void xchacha_##ext(const chacha_key *key, const chacha_iv24 *iv, const unsigned char *in, unsigned char *out, size_t inlen, size_t rounds); \
+ void chacha_blocks_##ext(chacha_state_internal *state, const unsigned char *in, unsigned char *out, size_t bytes); \
+ void hchacha_##ext(const unsigned char key[32], const unsigned char iv[16], unsigned char out[32], size_t rounds);
+#define CHACHA_IMPL(cpuflags, desc, ext) \
+ { \
+ (cpuflags), desc, chacha_##ext, xchacha_##ext, chacha_blocks_##ext, hchacha_##ext \
+ }
+
+#if defined(HAVE_AVX2) && defined(__x86_64__)
+CHACHA_DECLARE(avx2)
+#define CHACHA_AVX2 CHACHA_IMPL(CPUID_AVX2, "avx2", avx2)
+#endif
+#if defined(HAVE_AVX) && defined(__x86_64__)
+CHACHA_DECLARE(avx)
+#define CHACHA_AVX CHACHA_IMPL(CPUID_AVX, "avx", avx)
+#endif
+#if defined(HAVE_SSE2) && defined(__x86_64__)
+CHACHA_DECLARE(sse2)
+#define CHACHA_SSE2 CHACHA_IMPL(CPUID_SSE2, "sse2", sse2)
+#endif
+
+CHACHA_DECLARE(ref)
+#define CHACHA_GENERIC CHACHA_IMPL(0, "generic", ref)
+
+static const chacha_impl_t chacha_list[] = {
+ CHACHA_GENERIC,
+#if defined(CHACHA_AVX2) && defined(__x86_64__)
+ CHACHA_AVX2,
+#endif
+#if defined(CHACHA_AVX) && defined(__x86_64__)
+ CHACHA_AVX,
+#endif
+#if defined(CHACHA_SSE2) && defined(__x86_64__)
+ CHACHA_SSE2
+#endif
+};
+
+static const chacha_impl_t *chacha_impl = &chacha_list[0];
+
+static int
+chacha_is_aligned(const void *p)
+{
+ return ((size_t) p & (sizeof(size_t) - 1)) == 0;
+}
+
+const char *
+chacha_load(void)
+{
+ guint i;
+
+ if (cpu_config != 0) {
+ for (i = 0; i < G_N_ELEMENTS(chacha_list); i++) {
+ if (chacha_list[i].cpu_flags & cpu_config) {
+ chacha_impl = &chacha_list[i];
+ break;
+ }
+ }
+ }
+
+ return chacha_impl->desc;
+}
+
+void chacha_init(chacha_state *S, const chacha_key *key,
+ const chacha_iv *iv, size_t rounds)
+{
+ chacha_state_internal *state = (chacha_state_internal *) S;
+ memcpy(state->s + 0, key, 32);
+ memset(state->s + 32, 0, 8);
+ memcpy(state->s + 40, iv, 8);
+ state->rounds = rounds;
+ state->leftover = 0;
+}
+
+/* processes inlen bytes (can do partial blocks), handling input/output alignment */
+static void
+chacha_consume(chacha_state_internal *state,
+ const unsigned char *in, unsigned char *out, size_t inlen)
+{
+ unsigned char buffer[16 * CHACHA_BLOCKBYTES];
+ int in_aligned, out_aligned;
+
+ /* it's ok to call with 0 bytes */
+ if (!inlen)
+ return;
+
+ /* if everything is aligned, handle directly */
+ in_aligned = chacha_is_aligned(in);
+ out_aligned = chacha_is_aligned(out);
+ if (in_aligned && out_aligned) {
+ chacha_impl->chacha_blocks(state, in, out, inlen);
+ return;
+ }
+
+ /* copy the unaligned data to an aligned buffer and process in chunks */
+ while (inlen) {
+ const size_t bytes = (inlen > sizeof(buffer)) ? sizeof(buffer) : inlen;
+ const unsigned char *src = in;
+ unsigned char *dst = (out_aligned) ? out : buffer;
+ if (!in_aligned) {
+ memcpy(buffer, in, bytes);
+ src = buffer;
+ }
+ chacha_impl->chacha_blocks(state, src, dst, bytes);
+ if (!out_aligned)
+ memcpy(out, buffer, bytes);
+ if (in)
+ in += bytes;
+ out += bytes;
+ inlen -= bytes;
+ }
+}
+
+/* hchacha */
+void hchacha(const unsigned char key[32],
+ const unsigned char iv[16], unsigned char out[32], size_t rounds)
+{
+ chacha_impl->hchacha(key, iv, out, rounds);
+}
+
+/* update, returns number of bytes written to out */
+size_t
+chacha_update(chacha_state *S, const unsigned char *in, unsigned char *out,
+ size_t inlen)
+{
+ chacha_state_internal *state = (chacha_state_internal *) S;
+ unsigned char *out_start = out;
+ size_t bytes;
+
+ /* enough for at least one block? */
+ while ((state->leftover + inlen) >= CHACHA_BLOCKBYTES) {
+ /* handle the previous data */
+ if (state->leftover) {
+ bytes = (CHACHA_BLOCKBYTES - state->leftover);
+ if (in) {
+ memcpy(state->buffer + state->leftover, in, bytes);
+ in += bytes;
+ }
+ chacha_consume(state, (in) ? state->buffer : NULL, out,
+ CHACHA_BLOCKBYTES);
+ inlen -= bytes;
+ out += CHACHA_BLOCKBYTES;
+ state->leftover = 0;
+ }
+
+ /* handle the direct data */
+ bytes = (inlen & ~(CHACHA_BLOCKBYTES - 1));
+ if (bytes) {
+ chacha_consume(state, in, out, bytes);
+ inlen -= bytes;
+ if (in)
+ in += bytes;
+ out += bytes;
+ }
+ }
+
+ /* handle leftover data */
+ if (inlen) {
+ if (in)
+ memcpy(state->buffer + state->leftover, in, inlen);
+ else
+ memset(state->buffer + state->leftover, 0, inlen);
+ state->leftover += inlen;
+ }
+
+ return out - out_start;
+}
+
+/* finalize, write out any leftover data */
+size_t
+chacha_final(chacha_state *S, unsigned char *out)
+{
+ chacha_state_internal *state = (chacha_state_internal *) S;
+ size_t leftover = state->leftover;
+ if (leftover) {
+ if (chacha_is_aligned(out)) {
+ chacha_impl->chacha_blocks(state, state->buffer, out, leftover);
+ }
+ else {
+ chacha_impl->chacha_blocks(state, state->buffer, state->buffer,
+ leftover);
+ memcpy(out, state->buffer, leftover);
+ }
+ }
+ rspamd_explicit_memzero(S, sizeof(chacha_state));
+ return leftover;
+}
+
+/* one-shot, input/output assumed to be word aligned */
+void chacha(const chacha_key *key, const chacha_iv *iv,
+ const unsigned char *in, unsigned char *out, size_t inlen,
+ size_t rounds)
+{
+ chacha_impl->chacha(key, iv, in, out, inlen, rounds);
+}
+
+/*
+ xchacha, chacha with a 192 bit nonce
+ */
+
+void xchacha_init(chacha_state *S, const chacha_key *key,
+ const chacha_iv24 *iv, size_t rounds)
+{
+ chacha_key subkey;
+ hchacha(key->b, iv->b, subkey.b, rounds);
+ chacha_init(S, &subkey, (chacha_iv *) (iv->b + 16), rounds);
+}
+
+/* one-shot, input/output assumed to be word aligned */
+void xchacha(const chacha_key *key, const chacha_iv24 *iv,
+ const unsigned char *in, unsigned char *out, size_t inlen,
+ size_t rounds)
+{
+ chacha_impl->xchacha(key, iv, in, out, inlen, rounds);
+}
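
As a usage sketch for the streaming API implemented above — the caller name and buffers are illustrative, and 20 rounds is assumed for ChaCha20 — an incremental encryption looks like this:

#include <stddef.h>
#include "chacha.h"

/* hypothetical caller: encrypts msg_len bytes of msg into out (same size)
 * using the incremental interface */
static void encrypt_stream_example(const chacha_key *key, const chacha_iv *iv,
                                   const unsigned char *msg, size_t msg_len,
                                   unsigned char *out)
{
    chacha_state S;
    size_t n = 0;

    chacha_init(&S, key, iv, 20);            /* 20 rounds = ChaCha20 */
    n += chacha_update(&S, msg, out + n, msg_len);
    n += chacha_final(&S, out + n);          /* flushes the buffered partial block and wipes S */
    /* n == msg_len: a stream cipher emits exactly as many bytes as it consumes */
}
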
diff --git a/src/libcryptobox/chacha20/chacha.h b/src/libcryptobox/chacha20/chacha.h
new file mode 100644
index 0000000..d05088a
--- /dev/null
+++ b/src/libcryptobox/chacha20/chacha.h
@@ -0,0 +1,87 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015 Andrew Moon, Vsevolod Stakhov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+#ifndef CHACHA_H_
+#define CHACHA_H_
+
+
+#define CHACHA_BLOCKBYTES 64
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct chacha_state_internal_t {
+ unsigned char s[48];
+ size_t rounds;
+ size_t leftover;
+ unsigned char buffer[CHACHA_BLOCKBYTES];
+} chacha_state_internal;
+
+typedef struct chacha_state_t {
+ unsigned char opaque[128];
+} chacha_state;
+
+typedef struct chacha_key_t {
+ unsigned char b[32];
+} chacha_key;
+
+typedef struct chacha_iv_t {
+ unsigned char b[8];
+} chacha_iv;
+
+typedef struct chacha_iv24_t {
+ unsigned char b[24];
+} chacha_iv24;
+
+void hchacha(const unsigned char key[32], const unsigned char iv[16],
+ unsigned char out[32], size_t rounds);
+
+void chacha_init(chacha_state *S, const chacha_key *key, const chacha_iv *iv,
+ size_t rounds);
+
+void xchacha_init(chacha_state *S, const chacha_key *key,
+ const chacha_iv24 *iv, size_t rounds);
+
+size_t chacha_update(chacha_state *S, const unsigned char *in,
+ unsigned char *out, size_t inlen);
+
+size_t chacha_final(chacha_state *S, unsigned char *out);
+
+void chacha(const chacha_key *key, const chacha_iv *iv,
+ const unsigned char *in, unsigned char *out, size_t inlen,
+ size_t rounds);
+
+void xchacha(const chacha_key *key, const chacha_iv24 *iv,
+ const unsigned char *in, unsigned char *out, size_t inlen,
+ size_t rounds);
+
+const char *chacha_load(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CHACHA_H_ */
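
The one-shot entry points declared above differ only in nonce size: chacha() takes the 8-byte chacha_iv, while xchacha() takes the 24-byte chacha_iv24 and derives a subkey internally via hchacha(). A minimal sketch of both call shapes (names are illustrative; input and output are assumed word-aligned, as the one-shot comments in chacha.c require):

#include <stddef.h>
#include "chacha.h"

static void oneshot_examples(const chacha_key *key,
                             const chacha_iv *iv8, const chacha_iv24 *iv24,
                             const unsigned char *in, unsigned char *out, size_t len)
{
    /* real callers must supply a unique nonce per key; both calls overwrite out
     * and are shown only to illustrate the signatures */
    chacha(key, iv8, in, out, len, 20);     /* ChaCha20: 8-byte nonce */
    xchacha(key, iv24, in, out, len, 20);   /* XChaCha20: 24-byte nonce, subkey via hchacha() */
}
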
diff --git a/src/libcryptobox/chacha20/constants.S b/src/libcryptobox/chacha20/constants.S
new file mode 100644
index 0000000..ff109a3
--- /dev/null
+++ b/src/libcryptobox/chacha20/constants.S
@@ -0,0 +1,6 @@
+SECTION_RODATA
+.p2align 4,,15
+chacha_constants:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 /* "expand 32-byte k" */
+.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 /* pshufb rotate by 16 */
+.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 /* pshufb rotate by 8 */
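
The two .byte tables above are pshufb masks: permuting the bytes of each little-endian 32-bit lane as 2,3,0,1 rotates the lane left by 16 bits, and 3,0,1,2 rotates it left by 8, which lets the SIMD code do those two quarter-round rotations with a single shuffle. A scalar C sketch of the same identity (illustrative only, not part of the commit):

#include <stdint.h>
#include <string.h>

/* rotate a little-endian 32-bit word left by `bytes` whole bytes using only a
 * byte permutation, mirroring what the masks above do within each lane */
static uint32_t rotl32_by_bytes(uint32_t x, int bytes)
{
    unsigned char src[4], dst[4];
    int i;

    memcpy(src, &x, 4);                    /* little-endian layout assumed */
    for (i = 0; i < 4; i++)
        dst[i] = src[(i + 4 - bytes) % 4]; /* bytes==2 -> 2,3,0,1; bytes==1 -> 3,0,1,2 */
    memcpy(&x, dst, 4);
    return x;                              /* equals a left rotate by 8*bytes bits */
}
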
diff --git a/src/libcryptobox/chacha20/ref.c b/src/libcryptobox/chacha20/ref.c
new file mode 100644
index 0000000..ee646db
--- /dev/null
+++ b/src/libcryptobox/chacha20/ref.c
@@ -0,0 +1,272 @@
+#include "config.h"
+#include "chacha.h"
+#include "cryptobox.h"
+
+#if defined(HAVE_INT32)
+typedef uint32_t chacha_int32;
+#else
+typedef guint32 chacha_int32;
+#endif
+
+/* interpret four 8 bit unsigned integers as a 32 bit unsigned integer in little endian */
+static chacha_int32
+U8TO32(const unsigned char *p)
+{
+ return (((chacha_int32) (p[0])) |
+ ((chacha_int32) (p[1]) << 8) |
+ ((chacha_int32) (p[2]) << 16) |
+ ((chacha_int32) (p[3]) << 24));
+}
+
+/* store a 32-bit unsigned integer as four 8-bit unsigned integers in little endian */
+static void
+U32TO8(unsigned char *p, chacha_int32 v)
+{
+ p[0] = (v) & 0xff;
+ p[1] = (v >> 8) & 0xff;
+ p[2] = (v >> 16) & 0xff;
+ p[3] = (v >> 24) & 0xff;
+}
+
+/* 32-bit left rotate */
+static chacha_int32
+ROTL32(chacha_int32 x, int k)
+{
+ return ((x << k) | (x >> (32 - k))) & 0xffffffff;
+}
+
+/* "expand 32-byte k", as 4 little endian 32-bit unsigned integers */
+static const chacha_int32 chacha_constants[4] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};
+
+void chacha_blocks_ref(chacha_state_internal *state, const unsigned char *in, unsigned char *out, size_t bytes)
+{
+ chacha_int32 x[16], j[12];
+ chacha_int32 t;
+ unsigned char *ctarget = out, tmp[64];
+ size_t i, r;
+
+ if (!bytes) return;
+
+ j[0] = U8TO32(state->s + 0);
+ j[1] = U8TO32(state->s + 4);
+ j[2] = U8TO32(state->s + 8);
+ j[3] = U8TO32(state->s + 12);
+ j[4] = U8TO32(state->s + 16);
+ j[5] = U8TO32(state->s + 20);
+ j[6] = U8TO32(state->s + 24);
+ j[7] = U8TO32(state->s + 28);
+ j[8] = U8TO32(state->s + 32);
+ j[9] = U8TO32(state->s + 36);
+ j[10] = U8TO32(state->s + 40);
+ j[11] = U8TO32(state->s + 44);
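+ /* j[0..7] = key words, j[8..9] = 64-bit block counter, j[10..11] = nonce */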
+
+ r = state->rounds;
+
+ for (;;) {
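+ /* for the final partial block, stage data through the local tmp buffer and copy the tail back to ctarget below */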
+ if (bytes < 64) {
+ if (in) {
+ for (i = 0; i < bytes; i++) tmp[i] = in[i];
+ in = tmp;
+ }
+ ctarget = out;
+ out = tmp;
+ }
+
+ x[0] = chacha_constants[0];
+ x[1] = chacha_constants[1];
+ x[2] = chacha_constants[2];
+ x[3] = chacha_constants[3];
+ x[4] = j[0];
+ x[5] = j[1];
+ x[6] = j[2];
+ x[7] = j[3];
+ x[8] = j[4];
+ x[9] = j[5];
+ x[10] = j[6];
+ x[11] = j[7];
+ x[12] = j[8];
+ x[13] = j[9];
+ x[14] = j[10];
+ x[15] = j[11];
+
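+/* ChaCha quarter round on state words a, b, c, d (t is the scratch temporary declared above) */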
+#define quarter(a, b, c, d) \
+ a += b; \
+ t = d ^ a; \
+ d = ROTL32(t, 16); \
+ c += d; \
+ t = b ^ c; \
+ b = ROTL32(t, 12); \
+ a += b; \
+ t = d ^ a; \
+ d = ROTL32(t, 8); \
+ c += d; \
+ t = b ^ c; \
+ b = ROTL32(t, 7);
+
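+/* one column round followed by one diagonal round */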
+#define doubleround() \
+ quarter(x[0], x[4], x[8], x[12]) \
+ quarter(x[1], x[5], x[9], x[13]) \
+ quarter(x[2], x[6], x[10], x[14]) \
+ quarter(x[3], x[7], x[11], x[15]) \
+ quarter(x[0], x[5], x[10], x[15]) \
+ quarter(x[1], x[6], x[11], x[12]) \
+ quarter(x[2], x[7], x[8], x[13]) \
+ quarter(x[3], x[4], x[9], x[14])
+
+ i = r;
+ do {
+ doubleround()
+ i -= 2;
+ } while (i);
+
+ x[0] += chacha_constants[0];
+ x[1] += chacha_constants[1];
+ x[2] += chacha_constants[2];
+ x[3] += chacha_constants[3];
+ x[4] += j[0];
+ x[5] += j[1];
+ x[6] += j[2];
+ x[7] += j[3];
+ x[8] += j[4];
+ x[9] += j[5];
+ x[10] += j[6];
+ x[11] += j[7];
+ x[12] += j[8];
+ x[13] += j[9];
+ x[14] += j[10];
+ x[15] += j[11];
+
+ if (in) {
+ U32TO8(out + 0, x[0] ^ U8TO32(in + 0));
+ U32TO8(out + 4, x[1] ^ U8TO32(in + 4));
+ U32TO8(out + 8, x[2] ^ U8TO32(in + 8));
+ U32TO8(out + 12, x[3] ^ U8TO32(in + 12));
+ U32TO8(out + 16, x[4] ^ U8TO32(in + 16));
+ U32TO8(out + 20, x[5] ^ U8TO32(in + 20));
+ U32TO8(out + 24, x[6] ^ U8TO32(in + 24));
+ U32TO8(out + 28, x[7] ^ U8TO32(in + 28));
+ U32TO8(out + 32, x[8] ^ U8TO32(in + 32));
+ U32TO8(out + 36, x[9] ^ U8TO32(in + 36));
+ U32TO8(out + 40, x[10] ^ U8TO32(in + 40));
+ U32TO8(out + 44, x[11] ^ U8TO32(in + 44));
+ U32TO8(out + 48, x[12] ^ U8TO32(in + 48));
+ U32TO8(out + 52, x[13] ^ U8TO32(in + 52));
+ U32TO8(out + 56, x[14] ^ U8TO32(in + 56));
+ U32TO8(out + 60, x[15] ^ U8TO32(in + 60));
+ in += 64;
+ }
+ else {
+ U32TO8(out + 0, x[0]);
+ U32TO8(out + 4, x[1]);
+ U32TO8(out + 8, x[2]);
+ U32TO8(out + 12, x[3]);
+ U32TO8(out + 16, x[4]);
+ U32TO8(out + 20, x[5]);
+ U32TO8(out + 24, x[6]);
+ U32TO8(out + 28, x[7]);
+ U32TO8(out + 32, x[8]);
+ U32TO8(out + 36, x[9]);
+ U32TO8(out + 40, x[10]);
+ U32TO8(out + 44, x[11]);
+ U32TO8(out + 48, x[12]);
+ U32TO8(out + 52, x[13]);
+ U32TO8(out + 56, x[14]);
+ U32TO8(out + 60, x[15]);
+ }
+
+ /* increment the 64-bit counter, split into two 32-bit halves */
+ j[8]++;
+ if (!j[8])
+ j[9]++;
+
+ if (bytes <= 64) {
+ if (bytes < 64)
+ for (i = 0; i < bytes; i++) ctarget[i] = out[i];
+
+ /* store the counter back to the state */
+ U32TO8(state->s + 32, j[8]);
+ U32TO8(state->s + 36, j[9]);
+ goto cleanup;
+ }
+ bytes -= 64;
+ out += 64;
+ }
+
+cleanup:
+ rspamd_explicit_memzero(j, sizeof(j));
+}
+
+void hchacha_ref(const unsigned char key[32], const unsigned char iv[16], unsigned char out[32], size_t rounds)
+{
+ chacha_int32 x[16];
+ chacha_int32 t;
+
+ x[0] = chacha_constants[0];
+ x[1] = chacha_constants[1];
+ x[2] = chacha_constants[2];
+ x[3] = chacha_constants[3];
+ x[4] = U8TO32(key + 0);
+ x[5] = U8TO32(key + 4);
+ x[6] = U8TO32(key + 8);
+ x[7] = U8TO32(key + 12);
+ x[8] = U8TO32(key + 16);
+ x[9] = U8TO32(key + 20);
+ x[10] = U8TO32(key + 24);
+ x[11] = U8TO32(key + 28);
+ x[12] = U8TO32(iv + 0);
+ x[13] = U8TO32(iv + 4);
+ x[14] = U8TO32(iv + 8);
+ x[15] = U8TO32(iv + 12);
+
+ do {
+ doubleround()
+ rounds -= 2;
+ } while (rounds);
+
+ /* output the state words that held the constant (x[0..3]) */
+ U32TO8(out + 0, x[0]);
+ U32TO8(out + 4, x[1]);
+ U32TO8(out + 8, x[2]);
+ U32TO8(out + 12, x[3]);
+
+ /* output the state words that held the iv (x[12..15]) */
+ U32TO8(out + 16, x[12]);
+ U32TO8(out + 20, x[13]);
+ U32TO8(out + 24, x[14]);
+ U32TO8(out + 28, x[15]);
+}
+
+void chacha_clear_state_ref(chacha_state_internal *state)
+{
+ rspamd_explicit_memzero(state, 48);
+}
+
+void chacha_ref(const chacha_key *key, const chacha_iv *iv, const unsigned char *in, unsigned char *out, size_t inlen, size_t rounds)
+{
+ chacha_state_internal state;
+ size_t i;
+ for (i = 0; i < 32; i++)
+ state.s[i + 0] = key->b[i];
+ for (i = 0; i < 8; i++)
+ state.s[i + 32] = 0;
+ for (i = 0; i < 8; i++)
+ state.s[i + 40] = iv->b[i];
+ state.rounds = rounds;
+ chacha_blocks_ref(&state, in, out, inlen);
+ chacha_clear_state_ref(&state);
+}
+
+void xchacha_ref(const chacha_key *key, const chacha_iv24 *iv, const unsigned char *in, unsigned char *out, size_t inlen, size_t rounds)
+{
+ chacha_state_internal state;
+ size_t i;
+ hchacha_ref(key->b, iv->b, &state.s[0], rounds);
+ for (i = 0; i < 8; i++)
+ state.s[i + 32] = 0;
+ for (i = 0; i < 8; i++)
+ state.s[i + 40] = iv->b[i + 16];
+ state.rounds = rounds;
+ chacha_blocks_ref(&state, in, out, inlen);
+ chacha_clear_state_ref(&state);
+}
diff --git a/src/libcryptobox/chacha20/sse2.S b/src/libcryptobox/chacha20/sse2.S
new file mode 100644
index 0000000..a91d095
--- /dev/null
+++ b/src/libcryptobox/chacha20/sse2.S
@@ -0,0 +1,734 @@
+#include "../macro.S"
+#include "constants.S"
+SECTION_TEXT
+
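+/*
+ * chacha_blocks_sse2 - same prototype as chacha_blocks_ref:
+ *   (chacha_state_internal *state, const unsigned char *in, unsigned char *out, size_t bytes)
+ * System V AMD64: rdi = state, rsi = in (NULL emits raw keystream), rdx = out, rcx = bytes.
+ */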
+GLOBAL_HIDDEN_FN chacha_blocks_sse2
+chacha_blocks_sse2_local:
+pushq %rbx
+pushq %rbp
+movq %rsp, %rbp
+andq $~63, %rsp
+subq $512, %rsp
+movq $0x3320646e61707865, %rax
+movq $0x6b20657479622d32, %r8
+movd %rax, %xmm8
+movd %r8, %xmm14
+punpcklqdq %xmm14, %xmm8
+movdqu 0(%rdi), %xmm9
+movdqu 16(%rdi), %xmm10
+movdqu 32(%rdi), %xmm11
+movq 48(%rdi), %rax
+movq $1, %r9
+movdqa %xmm8, 0(%rsp)
+movdqa %xmm9, 16(%rsp)
+movdqa %xmm10, 32(%rsp)
+movdqa %xmm11, 48(%rsp)
+movq %rax, 64(%rsp)
+cmpq $256, %rcx
+jb chacha_blocks_sse2_below256
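+/* 256+ bytes: broadcast each state word across an xmm register to run four blocks in parallel */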
+pshufd $0x00, %xmm8, %xmm0
+pshufd $0x55, %xmm8, %xmm1
+pshufd $0xaa, %xmm8, %xmm2
+pshufd $0xff, %xmm8, %xmm3
+movdqa %xmm0, 128(%rsp)
+movdqa %xmm1, 144(%rsp)
+movdqa %xmm2, 160(%rsp)
+movdqa %xmm3, 176(%rsp)
+pshufd $0x00, %xmm9, %xmm0
+pshufd $0x55, %xmm9, %xmm1
+pshufd $0xaa, %xmm9, %xmm2
+pshufd $0xff, %xmm9, %xmm3
+movdqa %xmm0, 192(%rsp)
+movdqa %xmm1, 208(%rsp)
+movdqa %xmm2, 224(%rsp)
+movdqa %xmm3, 240(%rsp)
+pshufd $0x00, %xmm10, %xmm0
+pshufd $0x55, %xmm10, %xmm1
+pshufd $0xaa, %xmm10, %xmm2
+pshufd $0xff, %xmm10, %xmm3
+movdqa %xmm0, 256(%rsp)
+movdqa %xmm1, 272(%rsp)
+movdqa %xmm2, 288(%rsp)
+movdqa %xmm3, 304(%rsp)
+pshufd $0xaa, %xmm11, %xmm0
+pshufd $0xff, %xmm11, %xmm1
+movdqa %xmm0, 352(%rsp)
+movdqa %xmm1, 368(%rsp)
+jmp chacha_blocks_sse2_atleast256
+.p2align 6,,63
+chacha_blocks_sse2_atleast256:
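+/* load four consecutive block counters (n..n+3) for this batch and advance the saved counter by 4 */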
+movq 48(%rsp), %rax
+leaq 1(%rax), %r8
+leaq 2(%rax), %r9
+leaq 3(%rax), %r10
+leaq 4(%rax), %rbx
+movl %eax, 320(%rsp)
+movl %r8d, 4+320(%rsp)
+movl %r9d, 8+320(%rsp)
+movl %r10d, 12+320(%rsp)
+shrq $32, %rax
+shrq $32, %r8
+shrq $32, %r9
+shrq $32, %r10
+movl %eax, 336(%rsp)
+movl %r8d, 4+336(%rsp)
+movl %r9d, 8+336(%rsp)
+movl %r10d, 12+336(%rsp)
+movq %rbx, 48(%rsp)
+movq 64(%rsp), %rax
+movdqa 128(%rsp), %xmm0
+movdqa 144(%rsp), %xmm1
+movdqa 160(%rsp), %xmm2
+movdqa 176(%rsp), %xmm3
+movdqa 192(%rsp), %xmm4
+movdqa 208(%rsp), %xmm5
+movdqa 224(%rsp), %xmm6
+movdqa 240(%rsp), %xmm7
+movdqa 256(%rsp), %xmm8
+movdqa 272(%rsp), %xmm9
+movdqa 288(%rsp), %xmm10
+movdqa 304(%rsp), %xmm11
+movdqa 320(%rsp), %xmm12
+movdqa 336(%rsp), %xmm13
+movdqa 352(%rsp), %xmm14
+movdqa 368(%rsp), %xmm15
+chacha_blocks_sse2_mainloop1:
+paddd %xmm4, %xmm0
+paddd %xmm5, %xmm1
+pxor %xmm0, %xmm12
+pxor %xmm1, %xmm13
+paddd %xmm6, %xmm2
+paddd %xmm7, %xmm3
+movdqa %xmm6, 96(%rsp)
+pxor %xmm2, %xmm14
+pxor %xmm3, %xmm15
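+/* rotate each 32-bit lane left by 16 by swapping its 16-bit halves (SSE2 has no pshufb) */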
+pshuflw $0xb1,%xmm12,%xmm12
+pshufhw $0xb1,%xmm12,%xmm12
+pshuflw $0xb1,%xmm13,%xmm13
+pshufhw $0xb1,%xmm13,%xmm13
+pshuflw $0xb1,%xmm14,%xmm14
+pshufhw $0xb1,%xmm14,%xmm14
+pshuflw $0xb1,%xmm15,%xmm15
+pshufhw $0xb1,%xmm15,%xmm15
+paddd %xmm12, %xmm8
+paddd %xmm13, %xmm9
+paddd %xmm14, %xmm10
+paddd %xmm15, %xmm11
+movdqa %xmm12, 112(%rsp)
+pxor %xmm8, %xmm4
+pxor %xmm9, %xmm5
+movdqa 96(%rsp), %xmm6
+movdqa %xmm4, %xmm12
+pslld $ 12, %xmm4
+psrld $20, %xmm12
+pxor %xmm12, %xmm4
+movdqa %xmm5, %xmm12
+pslld $ 12, %xmm5
+psrld $20, %xmm12
+pxor %xmm12, %xmm5
+pxor %xmm10, %xmm6
+pxor %xmm11, %xmm7
+movdqa %xmm6, %xmm12
+pslld $ 12, %xmm6
+psrld $20, %xmm12
+pxor %xmm12, %xmm6
+movdqa %xmm7, %xmm12
+pslld $ 12, %xmm7
+psrld $20, %xmm12
+pxor %xmm12, %xmm7
+movdqa 112(%rsp), %xmm12
+paddd %xmm4, %xmm0
+paddd %xmm5, %xmm1
+pxor %xmm0, %xmm12
+pxor %xmm1, %xmm13
+paddd %xmm6, %xmm2
+paddd %xmm7, %xmm3
+movdqa %xmm6, 96(%rsp)
+pxor %xmm2, %xmm14
+pxor %xmm3, %xmm15
+movdqa %xmm12, %xmm6
+pslld $ 8, %xmm12
+psrld $24, %xmm6
+pxor %xmm6, %xmm12
+movdqa %xmm13, %xmm6
+pslld $ 8, %xmm13
+psrld $24, %xmm6
+pxor %xmm6, %xmm13
+paddd %xmm12, %xmm8
+paddd %xmm13, %xmm9
+movdqa %xmm14, %xmm6
+pslld $ 8, %xmm14
+psrld $24, %xmm6
+pxor %xmm6, %xmm14
+movdqa %xmm15, %xmm6
+pslld $ 8, %xmm15
+psrld $24, %xmm6
+pxor %xmm6, %xmm15
+paddd %xmm14, %xmm10
+paddd %xmm15, %xmm11
+movdqa %xmm12, 112(%rsp)
+pxor %xmm8, %xmm4
+pxor %xmm9, %xmm5
+movdqa 96(%rsp), %xmm6
+movdqa %xmm4, %xmm12
+pslld $ 7, %xmm4
+psrld $25, %xmm12
+pxor %xmm12, %xmm4
+movdqa %xmm5, %xmm12
+pslld $ 7, %xmm5
+psrld $25, %xmm12
+pxor %xmm12, %xmm5
+pxor %xmm10, %xmm6
+pxor %xmm11, %xmm7
+movdqa %xmm6, %xmm12
+pslld $ 7, %xmm6
+psrld $25, %xmm12
+pxor %xmm12, %xmm6
+movdqa %xmm7, %xmm12
+pslld $ 7, %xmm7
+psrld $25, %xmm12
+pxor %xmm12, %xmm7
+movdqa 112(%rsp), %xmm12
+paddd %xmm5, %xmm0
+paddd %xmm6, %xmm1
+pxor %xmm0, %xmm15
+pxor %xmm1, %xmm12
+paddd %xmm7, %xmm2
+paddd %xmm4, %xmm3
+movdqa %xmm7, 96(%rsp)
+pxor %xmm2, %xmm13
+pxor %xmm3, %xmm14
+pshuflw $0xb1,%xmm15,%xmm15
+pshufhw $0xb1,%xmm15,%xmm15
+pshuflw $0xb1,%xmm12,%xmm12
+pshufhw $0xb1,%xmm12,%xmm12
+pshuflw $0xb1,%xmm13,%xmm13
+pshufhw $0xb1,%xmm13,%xmm13
+pshuflw $0xb1,%xmm14,%xmm14
+pshufhw $0xb1,%xmm14,%xmm14
+paddd %xmm15, %xmm10
+paddd %xmm12, %xmm11
+paddd %xmm13, %xmm8
+paddd %xmm14, %xmm9
+movdqa %xmm15, 112(%rsp)
+pxor %xmm10, %xmm5
+pxor %xmm11, %xmm6
+movdqa 96(%rsp), %xmm7
+movdqa %xmm5, %xmm15
+pslld $ 12, %xmm5
+psrld $20, %xmm15
+pxor %xmm15, %xmm5
+movdqa %xmm6, %xmm15
+pslld $ 12, %xmm6
+psrld $20, %xmm15
+pxor %xmm15, %xmm6
+pxor %xmm8, %xmm7
+pxor %xmm9, %xmm4
+movdqa %xmm7, %xmm15
+pslld $ 12, %xmm7
+psrld $20, %xmm15
+pxor %xmm15, %xmm7
+movdqa %xmm4, %xmm15
+pslld $ 12, %xmm4
+psrld $20, %xmm15
+pxor %xmm15, %xmm4
+movdqa 112(%rsp), %xmm15
+paddd %xmm5, %xmm0
+paddd %xmm6, %xmm1
+pxor %xmm0, %xmm15
+pxor %xmm1, %xmm12
+paddd %xmm7, %xmm2
+paddd %xmm4, %xmm3
+movdqa %xmm7, 96(%rsp)
+pxor %xmm2, %xmm13
+pxor %xmm3, %xmm14
+movdqa %xmm15, %xmm7
+pslld $ 8, %xmm15
+psrld $24, %xmm7
+pxor %xmm7, %xmm15
+movdqa %xmm12, %xmm7
+pslld $ 8, %xmm12
+psrld $24, %xmm7
+pxor %xmm7, %xmm12
+paddd %xmm15, %xmm10
+paddd %xmm12, %xmm11
+movdqa %xmm13, %xmm7
+pslld $ 8, %xmm13
+psrld $24, %xmm7
+pxor %xmm7, %xmm13
+movdqa %xmm14, %xmm7
+pslld $ 8, %xmm14
+psrld $24, %xmm7
+pxor %xmm7, %xmm14
+paddd %xmm13, %xmm8
+paddd %xmm14, %xmm9
+movdqa %xmm15, 112(%rsp)
+pxor %xmm10, %xmm5
+pxor %xmm11, %xmm6
+movdqa 96(%rsp), %xmm7
+movdqa %xmm5, %xmm15
+pslld $ 7, %xmm5
+psrld $25, %xmm15
+pxor %xmm15, %xmm5
+movdqa %xmm6, %xmm15
+pslld $ 7, %xmm6
+psrld $25, %xmm15
+pxor %xmm15, %xmm6
+pxor %xmm8, %xmm7
+pxor %xmm9, %xmm4
+movdqa %xmm7, %xmm15
+pslld $ 7, %xmm7
+psrld $25, %xmm15
+pxor %xmm15, %xmm7
+movdqa %xmm4, %xmm15
+pslld $ 7, %xmm4
+psrld $25, %xmm15
+pxor %xmm15, %xmm4
+movdqa 112(%rsp), %xmm15
+subq $2, %rax
+jnz chacha_blocks_sse2_mainloop1
+paddd 128(%rsp), %xmm0
+paddd 144(%rsp), %xmm1
+paddd 160(%rsp), %xmm2
+paddd 176(%rsp), %xmm3
+paddd 192(%rsp), %xmm4
+paddd 208(%rsp), %xmm5
+paddd 224(%rsp), %xmm6
+paddd 240(%rsp), %xmm7
+paddd 256(%rsp), %xmm8
+paddd 272(%rsp), %xmm9
+paddd 288(%rsp), %xmm10
+paddd 304(%rsp), %xmm11
+paddd 320(%rsp), %xmm12
+paddd 336(%rsp), %xmm13
+paddd 352(%rsp), %xmm14
+paddd 368(%rsp), %xmm15
+movdqa %xmm8, 384(%rsp)
+movdqa %xmm9, 400(%rsp)
+movdqa %xmm10, 416(%rsp)
+movdqa %xmm11, 432(%rsp)
+movdqa %xmm12, 448(%rsp)
+movdqa %xmm13, 464(%rsp)
+movdqa %xmm14, 480(%rsp)
+movdqa %xmm15, 496(%rsp)
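+/* transpose words 0..7 of the four blocks so each register holds 16 contiguous output bytes */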
+movdqa %xmm0, %xmm8
+movdqa %xmm2, %xmm9
+movdqa %xmm4, %xmm10
+movdqa %xmm6, %xmm11
+punpckhdq %xmm1, %xmm0
+punpckhdq %xmm3, %xmm2
+punpckhdq %xmm5, %xmm4
+punpckhdq %xmm7, %xmm6
+punpckldq %xmm1, %xmm8
+punpckldq %xmm3, %xmm9
+punpckldq %xmm5, %xmm10
+punpckldq %xmm7, %xmm11
+movdqa %xmm0, %xmm1
+movdqa %xmm4, %xmm3
+movdqa %xmm8, %xmm5
+movdqa %xmm10, %xmm7
+punpckhqdq %xmm2, %xmm0
+punpckhqdq %xmm6, %xmm4
+punpckhqdq %xmm9, %xmm8
+punpckhqdq %xmm11, %xmm10
+punpcklqdq %xmm2, %xmm1
+punpcklqdq %xmm6, %xmm3
+punpcklqdq %xmm9, %xmm5
+punpcklqdq %xmm11, %xmm7
+andq %rsi, %rsi
+jz chacha_blocks_sse2_noinput1
+movdqu 0(%rsi), %xmm2
+movdqu 16(%rsi), %xmm6
+movdqu 64(%rsi), %xmm9
+movdqu 80(%rsi), %xmm11
+movdqu 128(%rsi), %xmm12
+movdqu 144(%rsi), %xmm13
+movdqu 192(%rsi), %xmm14
+movdqu 208(%rsi), %xmm15
+pxor %xmm2, %xmm5
+pxor %xmm6, %xmm7
+pxor %xmm9, %xmm8
+pxor %xmm11, %xmm10
+pxor %xmm12, %xmm1
+pxor %xmm13, %xmm3
+pxor %xmm14, %xmm0
+pxor %xmm15, %xmm4
+movdqu %xmm5, 0(%rdx)
+movdqu %xmm7, 16(%rdx)
+movdqu %xmm8, 64(%rdx)
+movdqu %xmm10, 80(%rdx)
+movdqu %xmm1, 128(%rdx)
+movdqu %xmm3, 144(%rdx)
+movdqu %xmm0, 192(%rdx)
+movdqu %xmm4, 208(%rdx)
+movdqa 384(%rsp), %xmm0
+movdqa 400(%rsp), %xmm1
+movdqa 416(%rsp), %xmm2
+movdqa 432(%rsp), %xmm3
+movdqa 448(%rsp), %xmm4
+movdqa 464(%rsp), %xmm5
+movdqa 480(%rsp), %xmm6
+movdqa 496(%rsp), %xmm7
+movdqa %xmm0, %xmm8
+movdqa %xmm2, %xmm9
+movdqa %xmm4, %xmm10
+movdqa %xmm6, %xmm11
+punpckldq %xmm1, %xmm8
+punpckldq %xmm3, %xmm9
+punpckhdq %xmm1, %xmm0
+punpckhdq %xmm3, %xmm2
+punpckldq %xmm5, %xmm10
+punpckldq %xmm7, %xmm11
+punpckhdq %xmm5, %xmm4
+punpckhdq %xmm7, %xmm6
+movdqa %xmm8, %xmm1
+movdqa %xmm0, %xmm3
+movdqa %xmm10, %xmm5
+movdqa %xmm4, %xmm7
+punpcklqdq %xmm9, %xmm1
+punpcklqdq %xmm11, %xmm5
+punpckhqdq %xmm9, %xmm8
+punpckhqdq %xmm11, %xmm10
+punpcklqdq %xmm2, %xmm3
+punpcklqdq %xmm6, %xmm7
+punpckhqdq %xmm2, %xmm0
+punpckhqdq %xmm6, %xmm4
+movdqu 32(%rsi), %xmm2
+movdqu 48(%rsi), %xmm6
+movdqu 96(%rsi), %xmm9
+movdqu 112(%rsi), %xmm11
+movdqu 160(%rsi), %xmm12
+movdqu 176(%rsi), %xmm13
+movdqu 224(%rsi), %xmm14
+movdqu 240(%rsi), %xmm15
+pxor %xmm2, %xmm1
+pxor %xmm6, %xmm5
+pxor %xmm9, %xmm8
+pxor %xmm11, %xmm10
+pxor %xmm12, %xmm3
+pxor %xmm13, %xmm7
+pxor %xmm14, %xmm0
+pxor %xmm15, %xmm4
+movdqu %xmm1, 32(%rdx)
+movdqu %xmm5, 48(%rdx)
+movdqu %xmm8, 96(%rdx)
+movdqu %xmm10, 112(%rdx)
+movdqu %xmm3, 160(%rdx)
+movdqu %xmm7, 176(%rdx)
+movdqu %xmm0, 224(%rdx)
+movdqu %xmm4, 240(%rdx)
+addq $256, %rsi
+jmp chacha_blocks_sse2_mainloop_cont
+chacha_blocks_sse2_noinput1:
+movdqu %xmm5, 0(%rdx)
+movdqu %xmm7, 16(%rdx)
+movdqu %xmm8, 64(%rdx)
+movdqu %xmm10, 80(%rdx)
+movdqu %xmm1, 128(%rdx)
+movdqu %xmm3, 144(%rdx)
+movdqu %xmm0, 192(%rdx)
+movdqu %xmm4, 208(%rdx)
+movdqa 384(%rsp), %xmm0
+movdqa 400(%rsp), %xmm1
+movdqa 416(%rsp), %xmm2
+movdqa 432(%rsp), %xmm3
+movdqa 448(%rsp), %xmm4
+movdqa 464(%rsp), %xmm5
+movdqa 480(%rsp), %xmm6
+movdqa 496(%rsp), %xmm7
+movdqa %xmm0, %xmm8
+movdqa %xmm2, %xmm9
+movdqa %xmm4, %xmm10
+movdqa %xmm6, %xmm11
+punpckldq %xmm1, %xmm8
+punpckldq %xmm3, %xmm9
+punpckhdq %xmm1, %xmm0
+punpckhdq %xmm3, %xmm2
+punpckldq %xmm5, %xmm10
+punpckldq %xmm7, %xmm11
+punpckhdq %xmm5, %xmm4
+punpckhdq %xmm7, %xmm6
+movdqa %xmm8, %xmm1
+movdqa %xmm0, %xmm3
+movdqa %xmm10, %xmm5
+movdqa %xmm4, %xmm7
+punpcklqdq %xmm9, %xmm1
+punpcklqdq %xmm11, %xmm5
+punpckhqdq %xmm9, %xmm8
+punpckhqdq %xmm11, %xmm10
+punpcklqdq %xmm2, %xmm3
+punpcklqdq %xmm6, %xmm7
+punpckhqdq %xmm2, %xmm0
+punpckhqdq %xmm6, %xmm4
+movdqu %xmm1, 32(%rdx)
+movdqu %xmm5, 48(%rdx)
+movdqu %xmm8, 96(%rdx)
+movdqu %xmm10, 112(%rdx)
+movdqu %xmm3, 160(%rdx)
+movdqu %xmm7, 176(%rdx)
+movdqu %xmm0, 224(%rdx)
+movdqu %xmm4, 240(%rdx)
+chacha_blocks_sse2_mainloop_cont:
+addq $256, %rdx
+subq $256, %rcx
+cmpq $256, %rcx
+jae chacha_blocks_sse2_atleast256
+movdqa 0(%rsp), %xmm8
+movdqa 16(%rsp), %xmm9
+movdqa 32(%rsp), %xmm10
+movdqa 48(%rsp), %xmm11
+movq $1, %r9
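+/* fewer than 256 bytes remain: process one 64-byte block at a time */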
+chacha_blocks_sse2_below256:
+movq %r9, %xmm5
+andq %rcx, %rcx
+jz chacha_blocks_sse2_done
+cmpq $64, %rcx
+jae chacha_blocks_sse2_above63
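+/* fewer than 64 bytes: stage the (optional) input and the output block through the aligned stack buffer, then copy just the tail to the caller's buffer */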
+movq %rdx, %r9
+andq %rsi, %rsi
+jz chacha_blocks_sse2_noinput2
+movq %rcx, %r10
+movq %rsp, %rdx
+addq %r10, %rsi
+addq %r10, %rdx
+negq %r10
+chacha_blocks_sse2_copyinput:
+movb (%rsi, %r10), %al
+movb %al, (%rdx, %r10)
+incq %r10
+jnz chacha_blocks_sse2_copyinput
+movq %rsp, %rsi
+chacha_blocks_sse2_noinput2:
+movq %rsp, %rdx
+chacha_blocks_sse2_above63:
+movdqa %xmm8, %xmm0
+movdqa %xmm9, %xmm1
+movdqa %xmm10, %xmm2
+movdqa %xmm11, %xmm3
+movq 64(%rsp), %rax
+chacha_blocks_sse2_mainloop2:
+paddd %xmm1, %xmm0
+pxor %xmm0, %xmm3
+pshuflw $0xb1,%xmm3,%xmm3
+pshufhw $0xb1,%xmm3,%xmm3
+paddd %xmm3, %xmm2
+pxor %xmm2, %xmm1
+movdqa %xmm1,%xmm4
+pslld $12, %xmm1
+psrld $20, %xmm4
+pxor %xmm4, %xmm1
+paddd %xmm1, %xmm0
+pxor %xmm0, %xmm3
+movdqa %xmm3,%xmm4
+pslld $8, %xmm3
+psrld $24, %xmm4
+pshufd $0x93,%xmm0,%xmm0
+pxor %xmm4, %xmm3
+paddd %xmm3, %xmm2
+pshufd $0x4e,%xmm3,%xmm3
+pxor %xmm2, %xmm1
+pshufd $0x39,%xmm2,%xmm2
+movdqa %xmm1,%xmm4
+pslld $7, %xmm1
+psrld $25, %xmm4
+pxor %xmm4, %xmm1
+subq $2, %rax
+paddd %xmm1, %xmm0
+pxor %xmm0, %xmm3
+pshuflw $0xb1,%xmm3,%xmm3
+pshufhw $0xb1,%xmm3,%xmm3
+paddd %xmm3, %xmm2
+pxor %xmm2, %xmm1
+movdqa %xmm1,%xmm4
+pslld $12, %xmm1
+psrld $20, %xmm4
+pxor %xmm4, %xmm1
+paddd %xmm1, %xmm0
+pxor %xmm0, %xmm3
+movdqa %xmm3,%xmm4
+pslld $8, %xmm3
+psrld $24, %xmm4
+pshufd $0x39,%xmm0,%xmm0
+pxor %xmm4, %xmm3
+paddd %xmm3, %xmm2
+pshufd $0x4e,%xmm3,%xmm3
+pxor %xmm2, %xmm1
+pshufd $0x93,%xmm2,%xmm2
+movdqa %xmm1,%xmm4
+pslld $7, %xmm1
+psrld $25, %xmm4
+pxor %xmm4, %xmm1
+jnz chacha_blocks_sse2_mainloop2
+paddd %xmm8, %xmm0
+paddd %xmm9, %xmm1
+paddd %xmm10, %xmm2
+paddd %xmm11, %xmm3
+andq %rsi, %rsi
+jz chacha_blocks_sse2_noinput3
+movdqu 0(%rsi), %xmm12
+movdqu 16(%rsi), %xmm13
+movdqu 32(%rsi), %xmm14
+movdqu 48(%rsi), %xmm15
+pxor %xmm12, %xmm0
+pxor %xmm13, %xmm1
+pxor %xmm14, %xmm2
+pxor %xmm15, %xmm3
+addq $64, %rsi
+chacha_blocks_sse2_noinput3:
+movdqu %xmm0, 0(%rdx)
+movdqu %xmm1, 16(%rdx)
+movdqu %xmm2, 32(%rdx)
+movdqu %xmm3, 48(%rdx)
+paddq %xmm5, %xmm11
+cmpq $64, %rcx
+jbe chacha_blocks_sse2_mainloop2_finishup
+addq $64, %rdx
+subq $64, %rcx
+jmp chacha_blocks_sse2_below256
+chacha_blocks_sse2_mainloop2_finishup:
+cmpq $64, %rcx
+je chacha_blocks_sse2_done
+addq %rcx, %r9
+addq %rcx, %rdx
+negq %rcx
+chacha_blocks_sse2_copyoutput:
+movb (%rdx, %rcx), %al
+movb %al, (%r9, %rcx)
+incq %rcx
+jnz chacha_blocks_sse2_copyoutput
+chacha_blocks_sse2_done:
+movdqu %xmm11, 32(%rdi)
+movq %rbp, %rsp
+popq %rbp
+popq %rbx
+ret
+FN_END chacha_blocks_sse2
+
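+/*
+ * hchacha_sse2(key, iv, out, rounds):
+ *   rdi = 32-byte key, rsi = 16-byte iv, rdx = 32-byte output, rcx = rounds.
+ */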
+GLOBAL_HIDDEN_FN hchacha_sse2
+hchacha_sse2_local:
+movq $0x3320646e61707865, %rax
+movq $0x6b20657479622d32, %r8
+movd %rax, %xmm0
+movd %r8, %xmm4
+punpcklqdq %xmm4, %xmm0
+movdqu 0(%rdi), %xmm1
+movdqu 16(%rdi), %xmm2
+movdqu 0(%rsi), %xmm3
+hchacha_sse2_mainloop:
+paddd %xmm1, %xmm0
+pxor %xmm0, %xmm3
+pshuflw $0xb1,%xmm3,%xmm3
+pshufhw $0xb1,%xmm3,%xmm3
+paddd %xmm3, %xmm2
+pxor %xmm2, %xmm1
+movdqa %xmm1,%xmm4
+pslld $12, %xmm1
+psrld $20, %xmm4
+pxor %xmm4, %xmm1
+paddd %xmm1, %xmm0
+pxor %xmm0, %xmm3
+movdqa %xmm3,%xmm4
+pslld $8, %xmm3
+psrld $24, %xmm4
+pshufd $0x93,%xmm0,%xmm0
+pxor %xmm4, %xmm3
+paddd %xmm3, %xmm2
+pshufd $0x4e,%xmm3,%xmm3
+pxor %xmm2, %xmm1
+pshufd $0x39,%xmm2,%xmm2
+movdqa %xmm1,%xmm4
+pslld $7, %xmm1
+psrld $25, %xmm4
+pxor %xmm4, %xmm1
+subq $2, %rcx
+paddd %xmm1, %xmm0
+pxor %xmm0, %xmm3
+pshuflw $0xb1,%xmm3,%xmm3
+pshufhw $0xb1,%xmm3,%xmm3
+paddd %xmm3, %xmm2
+pxor %xmm2, %xmm1
+movdqa %xmm1,%xmm4
+pslld $12, %xmm1
+psrld $20, %xmm4
+pxor %xmm4, %xmm1
+paddd %xmm1, %xmm0
+pxor %xmm0, %xmm3
+movdqa %xmm3,%xmm4
+pslld $8, %xmm3
+psrld $24, %xmm4
+pshufd $0x39,%xmm0,%xmm0
+pxor %xmm4, %xmm3
+paddd %xmm3, %xmm2
+pshufd $0x4e,%xmm3,%xmm3
+pxor %xmm2, %xmm1
+pshufd $0x93,%xmm2,%xmm2
+movdqa %xmm1,%xmm4
+pslld $7, %xmm1
+psrld $25, %xmm4
+pxor %xmm4, %xmm1
+ja hchacha_sse2_mainloop
+movdqu %xmm0, 0(%rdx)
+movdqu %xmm3, 16(%rdx)
+ret
+FN_END hchacha_sse2
+
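+/*
+ * chacha_sse2: build a chacha_state_internal on the aligned stack,
+ * run chacha_blocks_sse2_local over the whole input, then wipe the
+ * stack copy of the key material.
+ */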
+GLOBAL_HIDDEN_FN_EXT chacha_sse2, 6, 16
+pushq %rbp
+movq %rsp, %rbp
+subq $64, %rsp
+andq $~63, %rsp
+movdqu 0(%rdi), %xmm0
+movdqu 16(%rdi), %xmm1
+movdqa %xmm0, 0(%rsp)
+movdqa %xmm1, 16(%rsp)
+xorq %rdi, %rdi
+movq %rdi, 32(%rsp)
+movq 0(%rsi), %rsi
+movq %rsi, 40(%rsp)
+movq %r9, 48(%rsp)
+movq %rsp, %rdi
+movq %rdx, %rsi
+movq %rcx, %rdx
+movq %r8, %rcx
+call chacha_blocks_sse2_local
+pxor %xmm0, %xmm0
+movdqa %xmm0, 0(%rsp)
+movdqa %xmm0, 16(%rsp)
+movdqa %xmm0, 32(%rsp)
+movq %rbp, %rsp
+popq %rbp
+ret
+FN_END chacha_sse2
+
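+/*
+ * xchacha_sse2: derive the subkey with hchacha_sse2_local from the first
+ * 16 bytes of the 24-byte nonce, use the last 8 bytes as the stream nonce,
+ * then stream with chacha_blocks_sse2_local and wipe the stack state.
+ */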
+GLOBAL_HIDDEN_FN_EXT xchacha_sse2, 6, 16
+pushq %rbp
+pushq %rbx
+movq %rsp, %rbp
+subq $64, %rsp
+andq $~63, %rsp
+movq %rsp, %rbx
+xorq %rax, %rax
+movq %rax, 32(%rbx)
+movq 16(%rsi), %rax
+movq %rax, 40(%rbx)
+movq %r9, 48(%rbx)
+pushq %rdx
+pushq %rcx
+pushq %r8
+movq %rbx, %rdx
+movq %r9, %rcx
+call hchacha_sse2_local
+movq %rbx, %rdi
+popq %rcx
+popq %rdx
+popq %rsi
+call chacha_blocks_sse2_local
+pxor %xmm0, %xmm0
+movdqa %xmm0, 0(%rbx)
+movdqa %xmm0, 16(%rbx)
+movdqa %xmm0, 32(%rbx)
+movq %rbp, %rsp
+popq %rbx
+popq %rbp
+ret
+FN_END xchacha_sse2