Diffstat (limited to 'comm/third_party/libgcrypt/cipher/salsa20-amd64.S')
-rw-r--r-- | comm/third_party/libgcrypt/cipher/salsa20-amd64.S | 940 |
1 file changed, 940 insertions, 0 deletions
diff --git a/comm/third_party/libgcrypt/cipher/salsa20-amd64.S b/comm/third_party/libgcrypt/cipher/salsa20-amd64.S
new file mode 100644
index 0000000000..ae8f27155a
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/salsa20-amd64.S
@@ -0,0 +1,940 @@
+/* salsa20-amd64.S - AMD64 implementation of Salsa20
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by D. J. Bernstein at
+ *  http://cr.yp.to/snuffle.html
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SALSA20)
+
+#include "asm-common-amd64.h"
+
+.text
+
+.align 8
+.globl _gcry_salsa20_amd64_keysetup
+ELF(.type _gcry_salsa20_amd64_keysetup,@function;)
+_gcry_salsa20_amd64_keysetup:
+	CFI_STARTPROC();
+	movl 0(%rsi),%r8d
+	movl 4(%rsi),%r9d
+	movl 8(%rsi),%eax
+	movl 12(%rsi),%r10d
+	movl %r8d,20(%rdi)
+	movl %r9d,40(%rdi)
+	movl %eax,60(%rdi)
+	movl %r10d,48(%rdi)
+	cmp $256,%rdx
+	jb .L_kbits128
+.L_kbits256:
+	movl 16(%rsi),%edx
+	movl 20(%rsi),%ecx
+	movl 24(%rsi),%r8d
+	movl 28(%rsi),%esi
+	movl %edx,28(%rdi)
+	movl %ecx,16(%rdi)
+	movl %r8d,36(%rdi)
+	movl %esi,56(%rdi)
+	mov $1634760805,%rsi
+	mov $857760878,%rdx
+	mov $2036477234,%rcx
+	mov $1797285236,%r8
+	movl %esi,0(%rdi)
+	movl %edx,4(%rdi)
+	movl %ecx,8(%rdi)
+	movl %r8d,12(%rdi)
+	jmp .L_keysetupdone
+.L_kbits128:
+	movl 0(%rsi),%edx
+	movl 4(%rsi),%ecx
+	movl 8(%rsi),%r8d
+	movl 12(%rsi),%esi
+	movl %edx,28(%rdi)
+	movl %ecx,16(%rdi)
+	movl %r8d,36(%rdi)
+	movl %esi,56(%rdi)
+	mov $1634760805,%rsi
+	mov $824206446,%rdx
+	mov $2036477238,%rcx
+	mov $1797285236,%r8
+	movl %esi,0(%rdi)
+	movl %edx,4(%rdi)
+	movl %ecx,8(%rdi)
+	movl %r8d,12(%rdi)
+.L_keysetupdone:
+	ret
+	CFI_ENDPROC();
+
+.align 8
+.globl _gcry_salsa20_amd64_ivsetup
+ELF(.type _gcry_salsa20_amd64_ivsetup,@function;)
+_gcry_salsa20_amd64_ivsetup:
+	CFI_STARTPROC();
+	movl 0(%rsi),%r8d
+	movl 4(%rsi),%esi
+	mov $0,%r9
+	mov $0,%rax
+	movl %r8d,24(%rdi)
+	movl %esi,44(%rdi)
+	movl %r9d,32(%rdi)
+	movl %eax,52(%rdi)
+	ret
+	CFI_ENDPROC();
+
+.align 8
+.globl _gcry_salsa20_amd64_encrypt_blocks
+ELF(.type _gcry_salsa20_amd64_encrypt_blocks,@function;)
+_gcry_salsa20_amd64_encrypt_blocks:
+	/*
+	 * Modifications to original implementation:
+	 *  - Number of rounds passing in register %r8 (for Salsa20/12).
+	 *  - Length is input as number of blocks, so don't handle tail bytes
+	 *    (this is done in salsa20.c).
+	 */
+	CFI_STARTPROC();
+	push %rbx
+	CFI_PUSH(%rbx);
+	shlq $6, %rcx /* blocks to bytes */
+	mov %r8, %rbx
+	mov %rsp,%r11
+	CFI_DEF_CFA_REGISTER(%r11);
+	sub $384,%rsp
+	and $~31,%rsp
+	mov %rdi,%r8
+	mov %rsi,%rsi
+	mov %rdx,%rdi
+	mov %rcx,%rdx
+	cmp $0,%rdx
+	jbe .L_done
+.L_start:
+	cmp $256,%rdx
+	jb .L_bytes_are_64_128_or_192
+	movdqa 0(%r8),%xmm0
+	pshufd $0x55,%xmm0,%xmm1
+	pshufd $0xaa,%xmm0,%xmm2
+	pshufd $0xff,%xmm0,%xmm3
+	pshufd $0x00,%xmm0,%xmm0
+	movdqa %xmm1,0(%rsp)
+	movdqa %xmm2,16(%rsp)
+	movdqa %xmm3,32(%rsp)
+	movdqa %xmm0,48(%rsp)
+	movdqa 16(%r8),%xmm0
+	pshufd $0xaa,%xmm0,%xmm1
+	pshufd $0xff,%xmm0,%xmm2
+	pshufd $0x00,%xmm0,%xmm3
+	pshufd $0x55,%xmm0,%xmm0
+	movdqa %xmm1,64(%rsp)
+	movdqa %xmm2,80(%rsp)
+	movdqa %xmm3,96(%rsp)
+	movdqa %xmm0,112(%rsp)
+	movdqa 32(%r8),%xmm0
+	pshufd $0xff,%xmm0,%xmm1
+	pshufd $0x55,%xmm0,%xmm2
+	pshufd $0xaa,%xmm0,%xmm0
+	movdqa %xmm1,128(%rsp)
+	movdqa %xmm2,144(%rsp)
+	movdqa %xmm0,160(%rsp)
+	movdqa 48(%r8),%xmm0
+	pshufd $0x00,%xmm0,%xmm1
+	pshufd $0xaa,%xmm0,%xmm2
+	pshufd $0xff,%xmm0,%xmm0
+	movdqa %xmm1,176(%rsp)
+	movdqa %xmm2,192(%rsp)
+	movdqa %xmm0,208(%rsp)
+.L_bytesatleast256:
+	movl 32(%r8),%ecx
+	movl 52(%r8),%r9d
+	movl %ecx,224(%rsp)
+	movl %r9d,240(%rsp)
+	add $1,%ecx
+	adc $0,%r9d
+	movl %ecx,4+224(%rsp)
+	movl %r9d,4+240(%rsp)
+	add $1,%ecx
+	adc $0,%r9d
+	movl %ecx,8+224(%rsp)
+	movl %r9d,8+240(%rsp)
+	add $1,%ecx
+	adc $0,%r9d
+	movl %ecx,12+224(%rsp)
+	movl %r9d,12+240(%rsp)
+	add $1,%ecx
+	adc $0,%r9d
+	movl %ecx,32(%r8)
+	movl %r9d,52(%r8)
+	movq %rdx,288(%rsp)
+	mov %rbx,%rdx
+	movdqa 0(%rsp),%xmm0
+	movdqa 16(%rsp),%xmm1
+	movdqa 32(%rsp),%xmm2
+	movdqa 192(%rsp),%xmm3
+	movdqa 208(%rsp),%xmm4
+	movdqa 64(%rsp),%xmm5
+	movdqa 80(%rsp),%xmm6
+	movdqa 112(%rsp),%xmm7
+	movdqa 128(%rsp),%xmm8
+	movdqa 144(%rsp),%xmm9
+	movdqa 160(%rsp),%xmm10
+	movdqa 240(%rsp),%xmm11
+	movdqa 48(%rsp),%xmm12
+	movdqa 96(%rsp),%xmm13
+	movdqa 176(%rsp),%xmm14
+	movdqa 224(%rsp),%xmm15
+.L_mainloop1:
+	movdqa %xmm1,256(%rsp)
+	movdqa %xmm2,272(%rsp)
+	movdqa %xmm13,%xmm1
+	paddd %xmm12,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $7,%xmm1
+	pxor %xmm1,%xmm14
+	psrld $25,%xmm2
+	pxor %xmm2,%xmm14
+	movdqa %xmm7,%xmm1
+	paddd %xmm0,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $7,%xmm1
+	pxor %xmm1,%xmm11
+	psrld $25,%xmm2
+	pxor %xmm2,%xmm11
+	movdqa %xmm12,%xmm1
+	paddd %xmm14,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $9,%xmm1
+	pxor %xmm1,%xmm15
+	psrld $23,%xmm2
+	pxor %xmm2,%xmm15
+	movdqa %xmm0,%xmm1
+	paddd %xmm11,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $9,%xmm1
+	pxor %xmm1,%xmm9
+	psrld $23,%xmm2
+	pxor %xmm2,%xmm9
+	movdqa %xmm14,%xmm1
+	paddd %xmm15,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $13,%xmm1
+	pxor %xmm1,%xmm13
+	psrld $19,%xmm2
+	pxor %xmm2,%xmm13
+	movdqa %xmm11,%xmm1
+	paddd %xmm9,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $13,%xmm1
+	pxor %xmm1,%xmm7
+	psrld $19,%xmm2
+	pxor %xmm2,%xmm7
+	movdqa %xmm15,%xmm1
+	paddd %xmm13,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $18,%xmm1
+	pxor %xmm1,%xmm12
+	psrld $14,%xmm2
+	pxor %xmm2,%xmm12
+	movdqa 256(%rsp),%xmm1
+	movdqa %xmm12,256(%rsp)
+	movdqa %xmm9,%xmm2
+	paddd %xmm7,%xmm2
+	movdqa %xmm2,%xmm12
+	pslld $18,%xmm2
+	pxor %xmm2,%xmm0
+	psrld $14,%xmm12
+	pxor %xmm12,%xmm0
+	movdqa %xmm5,%xmm2
+	paddd %xmm1,%xmm2
+	movdqa %xmm2,%xmm12
+	pslld $7,%xmm2
+	pxor %xmm2,%xmm3
+	psrld $25,%xmm12
+	pxor %xmm12,%xmm3
+	movdqa 272(%rsp),%xmm2
+	movdqa %xmm0,272(%rsp)
+	movdqa %xmm6,%xmm0
+	paddd %xmm2,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $7,%xmm0
+	pxor %xmm0,%xmm4
+	psrld $25,%xmm12
+	pxor %xmm12,%xmm4
+	movdqa %xmm1,%xmm0
+	paddd %xmm3,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $9,%xmm0
+	pxor %xmm0,%xmm10
+	psrld $23,%xmm12
+	pxor %xmm12,%xmm10
+	movdqa %xmm2,%xmm0
+	paddd %xmm4,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $9,%xmm0
+	pxor %xmm0,%xmm8
+	psrld $23,%xmm12
+	pxor %xmm12,%xmm8
+	movdqa %xmm3,%xmm0
+	paddd %xmm10,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $13,%xmm0
+	pxor %xmm0,%xmm5
+	psrld $19,%xmm12
+	pxor %xmm12,%xmm5
+	movdqa %xmm4,%xmm0
+	paddd %xmm8,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $13,%xmm0
+	pxor %xmm0,%xmm6
+	psrld $19,%xmm12
+	pxor %xmm12,%xmm6
+	movdqa %xmm10,%xmm0
+	paddd %xmm5,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $18,%xmm0
+	pxor %xmm0,%xmm1
+	psrld $14,%xmm12
+	pxor %xmm12,%xmm1
+	movdqa 256(%rsp),%xmm0
+	movdqa %xmm1,256(%rsp)
+	movdqa %xmm4,%xmm1
+	paddd %xmm0,%xmm1
+	movdqa %xmm1,%xmm12
+	pslld $7,%xmm1
+	pxor %xmm1,%xmm7
+	psrld $25,%xmm12
+	pxor %xmm12,%xmm7
+	movdqa %xmm8,%xmm1
+	paddd %xmm6,%xmm1
+	movdqa %xmm1,%xmm12
+	pslld $18,%xmm1
+	pxor %xmm1,%xmm2
+	psrld $14,%xmm12
+	pxor %xmm12,%xmm2
+	movdqa 272(%rsp),%xmm12
+	movdqa %xmm2,272(%rsp)
+	movdqa %xmm14,%xmm1
+	paddd %xmm12,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $7,%xmm1
+	pxor %xmm1,%xmm5
+	psrld $25,%xmm2
+	pxor %xmm2,%xmm5
+	movdqa %xmm0,%xmm1
+	paddd %xmm7,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $9,%xmm1
+	pxor %xmm1,%xmm10
+	psrld $23,%xmm2
+	pxor %xmm2,%xmm10
+	movdqa %xmm12,%xmm1
+	paddd %xmm5,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $9,%xmm1
+	pxor %xmm1,%xmm8
+	psrld $23,%xmm2
+	pxor %xmm2,%xmm8
+	movdqa %xmm7,%xmm1
+	paddd %xmm10,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $13,%xmm1
+	pxor %xmm1,%xmm4
+	psrld $19,%xmm2
+	pxor %xmm2,%xmm4
+	movdqa %xmm5,%xmm1
+	paddd %xmm8,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $13,%xmm1
+	pxor %xmm1,%xmm14
+	psrld $19,%xmm2
+	pxor %xmm2,%xmm14
+	movdqa %xmm10,%xmm1
+	paddd %xmm4,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $18,%xmm1
+	pxor %xmm1,%xmm0
+	psrld $14,%xmm2
+	pxor %xmm2,%xmm0
+	movdqa 256(%rsp),%xmm1
+	movdqa %xmm0,256(%rsp)
+	movdqa %xmm8,%xmm0
+	paddd %xmm14,%xmm0
+	movdqa %xmm0,%xmm2
+	pslld $18,%xmm0
+	pxor %xmm0,%xmm12
+	psrld $14,%xmm2
+	pxor %xmm2,%xmm12
+	movdqa %xmm11,%xmm0
+	paddd %xmm1,%xmm0
+	movdqa %xmm0,%xmm2
+	pslld $7,%xmm0
+	pxor %xmm0,%xmm6
+	psrld $25,%xmm2
+	pxor %xmm2,%xmm6
+	movdqa 272(%rsp),%xmm2
+	movdqa %xmm12,272(%rsp)
+	movdqa %xmm3,%xmm0
+	paddd %xmm2,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $7,%xmm0
+	pxor %xmm0,%xmm13
+	psrld $25,%xmm12
+	pxor %xmm12,%xmm13
+	movdqa %xmm1,%xmm0
+	paddd %xmm6,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $9,%xmm0
+	pxor %xmm0,%xmm15
+	psrld $23,%xmm12
+	pxor %xmm12,%xmm15
+	movdqa %xmm2,%xmm0
+	paddd %xmm13,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $9,%xmm0
+	pxor %xmm0,%xmm9
+	psrld $23,%xmm12
+	pxor %xmm12,%xmm9
+	movdqa %xmm6,%xmm0
+	paddd %xmm15,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $13,%xmm0
+	pxor %xmm0,%xmm11
+	psrld $19,%xmm12
+	pxor %xmm12,%xmm11
+	movdqa %xmm13,%xmm0
+	paddd %xmm9,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $13,%xmm0
+	pxor %xmm0,%xmm3
+	psrld $19,%xmm12
+	pxor %xmm12,%xmm3
+	movdqa %xmm15,%xmm0
+	paddd %xmm11,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $18,%xmm0
+	pxor %xmm0,%xmm1
+	psrld $14,%xmm12
+	pxor %xmm12,%xmm1
+	movdqa %xmm9,%xmm0
+	paddd %xmm3,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $18,%xmm0
+	pxor %xmm0,%xmm2
+	psrld $14,%xmm12
+	pxor %xmm12,%xmm2
+	movdqa 256(%rsp),%xmm12
+	movdqa 272(%rsp),%xmm0
+	sub $2,%rdx
+	ja .L_mainloop1
+	paddd 48(%rsp),%xmm12
+	paddd 112(%rsp),%xmm7
+	paddd 160(%rsp),%xmm10
+	paddd 208(%rsp),%xmm4
+	movd %xmm12,%rdx
+	movd %xmm7,%rcx
+	movd %xmm10,%r9
+	movd %xmm4,%rax
+	pshufd $0x39,%xmm12,%xmm12
+	pshufd $0x39,%xmm7,%xmm7
+	pshufd $0x39,%xmm10,%xmm10
+	pshufd $0x39,%xmm4,%xmm4
+	xorl 0(%rsi),%edx
+	xorl 4(%rsi),%ecx
+	xorl 8(%rsi),%r9d
+	xorl 12(%rsi),%eax
+	movl %edx,0(%rdi)
+	movl %ecx,4(%rdi)
+	movl %r9d,8(%rdi)
+	movl %eax,12(%rdi)
+	movd %xmm12,%rdx
+	movd %xmm7,%rcx
+	movd %xmm10,%r9
+	movd %xmm4,%rax
+	pshufd $0x39,%xmm12,%xmm12
+	pshufd $0x39,%xmm7,%xmm7
+	pshufd $0x39,%xmm10,%xmm10
+	pshufd $0x39,%xmm4,%xmm4
+	xorl 64(%rsi),%edx
+	xorl 68(%rsi),%ecx
+	xorl 72(%rsi),%r9d
+	xorl 76(%rsi),%eax
+	movl %edx,64(%rdi)
+	movl %ecx,68(%rdi)
+	movl %r9d,72(%rdi)
+	movl %eax,76(%rdi)
+	movd %xmm12,%rdx
+	movd %xmm7,%rcx
+	movd %xmm10,%r9
+	movd %xmm4,%rax
+	pshufd $0x39,%xmm12,%xmm12
+	pshufd $0x39,%xmm7,%xmm7
+	pshufd $0x39,%xmm10,%xmm10
+	pshufd $0x39,%xmm4,%xmm4
+	xorl 128(%rsi),%edx
+	xorl 132(%rsi),%ecx
+	xorl 136(%rsi),%r9d
+	xorl 140(%rsi),%eax
+	movl %edx,128(%rdi)
+	movl %ecx,132(%rdi)
+	movl %r9d,136(%rdi)
+	movl %eax,140(%rdi)
+	movd %xmm12,%rdx
+	movd %xmm7,%rcx
+	movd %xmm10,%r9
+	movd %xmm4,%rax
+	xorl 192(%rsi),%edx
+	xorl 196(%rsi),%ecx
+	xorl 200(%rsi),%r9d
+	xorl 204(%rsi),%eax
+	movl %edx,192(%rdi)
+	movl %ecx,196(%rdi)
+	movl %r9d,200(%rdi)
+	movl %eax,204(%rdi)
+	paddd 176(%rsp),%xmm14
+	paddd 0(%rsp),%xmm0
+	paddd 64(%rsp),%xmm5
+	paddd 128(%rsp),%xmm8
+	movd %xmm14,%rdx
+	movd %xmm0,%rcx
+	movd %xmm5,%r9
+	movd %xmm8,%rax
+	pshufd $0x39,%xmm14,%xmm14
+	pshufd $0x39,%xmm0,%xmm0
+	pshufd $0x39,%xmm5,%xmm5
+	pshufd $0x39,%xmm8,%xmm8
+	xorl 16(%rsi),%edx
+	xorl 20(%rsi),%ecx
+	xorl 24(%rsi),%r9d
+	xorl 28(%rsi),%eax
+	movl %edx,16(%rdi)
+	movl %ecx,20(%rdi)
+	movl %r9d,24(%rdi)
+	movl %eax,28(%rdi)
+	movd %xmm14,%rdx
+	movd %xmm0,%rcx
+	movd %xmm5,%r9
+	movd %xmm8,%rax
+	pshufd $0x39,%xmm14,%xmm14
+	pshufd $0x39,%xmm0,%xmm0
+	pshufd $0x39,%xmm5,%xmm5
+	pshufd $0x39,%xmm8,%xmm8
+	xorl 80(%rsi),%edx
+	xorl 84(%rsi),%ecx
+	xorl 88(%rsi),%r9d
+	xorl 92(%rsi),%eax
+	movl %edx,80(%rdi)
+	movl %ecx,84(%rdi)
+	movl %r9d,88(%rdi)
+	movl %eax,92(%rdi)
+	movd %xmm14,%rdx
+	movd %xmm0,%rcx
+	movd %xmm5,%r9
+	movd %xmm8,%rax
+	pshufd $0x39,%xmm14,%xmm14
+	pshufd $0x39,%xmm0,%xmm0
+	pshufd $0x39,%xmm5,%xmm5
+	pshufd $0x39,%xmm8,%xmm8
+	xorl 144(%rsi),%edx
+	xorl 148(%rsi),%ecx
+	xorl 152(%rsi),%r9d
+	xorl 156(%rsi),%eax
+	movl %edx,144(%rdi)
+	movl %ecx,148(%rdi)
+	movl %r9d,152(%rdi)
+	movl %eax,156(%rdi)
+	movd %xmm14,%rdx
+	movd %xmm0,%rcx
+	movd %xmm5,%r9
+	movd %xmm8,%rax
+	xorl 208(%rsi),%edx
+	xorl 212(%rsi),%ecx
+	xorl 216(%rsi),%r9d
+	xorl 220(%rsi),%eax
+	movl %edx,208(%rdi)
+	movl %ecx,212(%rdi)
+	movl %r9d,216(%rdi)
+	movl %eax,220(%rdi)
+	paddd 224(%rsp),%xmm15
+	paddd 240(%rsp),%xmm11
+	paddd 16(%rsp),%xmm1
+	paddd 80(%rsp),%xmm6
+	movd %xmm15,%rdx
+	movd %xmm11,%rcx
+	movd %xmm1,%r9
+	movd %xmm6,%rax
+	pshufd $0x39,%xmm15,%xmm15
+	pshufd $0x39,%xmm11,%xmm11
+	pshufd $0x39,%xmm1,%xmm1
+	pshufd $0x39,%xmm6,%xmm6
+	xorl 32(%rsi),%edx
+	xorl 36(%rsi),%ecx
+	xorl 40(%rsi),%r9d
+	xorl 44(%rsi),%eax
+	movl %edx,32(%rdi)
+	movl %ecx,36(%rdi)
+	movl %r9d,40(%rdi)
+	movl %eax,44(%rdi)
+	movd %xmm15,%rdx
+	movd %xmm11,%rcx
+	movd %xmm1,%r9
+	movd %xmm6,%rax
+	pshufd $0x39,%xmm15,%xmm15
+	pshufd $0x39,%xmm11,%xmm11
+	pshufd $0x39,%xmm1,%xmm1
+	pshufd $0x39,%xmm6,%xmm6
+	xorl 96(%rsi),%edx
+	xorl 100(%rsi),%ecx
+	xorl 104(%rsi),%r9d
+	xorl 108(%rsi),%eax
+	movl %edx,96(%rdi)
+	movl %ecx,100(%rdi)
+	movl %r9d,104(%rdi)
+	movl %eax,108(%rdi)
+	movd %xmm15,%rdx
+	movd %xmm11,%rcx
+	movd %xmm1,%r9
+	movd %xmm6,%rax
+	pshufd $0x39,%xmm15,%xmm15
+	pshufd $0x39,%xmm11,%xmm11
+	pshufd $0x39,%xmm1,%xmm1
+	pshufd $0x39,%xmm6,%xmm6
+	xorl 160(%rsi),%edx
+	xorl 164(%rsi),%ecx
+	xorl 168(%rsi),%r9d
+	xorl 172(%rsi),%eax
+	movl %edx,160(%rdi)
+	movl %ecx,164(%rdi)
+	movl %r9d,168(%rdi)
+	movl %eax,172(%rdi)
+	movd %xmm15,%rdx
+	movd %xmm11,%rcx
+	movd %xmm1,%r9
+	movd %xmm6,%rax
+	xorl 224(%rsi),%edx
+	xorl 228(%rsi),%ecx
+	xorl 232(%rsi),%r9d
+	xorl 236(%rsi),%eax
+	movl %edx,224(%rdi)
+	movl %ecx,228(%rdi)
+	movl %r9d,232(%rdi)
+	movl %eax,236(%rdi)
+	paddd 96(%rsp),%xmm13
+	paddd 144(%rsp),%xmm9
+	paddd 192(%rsp),%xmm3
+	paddd 32(%rsp),%xmm2
+	movd %xmm13,%rdx
+	movd %xmm9,%rcx
+	movd %xmm3,%r9
+	movd %xmm2,%rax
+	pshufd $0x39,%xmm13,%xmm13
+	pshufd $0x39,%xmm9,%xmm9
+	pshufd $0x39,%xmm3,%xmm3
+	pshufd $0x39,%xmm2,%xmm2
+	xorl 48(%rsi),%edx
+	xorl 52(%rsi),%ecx
+	xorl 56(%rsi),%r9d
+	xorl 60(%rsi),%eax
+	movl %edx,48(%rdi)
+	movl %ecx,52(%rdi)
+	movl %r9d,56(%rdi)
+	movl %eax,60(%rdi)
+	movd %xmm13,%rdx
+	movd %xmm9,%rcx
+	movd %xmm3,%r9
+	movd %xmm2,%rax
+	pshufd $0x39,%xmm13,%xmm13
+	pshufd $0x39,%xmm9,%xmm9
+	pshufd $0x39,%xmm3,%xmm3
+	pshufd $0x39,%xmm2,%xmm2
+	xorl 112(%rsi),%edx
+	xorl 116(%rsi),%ecx
+	xorl 120(%rsi),%r9d
+	xorl 124(%rsi),%eax
+	movl %edx,112(%rdi)
+	movl %ecx,116(%rdi)
+	movl %r9d,120(%rdi)
+	movl %eax,124(%rdi)
+	movd %xmm13,%rdx
+	movd %xmm9,%rcx
+	movd %xmm3,%r9
+	movd %xmm2,%rax
+	pshufd $0x39,%xmm13,%xmm13
+	pshufd $0x39,%xmm9,%xmm9
+	pshufd $0x39,%xmm3,%xmm3
+	pshufd $0x39,%xmm2,%xmm2
+	xorl 176(%rsi),%edx
+	xorl 180(%rsi),%ecx
+	xorl 184(%rsi),%r9d
+	xorl 188(%rsi),%eax
+	movl %edx,176(%rdi)
+	movl %ecx,180(%rdi)
+	movl %r9d,184(%rdi)
+	movl %eax,188(%rdi)
+	movd %xmm13,%rdx
+	movd %xmm9,%rcx
+	movd %xmm3,%r9
+	movd %xmm2,%rax
+	xorl 240(%rsi),%edx
+	xorl 244(%rsi),%ecx
+	xorl 248(%rsi),%r9d
+	xorl 252(%rsi),%eax
+	movl %edx,240(%rdi)
+	movl %ecx,244(%rdi)
+	movl %r9d,248(%rdi)
+	movl %eax,252(%rdi)
+	movq 288(%rsp),%rdx
+	sub $256,%rdx
+	add $256,%rsi
+	add $256,%rdi
+	cmp $256,%rdx
+	jae .L_bytesatleast256
+	cmp $0,%rdx
+	jbe .L_done
+.L_bytes_are_64_128_or_192:
+	movq %rdx,288(%rsp)
+	movdqa 0(%r8),%xmm0
+	movdqa 16(%r8),%xmm1
+	movdqa 32(%r8),%xmm2
+	movdqa 48(%r8),%xmm3
+	movdqa %xmm1,%xmm4
+	mov %rbx,%rdx
+.L_mainloop2:
+	paddd %xmm0,%xmm4
+	movdqa %xmm0,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $7,%xmm4
+	psrld $25,%xmm6
+	pxor %xmm4,%xmm3
+	pxor %xmm6,%xmm3
+	paddd %xmm3,%xmm5
+	movdqa %xmm3,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $9,%xmm5
+	psrld $23,%xmm6
+	pxor %xmm5,%xmm2
+	pshufd $0x93,%xmm3,%xmm3
+	pxor %xmm6,%xmm2
+	paddd %xmm2,%xmm4
+	movdqa %xmm2,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $13,%xmm4
+	psrld $19,%xmm6
+	pxor %xmm4,%xmm1
+	pshufd $0x4e,%xmm2,%xmm2
+	pxor %xmm6,%xmm1
+	paddd %xmm1,%xmm5
+	movdqa %xmm3,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $18,%xmm5
+	psrld $14,%xmm6
+	pxor %xmm5,%xmm0
+	pshufd $0x39,%xmm1,%xmm1
+	pxor %xmm6,%xmm0
+	paddd %xmm0,%xmm4
+	movdqa %xmm0,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $7,%xmm4
+	psrld $25,%xmm6
+	pxor %xmm4,%xmm1
+	pxor %xmm6,%xmm1
+	paddd %xmm1,%xmm5
+	movdqa %xmm1,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $9,%xmm5
+	psrld $23,%xmm6
+	pxor %xmm5,%xmm2
+	pshufd $0x93,%xmm1,%xmm1
+	pxor %xmm6,%xmm2
+	paddd %xmm2,%xmm4
+	movdqa %xmm2,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $13,%xmm4
+	psrld $19,%xmm6
+	pxor %xmm4,%xmm3
+	pshufd $0x4e,%xmm2,%xmm2
+	pxor %xmm6,%xmm3
+	paddd %xmm3,%xmm5
+	movdqa %xmm1,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $18,%xmm5
+	psrld $14,%xmm6
+	pxor %xmm5,%xmm0
+	pshufd $0x39,%xmm3,%xmm3
+	pxor %xmm6,%xmm0
+	paddd %xmm0,%xmm4
+	movdqa %xmm0,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $7,%xmm4
+	psrld $25,%xmm6
+	pxor %xmm4,%xmm3
+	pxor %xmm6,%xmm3
+	paddd %xmm3,%xmm5
+	movdqa %xmm3,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $9,%xmm5
+	psrld $23,%xmm6
+	pxor %xmm5,%xmm2
+	pshufd $0x93,%xmm3,%xmm3
+	pxor %xmm6,%xmm2
+	paddd %xmm2,%xmm4
+	movdqa %xmm2,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $13,%xmm4
+	psrld $19,%xmm6
+	pxor %xmm4,%xmm1
+	pshufd $0x4e,%xmm2,%xmm2
+	pxor %xmm6,%xmm1
+	paddd %xmm1,%xmm5
+	movdqa %xmm3,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $18,%xmm5
+	psrld $14,%xmm6
+	pxor %xmm5,%xmm0
+	pshufd $0x39,%xmm1,%xmm1
+	pxor %xmm6,%xmm0
+	paddd %xmm0,%xmm4
+	movdqa %xmm0,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $7,%xmm4
+	psrld $25,%xmm6
+	pxor %xmm4,%xmm1
+	pxor %xmm6,%xmm1
+	paddd %xmm1,%xmm5
+	movdqa %xmm1,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $9,%xmm5
+	psrld $23,%xmm6
+	pxor %xmm5,%xmm2
+	pshufd $0x93,%xmm1,%xmm1
+	pxor %xmm6,%xmm2
+	paddd %xmm2,%xmm4
+	movdqa %xmm2,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $13,%xmm4
+	psrld $19,%xmm6
+	pxor %xmm4,%xmm3
+	pshufd $0x4e,%xmm2,%xmm2
+	pxor %xmm6,%xmm3
+	sub $4,%rdx
+	paddd %xmm3,%xmm5
+	movdqa %xmm1,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $18,%xmm5
+	pxor %xmm7,%xmm7
+	psrld $14,%xmm6
+	pxor %xmm5,%xmm0
+	pshufd $0x39,%xmm3,%xmm3
+	pxor %xmm6,%xmm0
+	ja .L_mainloop2
+	paddd 0(%r8),%xmm0
+	paddd 16(%r8),%xmm1
+	paddd 32(%r8),%xmm2
+	paddd 48(%r8),%xmm3
+	movd %xmm0,%rdx
+	movd %xmm1,%rcx
+	movd %xmm2,%rax
+	movd %xmm3,%r10
+	pshufd $0x39,%xmm0,%xmm0
+	pshufd $0x39,%xmm1,%xmm1
+	pshufd $0x39,%xmm2,%xmm2
+	pshufd $0x39,%xmm3,%xmm3
+	xorl 0(%rsi),%edx
+	xorl 48(%rsi),%ecx
+	xorl 32(%rsi),%eax
+	xorl 16(%rsi),%r10d
+	movl %edx,0(%rdi)
+	movl %ecx,48(%rdi)
+	movl %eax,32(%rdi)
+	movl %r10d,16(%rdi)
+	movd %xmm0,%rdx
+	movd %xmm1,%rcx
+	movd %xmm2,%rax
+	movd %xmm3,%r10
+	pshufd $0x39,%xmm0,%xmm0
+	pshufd $0x39,%xmm1,%xmm1
+	pshufd $0x39,%xmm2,%xmm2
+	pshufd $0x39,%xmm3,%xmm3
+	xorl 20(%rsi),%edx
+	xorl 4(%rsi),%ecx
+	xorl 52(%rsi),%eax
+	xorl 36(%rsi),%r10d
+	movl %edx,20(%rdi)
+	movl %ecx,4(%rdi)
+	movl %eax,52(%rdi)
+	movl %r10d,36(%rdi)
+	movd %xmm0,%rdx
+	movd %xmm1,%rcx
+	movd %xmm2,%rax
+	movd %xmm3,%r10
+	pshufd $0x39,%xmm0,%xmm0
+	pshufd $0x39,%xmm1,%xmm1
+	pshufd $0x39,%xmm2,%xmm2
+	pshufd $0x39,%xmm3,%xmm3
+	xorl 40(%rsi),%edx
+	xorl 24(%rsi),%ecx
+	xorl 8(%rsi),%eax
+	xorl 56(%rsi),%r10d
+	movl %edx,40(%rdi)
+	movl %ecx,24(%rdi)
+	movl %eax,8(%rdi)
+	movl %r10d,56(%rdi)
+	movd %xmm0,%rdx
+	movd %xmm1,%rcx
+	movd %xmm2,%rax
+	movd %xmm3,%r10
+	xorl 60(%rsi),%edx
+	xorl 44(%rsi),%ecx
+	xorl 28(%rsi),%eax
+	xorl 12(%rsi),%r10d
+	movl %edx,60(%rdi)
+	movl %ecx,44(%rdi)
+	movl %eax,28(%rdi)
+	movl %r10d,12(%rdi)
+	movq 288(%rsp),%rdx
+	movl 32(%r8),%ecx
+	movl 52(%r8),%eax
+	add $1,%ecx
+	adc $0,%eax
+	movl %ecx,32(%r8)
+	movl %eax,52(%r8)
+	cmp $64,%rdx
+	ja .L_bytes_are_128_or_192
+.L_done:
+	CFI_REMEMBER_STATE();
+	mov %r11,%rax
+	sub %rsp,%rax
+	mov %r11,%rsp
+	CFI_REGISTER(%r11, %rsp)
+	CFI_DEF_CFA_REGISTER(%rsp)
+	pop %rbx
+	CFI_POP(%rbx)
+	ret
+	CFI_RESTORE_STATE();
+.L_bytes_are_128_or_192:
+	sub $64,%rdx
+	add $64,%rdi
+	add $64,%rsi
+	jmp .L_bytes_are_64_128_or_192
+	CFI_ENDPROC();
+ELF(.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;)
+
+#endif /*defined(USE_SALSA20)*/
+#endif /*__x86_64*/