diff options
Diffstat (limited to 'src/isa-l/igzip/adler32_sse.asm')
-rw-r--r-- | src/isa-l/igzip/adler32_sse.asm | 249 |
1 files changed, 249 insertions, 0 deletions
diff --git a/src/isa-l/igzip/adler32_sse.asm b/src/isa-l/igzip/adler32_sse.asm new file mode 100644 index 000000000..83f577d24 --- /dev/null +++ b/src/isa-l/igzip/adler32_sse.asm @@ -0,0 +1,249 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;-----------------------------------------------------------------------
; uint32_t adler32_sse(uint32_t init, const unsigned char *buf, uint64_t len)
;
; Adler-32 checksum update, NASM syntax, x86-64 (SysV elf64 and win64).
;
; In:    init = running checksum: low 16 bits are the "a" sum, high 16
;               bits are the "b" sum
;        buf  = input bytes
;        len  = number of bytes in buf
; Out:   eax  = (b << 16) | a, with a and b each reduced modulo BASE
;
; Strategy:
;   * len < 32: plain byte-at-a-time loop, one modulo at the end.
;   * otherwise: the buffer is consumed in chunks of at most LIMIT bytes.
;     Inside a chunk, 8 bytes per iteration are zero-extended into four
;     32-bit lanes (xa accumulates bytes, xb accumulates xa).  The lanes
;     are folded back to scalars at the end of every chunk; a trailing
;     1..7 bytes (only possible on the last chunk, since LIMIT is a
;     multiple of 8) are finished by the scalar loop.
;
; Lane arithmetic: lane i only sees every 4th byte, so one xb step stands
; for 4 byte positions and lane i is additionally i positions "late":
;       b = 4 * sum(xb lanes) - sum(i * xa lane i),  i = 0..3
; A_SCALE holds the dword multipliers {0, 1, 2, 3}.
;
; Uses phaddd (SSSE3) and pmovzxbd / pmulld (SSE4.1).
; Clobbers: rax, rcx, rdx, r8-r11, xmm0-xmm4, flags.  Callee-saved
; registers used by the selected ABI are saved/restored by
; FUNC_SAVE / FUNC_RESTORE.
;-----------------------------------------------------------------------

%define LIMIT 5552		; max bytes accumulated between modulo reductions
%define BASE  0xFFF1		; 65521

%include "reg_sizes.asm"

default rel
[bits 64]

; need to keep free: eax, ecx, edx
; (div takes its dividend in edx:eax and the divisor is loaded into ecx)

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg1   rdi
 %define arg2   rsi
 %define arg3   rdx

 %define init_d edi		; low dword of arg1
 %define data   r9		; current read pointer
 %define size   r10		; bytes not yet assigned to a chunk
 %define s      r11		; size of the current chunk
 %define a_d    r12d		; scalar "a" sum
 %define b_d    r8d		; scalar "b" sum
 %define end    r13		; loop bound for the current chunk

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r13
	pop	r12
 %endmacro
%endif


%ifidn __OUTPUT_FORMAT__, win64
 %define arg1   rcx
 %define arg2   rdx
 %define arg3   r8

 %define init_d r12d		; copy of arg1 (ecx itself must stay free)
 %define data   r9		; current read pointer
 %define size   r10		; bytes not yet assigned to a chunk
 %define s      r11		; size of the current chunk
 %define a_d    esi		; scalar "a" sum
 %define b_d    edi		; scalar "b" sum
 %define end    r13		; loop bound for the current chunk

 %define stack_size  5*8	; must be an odd multiple of 8
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_reg	rdi, 0*8
	save_reg	rsi, 1*8
	save_reg	r12, 2*8
	save_reg	r13, 3*8
	end_prolog
	mov	init_d, ecx	; initialize init_d from arg1 to keep ecx free
 %endmacro

 %macro FUNC_RESTORE 0
	mov	rdi, [rsp + 0*8]
	mov	rsi, [rsp + 1*8]
	mov	r12, [rsp + 2*8]
	mov	r13, [rsp + 3*8]
	add	rsp, stack_size
 %endmacro
%endif

%define xa	xmm0		; four partial "a" sums
%define xb	xmm1		; four partial "b" sums (each step counts 4 bytes)
%define xdata0	xmm2
%define xdata1	xmm3
%define xsa	xmm4		; xa * {0,1,2,3}: per-lane correction for b

; Fold the vector accumulators down to lane 0:
;   xa lane 0 = sum of the xa lanes
;   xb lane 0 = 4 * sum(xb lanes) - sum(i * xa lane i)
; All arithmetic is modulo 2^32.  Clobbers xsa.
%macro REDUCE_XA_XB 0
	pslld	xb, 2		; b is scaled by 4
	movdqa	xsa, xa		; scaled a
	pmulld	xsa, [A_SCALE]

	phaddd	xa, xa
	phaddd	xb, xb
	phaddd	xsa, xsa
	phaddd	xa, xa
	phaddd	xb, xb
	phaddd	xsa, xsa
	psubd	xb, xsa
%endmacro

global adler32_sse:ISAL_SYM_TYPE_FUNCTION
func(adler32_sse)
	FUNC_SAVE

	; Copy the arguments out of rdx/rcx before those are used by div.
	mov	data, arg2
	mov	size, arg3

	; Split init into b (high half) and a (low half).
	mov	b_d, init_d
	shr	b_d, 16
	and	init_d, 0xFFFF
	cmp	size, 32
	jb	.lt32		; short input: scalar path only
	movd	xa, init_d	; lane 0 starts at a, other lanes at 0
	pxor	xb, xb
.sloop1:
	mov	s, LIMIT
	cmp	s, size
	cmova	s, size		; s = min(size, LIMIT)
	lea	end, [data + s - 7]	; stop once fewer than 8 bytes remain
	cmp	data, end
	jae	.skip_loop_1a
align 32
.sloop1a:
	; do 8 adds
	pmovzxbd	xdata0, [data]
	pmovzxbd	xdata1, [data + 4]
	add	data, 8
	paddd	xa, xdata0
	paddd	xb, xa
	paddd	xa, xdata1
	paddd	xb, xa
	cmp	data, end
	jb	.sloop1a

.skip_loop_1a:
	add	end, 7		; end = first byte past the current chunk

	test	s, 7
	jnz	.do_final	; chunk has a 1..7 byte tail -> finish in scalar code

	; either we're done, or we just did LIMIT
	sub	size, s

	; reduce
	REDUCE_XA_XB

	movd	eax, xa
	xor	edx, edx
	mov	ecx, BASE
	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
	mov	a_d, edx

	movd	eax, xb
	add	eax, b_d	; add b carried in from before this chunk
	xor	edx, edx
	mov	ecx, BASE
	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
	mov	b_d, edx

	test	size, size
	jz	.finish

	; continue loop: restart the lanes from the reduced a, with b = 0
	movd	xa, a_d
	pxor	xb, xb
	jmp	.sloop1

.finish:
	mov	eax, b_d
	shl	eax, 16
	or	eax, a_d
	jmp	.end

	; fewer than 32 bytes in total: scalar only
.lt32:
	mov	a_d, init_d
	lea	end, [data + size]
	test	size, size
	jnz	.final_loop
	jmp	.zero_size

	; handle remaining 1...7 bytes
.do_final:
	; reduce (no modulo yet; that happens once at .zero_size)
	REDUCE_XA_XB

	movd	a_d, xa
	movd	eax, xb
	add	b_d, eax

align 32
.final_loop:
	movzx	eax, byte [data]
	add	a_d, eax
	inc	data
	add	b_d, a_d
	cmp	data, end
	jb	.final_loop

	; Reduce the scalar sums and pack the result.  Also the direct
	; target for len == 0.
.zero_size:
	mov	eax, a_d
	xor	edx, edx
	mov	ecx, BASE
	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
	mov	a_d, edx

	mov	eax, b_d
	xor	edx, edx
	mov	ecx, BASE
	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
	shl	edx, 16
	or	edx, a_d
	mov	eax, edx

.end:
	FUNC_RESTORE
	ret

endproc_frame

section .data
align 32
; Dword multipliers {0, 1, 2, 3} (little-endian), one per lane; used by
; pmulld to build the per-lane correction for b.
A_SCALE:
	dq	0x0000000100000000, 0x0000000300000002