Diffstat (limited to 'src/isa-l/igzip/adler32_avx2_4.asm')
-rw-r--r-- | src/isa-l/igzip/adler32_avx2_4.asm | 292
1 file changed, 292 insertions, 0 deletions
diff --git a/src/isa-l/igzip/adler32_avx2_4.asm b/src/isa-l/igzip/adler32_avx2_4.asm
new file mode 100644
index 000000000..8f9d6d507
--- /dev/null
+++ b/src/isa-l/igzip/adler32_avx2_4.asm
@@ -0,0 +1,292 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; uint32_t adler32_avx2(uint32_t init, const unsigned char *buf, uint64_t len)
+
+%define LIMIT 5552
+%define BASE  0xFFF1 ; 65521
+
+%define CHUNKSIZE 16
+%define CHUNKSIZE_M1 (CHUNKSIZE-1)
+
+%include "reg_sizes.asm"
+
+default rel
+[bits 64]
+
+; need to keep free: eax, ecx, edx
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg1   rdi
+ %define arg2   rsi
+ %define arg3   rdx
+
+ %define init_d edi
+ %define data   r9
+ %define size   r10
+ %define s      r11
+ %define a_d    r12d
+ %define b_d    r8d
+ %define end    r13
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+        push    r12
+        push    r13
+ %endmacro
+ %macro FUNC_RESTORE 0
+        pop     r13
+        pop     r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1   rcx
+ %define arg2   rdx
+ %define arg3   r8
+
+ %define init_d r12d
+ %define data   r9
+ %define size   r10
+ %define s      r11
+ %define a_d    esi
+ %define b_d    edi
+ %define end    r13
+
+ %define stack_size 2*16 + 5*8  ; must be an odd multiple of 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+        alloc_stack stack_size
+        vmovdqa [rsp + 0*16], xmm6
+        vmovdqa [rsp + 1*16], xmm7
+        save_reg rdi, 2*16 + 0*8
+        save_reg rsi, 2*16 + 1*8
+        save_reg r12, 2*16 + 2*8
+        save_reg r13, 2*16 + 3*8
+        end_prolog
+        mov     init_d, ecx     ; initialize init_d from arg1 to keep ecx free
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+        vmovdqa xmm6, [rsp + 0*16]
+        vmovdqa xmm7, [rsp + 1*16]
+        mov     rdi, [rsp + 2*16 + 0*8]
+        mov     rsi, [rsp + 2*16 + 1*8]
+        mov     r12, [rsp + 2*16 + 2*8]
+        mov     r13, [rsp + 2*16 + 3*8]
+        add     rsp, stack_size
+ %endmacro
+%endif
+
+%define ya     ymm0
+%define yb     ymm1
+%define ydata0 ymm2
+%define ydata1 ymm3
+%define ysa    ymm4
+%define ydata  ysa
+%define ytmp0  ydata0
+%define ytmp1  ydata1
+%define ytmp2  ymm5
+%define xa     xmm0
+%define xb     xmm1
+%define xtmp0  xmm2
+%define xtmp1  xmm3
+%define xsa    xmm4
+%define xtmp2  xmm5
+%define yshuf0 ymm6
+%define yshuf1 ymm7
+
+
+global adler32_avx2_4:ISAL_SYM_TYPE_FUNCTION
+func(adler32_avx2_4)
+        FUNC_SAVE
+
+        vmovdqa yshuf0, [SHUF0]
+        vmovdqa yshuf1, [SHUF1]
+
+        mov     data, arg2
+        mov     size, arg3
+
+        mov     b_d, init_d
+        shr     b_d, 16
+        and     init_d, 0xFFFF
+        cmp     size, 32
+        jb      .lt64
+        vmovd   xa, init_d
+        vpxor   yb, yb, yb
+.sloop1:
+        mov     s, LIMIT
+        cmp     s, size
+        cmova   s, size         ; s = min(size, LIMIT)
+        lea     end, [data + s - CHUNKSIZE_M1]
+        cmp     data, end
+        jae     .skip_loop_1a
+align 32
+.sloop1a:
+        ; do CHUNKSIZE adds
+        vbroadcastf128  ydata, [data]
+        add     data, CHUNKSIZE
+        vpshufb ydata0, ydata, yshuf0
+        vpaddd  ya, ya, ydata0
+        vpaddd  yb, yb, ya
+        vpshufb ydata1, ydata, yshuf1
+        vpaddd  ya, ya, ydata1
+        vpaddd  yb, yb, ya
+        cmp     data, end
+        jb      .sloop1a
+
+.skip_loop_1a:
+        add     end, CHUNKSIZE_M1
+
+        test    s, CHUNKSIZE_M1
+        jnz     .do_final
+
+        ; either we're done, or we just did LIMIT
+        sub     size, s
+
+        ; reduce
+        vpslld  yb, 3                   ; b is scaled by 8
+        vpmulld ysa, ya, [A_SCALE]      ; scaled a
+
+        ; compute horizontal sums of ya, yb, ysa
+        vextracti128    xtmp0, ya, 1
+        vextracti128    xtmp1, yb, 1
+        vextracti128    xtmp2, ysa, 1
+        vpaddd  xa, xa, xtmp0
+        vpaddd  xb, xb, xtmp1
+        vpaddd  xsa, xsa, xtmp2
+        vphaddd xa, xa, xa
+        vphaddd xb, xb, xb
+        vphaddd xsa, xsa, xsa
+        vphaddd xa, xa, xa
+        vphaddd xb, xb, xb
+        vphaddd xsa, xsa, xsa
+
+        vmovd   eax, xa
+        xor     edx, edx
+        mov     ecx, BASE
+        div     ecx             ; divide edx:eax by ecx, quot->eax, rem->edx
+        mov     a_d, edx
+
+        vpsubd  xb, xb, xsa
+        vmovd   eax, xb
+        add     eax, b_d
+        xor     edx, edx
+        mov     ecx, BASE
+        div     ecx             ; divide edx:eax by ecx, quot->eax, rem->edx
+        mov     b_d, edx
+
+        test    size, size
+        jz      .finish
+
+        ; continue loop
+        vmovd   xa, a_d
+        vpxor   yb, yb
+        jmp     .sloop1
+
+.finish:
+        mov     eax, b_d
+        shl     eax, 16
+        or      eax, a_d
+        jmp     .end
+
+.lt64:
+        mov     a_d, init_d
+        lea     end, [data + size]
+        test    size, size
+        jnz     .final_loop
+        jmp     .zero_size
+
+        ; handle remaining 1...15 bytes
+.do_final:
+        ; reduce
+        vpslld  yb, 3                   ; b is scaled by 8
+        vpmulld ysa, ya, [A_SCALE]      ; scaled a
+
+        vextracti128    xtmp0, ya, 1
+        vextracti128    xtmp1, yb, 1
+        vextracti128    xtmp2, ysa, 1
+        vpaddd  xa, xa, xtmp0
+        vpaddd  xb, xb, xtmp1
+        vpaddd  xsa, xsa, xtmp2
+        vphaddd xa, xa, xa
+        vphaddd xb, xb, xb
+        vphaddd xsa, xsa, xsa
+        vphaddd xa, xa, xa
+        vphaddd xb, xb, xb
+        vphaddd xsa, xsa, xsa
+        vpsubd  xb, xb, xsa
+
+        vmovd   a_d, xa
+        vmovd   eax, xb
+        add     b_d, eax
+
+align 32
+.final_loop:
+        movzx   eax, byte[data]
+        add     a_d, eax
+        inc     data
+        add     b_d, a_d
+        cmp     data, end
+        jb      .final_loop
+
+.zero_size:
+        mov     eax, a_d
+        xor     edx, edx
+        mov     ecx, BASE
+        div     ecx             ; divide edx:eax by ecx, quot->eax, rem->edx
+        mov     a_d, edx
+
+        mov     eax, b_d
+        xor     edx, edx
+        mov     ecx, BASE
+        div     ecx             ; divide edx:eax by ecx, quot->eax, rem->edx
+        shl     edx, 16
+        or      edx, a_d
+        mov     eax, edx
+
+.end:
+        FUNC_RESTORE
+        ret
+
+endproc_frame
+
+section .data
+align 32
+A_SCALE:
+        dq      0x0000000100000000, 0x0000000300000002
+        dq      0x0000000500000004, 0x0000000700000006
+SHUF0:
+        dq      0xFFFFFF01FFFFFF00, 0xFFFFFF03FFFFFF02
+        dq      0xFFFFFF05FFFFFF04, 0xFFFFFF07FFFFFF06
+SHUF1:
+        dq      0xFFFFFF09FFFFFF08, 0xFFFFFF0BFFFFFF0A
+        dq      0xFFFFFF0DFFFFFF0C, 0xFFFFFF0FFFFFFF0E
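
The routine computes a standard Adler-32 checksum: the low 16 bits of init seed the byte-sum accumulator a, the high 16 bits seed the running-sum accumulator b, and both are reduced modulo BASE (65521, the largest prime below 2^16). LIMIT (5552) is the usual NMAX bound: the longest run of bytes whose sums still fit in 32-bit accumulators, which is why the asm performs one div-based reduction per LIMIT-byte block instead of per byte. A minimal scalar sketch of the same contract in C (the name adler32_ref is hypothetical, for illustration only):

    #include <stdint.h>

    #define ADLER_BASE 65521u   /* BASE above: largest prime < 2^16 */
    #define ADLER_NMAX 5552u    /* LIMIT above: max bytes before 32-bit overflow */

    /* Hypothetical scalar reference for the routine's contract. */
    uint32_t adler32_ref(uint32_t init, const unsigned char *buf, uint64_t len)
    {
            uint32_t a = init & 0xFFFF;   /* low half: byte sum     */
            uint32_t b = init >> 16;      /* high half: sum of a's  */

            while (len > 0) {
                    uint64_t n = len < ADLER_NMAX ? len : ADLER_NMAX;
                    len -= n;
                    while (n--) {
                            a += *buf++;
                            b += a;
                    }
                    a %= ADLER_BASE;      /* one deferred reduction per block, */
                    b %= ADLER_BASE;      /* like the div-by-BASE pairs above  */
            }
            return (b << 16) | a;
    }

For a whole-buffer checksum the conventional seed is init = 1 (a = 1, b = 0).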
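The subtle step in the vector path is recombining the eight per-lane accumulators. Lane i only ever sees bytes 8t+i, so its b_i undercounts the true running sum; with the initial a seeded into lane 0 only (vmovd xa, init_d), the identity is b = b_init + 8*sum(b_i) - sum(i*a_i), which is exactly what vpslld yb, 3, the vpmulld by A_SCALE = {0..7}, and the final vpsubd implement, while a is just the plain lane sum. A small self-checking C model of that identity, assuming a 64-byte input so no modular reduction is needed (all names illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define W 8                     /* dword lanes per ymm register */

    int main(void)
    {
            unsigned char buf[64];
            for (int i = 0; i < 64; i++)
                    buf[i] = (unsigned char)(i * 37 + 11);  /* arbitrary data */

            uint32_t init_a = 1, init_b = 0;

            /* scalar ground truth */
            uint32_t a = init_a, b = init_b;
            for (int i = 0; i < 64; i++) {
                    a += buf[i];
                    b += a;
            }

            /* per-lane accumulation, as in .sloop1a: lane i takes byte W*t+i */
            uint32_t la[W] = { 0 }, lb[W] = { 0 };
            la[0] = init_a;                          /* vmovd xa, init_d     */
            for (int t = 0; t < 64 / W; t++)
                    for (int i = 0; i < W; i++) {
                            la[i] += buf[t * W + i]; /* vpaddd ya, ya, ydata */
                            lb[i] += la[i];          /* vpaddd yb, yb, ya    */
                    }

            /* recombination, as in the reduce block */
            uint32_t va = 0, vb = init_b, vsa = 0;
            for (int i = 0; i < W; i++) {
                    va  += la[i];                    /* hsum of ya           */
                    vb  += W * lb[i];                /* vpslld yb, 3 + hsum  */
                    vsa += (uint32_t)i * la[i];      /* vpmulld by A_SCALE   */
            }
            vb -= vsa;                               /* vpsubd xb, xb, xsa   */

            printf("scalar a=%u b=%u / vector a=%u b=%u\n",
                   (unsigned)a, (unsigned)b, (unsigned)va, (unsigned)vb);
            return 0;
    }

The two (a, b) pairs printed should match, mirroring how the asm's horizontal sums plus the A_SCALE correction reproduce the serial definition.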