Diffstat (limited to 'src/isa-l/igzip/adler32_sse.asm')
-rw-r--r--  src/isa-l/igzip/adler32_sse.asm  249
1 file changed, 249 insertions(+), 0 deletions(-)
diff --git a/src/isa-l/igzip/adler32_sse.asm b/src/isa-l/igzip/adler32_sse.asm
new file mode 100644
index 000000000..83f577d24
--- /dev/null
+++ b/src/isa-l/igzip/adler32_sse.asm
@@ -0,0 +1,249 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; uint32_t adler32_sse(uint32_t init, const unsigned char *buf, uint64_t len)
+
+%define LIMIT 5552
+%define BASE 0xFFF1 ; 65521
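+
+; Adler-32 maintains two 16-bit sums over the input bytes d[0..n-1]:
+;   a = init_lo + d[0] + d[1] + ... + d[n-1]                          (mod BASE)
+;   b = init_hi + n*init_lo + n*d[0] + (n-1)*d[1] + ... + 1*d[n-1]    (mod BASE)
+; and the checksum returned is (b << 16) | a.  LIMIT is zlib's NMAX: the
+; largest number of bytes that can be accumulated in 32-bit registers
+; before a reduction mod BASE is needed to avoid overflow.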
+
+%include "reg_sizes.asm"
+
+default rel
+[bits 64]
+
+; need to keep free: eax, ecx, edx
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg1 rdi
+ %define arg2 rsi
+ %define arg3 rdx
+
+ %define init_d edi
+ %define data r9
+ %define size r10
+ %define s r11
+ %define a_d r12d
+ %define b_d r8d
+ %define end r13
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ %endmacro
+	%macro FUNC_RESTORE 0
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx
+ %define arg2 rdx
+ %define arg3 r8
+
+ %define init_d r12d
+ %define data r9
+ %define size r10
+ %define s r11
+ %define a_d esi
+ %define b_d edi
+ %define end r13
+
+ %define stack_size 5*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_reg rdi, 0*8
+ save_reg rsi, 1*8
+ save_reg r12, 2*8
+ save_reg r13, 3*8
+ end_prolog
+	mov	init_d, ecx	; initialize init_d from arg1 to keep ecx free
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ mov rdi, [rsp + 0*8]
+ mov rsi, [rsp + 1*8]
+ mov r12, [rsp + 2*8]
+ mov r13, [rsp + 3*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%define xa xmm0
+%define xb xmm1
+%define xdata0 xmm2
+%define xdata1 xmm3
+%define xsa xmm4
+
+global adler32_sse:ISAL_SYM_TYPE_FUNCTION
+func(adler32_sse)
+ FUNC_SAVE
+
+ mov data, arg2
+ mov size, arg3
+
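+	; split the incoming checksum: a (init_d) = low 16 bits, b (b_d) = high 16 bits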
+ mov b_d, init_d
+ shr b_d, 16
+ and init_d, 0xFFFF
+ cmp size, 32
+ jb .lt64
+ movd xa, init_d
+ pxor xb, xb
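+	; xa lane 0 seeds the running 'a'; each xb lane accumulates partial 'b'
+	; sums (xb is bumped once per 4 bytes processed).  Each pass of the loop
+	; below consumes at most LIMIT bytes, then reduces both sums mod BASE.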
+.sloop1:
+ mov s, LIMIT
+ cmp s, size
+ cmova s, size ; s = min(size, LIMIT)
+	lea	end, [data + s - 7]	; 8-byte loop runs only while a full 8 bytes of the block remain
+ cmp data, end
+ jae .skip_loop_1a
+align 32
+.sloop1a:
+ ; do 8 adds
+ pmovzxbd xdata0, [data]
+ pmovzxbd xdata1, [data + 4]
+ add data, 8
+ paddd xa, xdata0
+ paddd xb, xa
+ paddd xa, xdata1
+ paddd xb, xa
+ cmp data, end
+ jb .sloop1a
+
+.skip_loop_1a:
+	add	end, 7		; restore end to the true end of this block
+
+ test s, 7
+ jnz .do_final
+
+ ; either we're done, or we just did LIMIT
+ sub size, s
+
+ ; reduce
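+	; A horizontal sum of xa yields 'a'.  A byte in lane j of 4-byte group g
+	; was counted (G - g + 1) times in xb (G = number of groups), so 4*xb
+	; over-weights it by j; subtracting xa * [0,1,2,3] (A_SCALE) corrects
+	; this before xb is folded into 'b'.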
+ pslld xb, 2 ; b is scaled by 4
+ movdqa xsa, xa ; scaled a
+ pmulld xsa, [A_SCALE]
+
+ phaddd xa, xa
+ phaddd xb, xb
+ phaddd xsa, xsa
+ phaddd xa, xa
+ phaddd xb, xb
+ phaddd xsa, xsa
+
+ movd eax, xa
+ xor edx, edx
+ mov ecx, BASE
+ div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+ mov a_d, edx
+
+ psubd xb, xsa
+ movd eax, xb
+ add eax, b_d
+ xor edx, edx
+ mov ecx, BASE
+ div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+ mov b_d, edx
+
+ test size, size
+ jz .finish
+
+ ; continue loop
+ movd xa, a_d
+ pxor xb, xb
+ jmp .sloop1
+
+.finish:
+ mov eax, b_d
+ shl eax, 16
+ or eax, a_d
+ jmp .end
+
+.lt64:
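+	; fewer than 32 input bytes in total: accumulate byte-by-byte, no SIMD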
+ mov a_d, init_d
+ lea end, [data + size]
+ test size, size
+ jnz .final_loop
+ jmp .zero_size
+
+	; handle the remaining 1...7 bytes
+.do_final:
+ ; reduce
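+	; same lane reduction as above; the mod-BASE reduction is deferred
+	; until after the scalar tail loop (.zero_size)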
+ pslld xb, 2 ; b is scaled by 4
+ movdqa xsa, xa ; scaled a
+ pmulld xsa, [A_SCALE]
+
+ phaddd xa, xa
+ phaddd xb, xb
+ phaddd xsa, xsa
+ phaddd xa, xa
+ phaddd xb, xb
+ phaddd xsa, xsa
+ psubd xb, xsa
+
+ movd a_d, xa
+ movd eax, xb
+ add b_d, eax
+
+align 32
+.final_loop:
+ movzx eax, byte[data]
+ add a_d, eax
+ inc data
+ add b_d, a_d
+ cmp data, end
+ jb .final_loop
+
+.zero_size:
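+	; reduce a and b mod BASE and pack the result as (b << 16) | a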
+ mov eax, a_d
+ xor edx, edx
+ mov ecx, BASE
+ div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+ mov a_d, edx
+
+ mov eax, b_d
+ xor edx, edx
+ mov ecx, BASE
+ div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+ shl edx, 16
+ or edx, a_d
+ mov eax, edx
+
+.end:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+align 32
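+; per-lane byte-position offsets {0, 1, 2, 3} used to correct the 'b' sum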
+A_SCALE:
+ dq 0x0000000100000000, 0x0000000300000002