path: root/src/isa-l/igzip/adler32_avx2_4.asm
author		Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-21 11:54:28 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-21 11:54:28 +0000
commit		e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree		64f88b554b444a49f656b6c656111a145cbbaa28 /src/isa-l/igzip/adler32_avx2_4.asm
parent		Initial commit. (diff)
download	ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
		ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/isa-l/igzip/adler32_avx2_4.asm')
-rw-r--r--	src/isa-l/igzip/adler32_avx2_4.asm	295
1 file changed, 295 insertions, 0 deletions
diff --git a/src/isa-l/igzip/adler32_avx2_4.asm b/src/isa-l/igzip/adler32_avx2_4.asm
new file mode 100644
index 000000000..798310fd0
--- /dev/null
+++ b/src/isa-l/igzip/adler32_avx2_4.asm
@@ -0,0 +1,295 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; uint32_t adler32_avx2(uint32_t init, const unsigned char *buf, uint64_t len)
+
+%define LIMIT 5552
+%define BASE 0xFFF1 ; 65521
+
+%define CHUNKSIZE 16
+%define CHUNKSIZE_M1 (CHUNKSIZE-1)
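+
+; For reference, the scalar checksum this routine vectorizes looks roughly
+; like the following C sketch (illustrative only, assuming the usual
+; zlib-style Adler-32 definition; adler32_ref is not part of this file):
+;
+;    uint32_t adler32_ref(uint32_t init, const unsigned char *buf, uint64_t len)
+;    {
+;            uint32_t a = init & 0xFFFF;     // low half: running byte sum
+;            uint32_t b = init >> 16;        // high half: sum of the running a
+;            while (len) {
+;                    uint64_t s = len < LIMIT ? len : LIMIT;
+;                    len -= s;
+;                    while (s--) {
+;                            a += *buf++;
+;                            b += a;
+;                    }
+;                    a %= BASE;              // reduce once per LIMIT-byte block
+;                    b %= BASE;
+;            }
+;            return (b << 16) | a;
+;    }
+;
+; BASE (65521) is the largest prime below 2^16.  LIMIT is the usual NMAX
+; bound: the largest block length for which the unreduced 32-bit sums cannot
+; overflow, so the modulo only needs to run once per LIMIT bytes.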
+
+%include "reg_sizes.asm"
+
+default rel
+[bits 64]
+
+; need to keep free: eax, ecx, edx
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg1 rdi
+ %define arg2 rsi
+ %define arg3 rdx
+
+ %define init_d edi
+ %define data r9
+ %define size r10
+ %define s r11
+ %define a_d r12d
+ %define b_d r8d
+ %define end r13
+
+ %define func(x) x: endbranch
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx
+ %define arg2 rdx
+ %define arg3 r8
+
+ %define init_d r12d
+ %define data r9
+ %define size r10
+ %define s r11
+ %define a_d esi
+ %define b_d edi
+ %define end r13
+
+ %define stack_size 2*16 + 5*8 ; must be an odd multiple of 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ save_reg rdi, 2*16 + 0*8
+ save_reg rsi, 2*16 + 1*8
+ save_reg r12, 2*16 + 2*8
+ save_reg r13, 2*16 + 3*8
+ end_prolog
+	mov	init_d, ecx	; initialize init_d from arg1 to keep ecx free
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16]
+ vmovdqa xmm7, [rsp + 1*16]
+ mov rdi, [rsp + 2*16 + 0*8]
+ mov rsi, [rsp + 2*16 + 1*8]
+ mov r12, [rsp + 2*16 + 2*8]
+ mov r13, [rsp + 2*16 + 3*8]
+ add rsp, stack_size
+ %endmacro
+%endif
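+
+; Note: on win64 the first argument arrives in rcx, which this routine keeps
+; free for the div instructions below, so FUNC_SAVE copies it into r12d.
+; xmm6/xmm7 and rdi/rsi/r12/r13 are callee-saved in the Windows x64 ABI,
+; hence the explicit save/restore, and the odd multiple of 8 in stack_size
+; restores 16-byte stack alignment after the caller's call pushed the
+; return address.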
+
+%define ya ymm0
+%define yb ymm1
+%define ydata0 ymm2
+%define ydata1 ymm3
+%define ysa ymm4
+%define ydata ysa
+%define ytmp0 ydata0
+%define ytmp1 ydata1
+%define ytmp2 ymm5
+%define xa xmm0
+%define xb xmm1
+%define xtmp0 xmm2
+%define xtmp1 xmm3
+%define xsa xmm4
+%define xtmp2 xmm5
+%define yshuf0 ymm6
+%define yshuf1 ymm7
+
+[bits 64]
+default rel
+section .text
+
+mk_global adler32_avx2_4, function
+func(adler32_avx2_4)
+ FUNC_SAVE
+
+ vmovdqa yshuf0, [SHUF0]
+ vmovdqa yshuf1, [SHUF1]
+
+ mov data, arg2
+ mov size, arg3
+
+ mov b_d, init_d
+ shr b_d, 16
+ and init_d, 0xFFFF
+ cmp size, 32
+ jb .lt64
+ vmovd xa, init_d
+ vpxor yb, yb, yb
+.sloop1:
+ mov s, LIMIT
+ cmp s, size
+ cmova s, size ; s = min(size, LIMIT)
+ lea end, [data + s - CHUNKSIZE_M1]
+ cmp data, end
+ jae .skip_loop_1a
+align 32
+.sloop1a:
+ ; do CHUNKSIZE adds
+ vbroadcastf128 ydata, [data]
+ add data, CHUNKSIZE
+ vpshufb ydata0, ydata, yshuf0
+ vpaddd ya, ya, ydata0
+ vpaddd yb, yb, ya
+ vpshufb ydata1, ydata, yshuf1
+ vpaddd ya, ya, ydata1
+ vpaddd yb, yb, ya
+ cmp data, end
+ jb .sloop1a
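+
+	; In effect, dword lane i of ya accumulates the bytes at positions
+	; i, i+8, i+16, ... of this block, and yb accumulates ya once per
+	; group of 8 bytes.  Per 16-byte chunk the update is roughly:
+	;
+	;     for (i = 0; i < 8; i++) ya[i] += chunk[i];       // via SHUF0
+	;     for (i = 0; i < 8; i++) yb[i] += ya[i];
+	;     for (i = 0; i < 8; i++) ya[i] += chunk[i + 8];   // via SHUF1
+	;     for (i = 0; i < 8; i++) yb[i] += ya[i];
+	;
+	; The per-lane sums are folded back into scalar a and b below.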
+
+.skip_loop_1a:
+ add end, CHUNKSIZE_M1
+
+ test s, CHUNKSIZE_M1
+ jnz .do_final
+
+ ; either we're done, or we just did LIMIT
+ sub size, s
+
+ ; reduce
+ vpslld yb, 3 ; b is scaled by 8
+ vpmulld ysa, ya, [A_SCALE] ; scaled a
+
+ ; compute horizontal sums of ya, yb, ysa
+ vextracti128 xtmp0, ya, 1
+ vextracti128 xtmp1, yb, 1
+ vextracti128 xtmp2, ysa, 1
+ vpaddd xa, xa, xtmp0
+ vpaddd xb, xb, xtmp1
+ vpaddd xsa, xsa, xtmp2
+ vphaddd xa, xa, xa
+ vphaddd xb, xb, xb
+ vphaddd xsa, xsa, xsa
+ vphaddd xa, xa, xa
+ vphaddd xb, xb, xb
+ vphaddd xsa, xsa, xsa
+
+ vmovd eax, xa
+ xor edx, edx
+ mov ecx, BASE
+ div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+ mov a_d, edx
+
+ vpsubd xb, xb, xsa
+ vmovd eax, xb
+ add eax, b_d
+ xor edx, edx
+ mov ecx, BASE
+ div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+ mov b_d, edx
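+
+	; With the lane layout above, the reduction just performed implements
+	; (sketch):
+	;
+	;     a =  horizontal_sum(ya) % BASE;
+	;     b = (b + 8*horizontal_sum(yb)
+	;            - horizontal_sum(A_SCALE[i]*ya[i])) % BASE;
+	;
+	; The factor of 8 accounts for yb being bumped once per 8 bytes rather
+	; than once per byte, and the A_SCALE term corrects for each lane's
+	; byte offset within its group of 8.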
+
+ test size, size
+ jz .finish
+
+ ; continue loop
+ vmovd xa, a_d
+ vpxor yb, yb
+ jmp .sloop1
+
+.finish:
+ mov eax, b_d
+ shl eax, 16
+ or eax, a_d
+ jmp .end
+
+.lt64:
+ mov a_d, init_d
+ lea end, [data + size]
+ test size, size
+ jnz .final_loop
+ jmp .zero_size
+
+ ; handle remaining 1...15 bytes
+.do_final:
+ ; reduce
+ vpslld yb, 3 ; b is scaled by 8
+ vpmulld ysa, ya, [A_SCALE] ; scaled a
+
+ vextracti128 xtmp0, ya, 1
+ vextracti128 xtmp1, yb, 1
+ vextracti128 xtmp2, ysa, 1
+ vpaddd xa, xa, xtmp0
+ vpaddd xb, xb, xtmp1
+ vpaddd xsa, xsa, xtmp2
+ vphaddd xa, xa, xa
+ vphaddd xb, xb, xb
+ vphaddd xsa, xsa, xsa
+ vphaddd xa, xa, xa
+ vphaddd xb, xb, xb
+ vphaddd xsa, xsa, xsa
+ vpsubd xb, xb, xsa
+
+ vmovd a_d, xa
+ vmovd eax, xb
+ add b_d, eax
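+
+	; The vector state is now folded into a_d/b_d (not yet reduced mod
+	; BASE); the trailing bytes fall through to the byte-at-a-time loop
+	; below, roughly:
+	;
+	;     while (data < end) { a += *data++; b += a; }
+	;
+	; and the single modulo for both sums happens at .zero_size.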
+
+align 32
+.final_loop:
+ movzx eax, byte[data]
+ add a_d, eax
+ inc data
+ add b_d, a_d
+ cmp data, end
+ jb .final_loop
+
+.zero_size:
+ mov eax, a_d
+ xor edx, edx
+ mov ecx, BASE
+ div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+ mov a_d, edx
+
+ mov eax, b_d
+ xor edx, edx
+ mov ecx, BASE
+ div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+ shl edx, 16
+ or edx, a_d
+ mov eax, edx
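+
+	; i.e. eax = ((b % BASE) << 16) | (a % BASE), matching the reference
+	; sketch near the top of the file.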
+
+.end:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+align 32
+A_SCALE:
+ dq 0x0000000100000000, 0x0000000300000002
+ dq 0x0000000500000004, 0x0000000700000006
+SHUF0:
+ dq 0xFFFFFF01FFFFFF00, 0xFFFFFF03FFFFFF02
+ dq 0xFFFFFF05FFFFFF04, 0xFFFFFF07FFFFFF06
+SHUF1:
+ dq 0xFFFFFF09FFFFFF08, 0xFFFFFF0BFFFFFF0A
+ dq 0xFFFFFF0DFFFFFF0C, 0xFFFFFF0FFFFFFF0E
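+
+; A_SCALE holds the dword lane indices 0..7 used to weight each lane's a sum
+; in the reduction above.  SHUF0/SHUF1 are vpshufb masks that zero-extend
+; bytes 0..7 and 8..15 of the broadcast 16-byte chunk into the eight dword
+; lanes (selector bytes with the high bit set, 0xFF here, produce zero).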
+