author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-17 16:14:31 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-17 16:14:31 +0000
commit     2d5707c7479eacb3b1ad98e01b53f56a88f8fb78 (patch)
tree       d9c334e83692851c02e3e1b8e65570c97bc82481 /simd-checksum-avx2.S
parent     Initial commit. (diff)
download   rsync-2d5707c7479eacb3b1ad98e01b53f56a88f8fb78.tar.xz
           rsync-2d5707c7479eacb3b1ad98e01b53f56a88f8fb78.zip
Adding upstream version 3.2.7. (tag: upstream/3.2.7)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'simd-checksum-avx2.S')
-rw-r--r--  simd-checksum-avx2.S  177
1 file changed, 177 insertions, 0 deletions
diff --git a/simd-checksum-avx2.S b/simd-checksum-avx2.S
new file mode 100644
index 0000000..549cc3e
--- /dev/null
+++ b/simd-checksum-avx2.S
@@ -0,0 +1,177 @@
+#include "config.h"
+
+#ifdef USE_ROLL_ASM /* { */
+
+#define CHAR_OFFSET 0 /* Keep this the same as rsync.h, which isn't likely to change. */
+
+#ifdef __APPLE__
+#define get_checksum1_avx2_asm _get_checksum1_avx2_asm
+#endif
+
+.intel_syntax noprefix
+.text
+
+ .p2align 5
+ .globl get_checksum1_avx2_asm
+
+# rdi=*buf, esi=len, edx=i, rcx=*ps1, r8=*ps2
+get_checksum1_avx2_asm:
+ vmovd xmm6,[rcx] # load *ps1
+ lea eax, [rsi-128] # at least 128 bytes to process?
+ cmp edx, eax
+ jg .exit
+ lea rax, .mul_T2[rip]
+ vmovntdqa ymm7, [rax] # load T2 multiplication constants
+ vmovntdqa ymm12,[rax+32]# from memory.
+ vpcmpeqd ymm15, ymm15, ymm15 # set all elements to -1.
+
+#if CHAR_OFFSET != 0
+ mov eax, 32*CHAR_OFFSET
+ vmovd xmm10, eax
+ vpbroadcastd ymm10, xmm10
+ mov eax, 528*CHAR_OFFSET
+ vmovd xmm13, eax
+ vpbroadcastd ymm13, xmm13
+#endif
+ vpabsb ymm15, ymm15 # set all byte size elements to 1.
+ add rdi, rdx
+ vmovdqu ymm2, [rdi] # preload the first 64 bytes.
+ vmovdqu ymm3, [rdi+32]
+ and esi, ~63 # only needed during final reduction,
+ # done here to avoid a longer nop for
+ # alignment below.
+ add edx, esi
+ shr rsi, 6 # longer opcode for alignment
+ add rdi, 64
+ vpxor xmm1, xmm1, xmm1 # reset both partial sums accumulators.
+ vpxor xmm4, xmm4, xmm4
+ mov eax, [r8]
+ .p2align 4 # should fit into the LSD allocation queue.
+.loop:
+ vpmaddubsw ymm0, ymm15, ymm2 # s1 partial sums
+ vpmaddubsw ymm5, ymm15, ymm3
+ vmovdqu ymm8, [rdi] # preload the next
+ vmovdqu ymm9, [rdi+32] # 64 bytes.
+ add rdi, 64
+ vpaddd ymm4, ymm4, ymm6
+ vpaddw ymm5, ymm5, ymm0
+ vpsrld ymm0, ymm5, 16
+ vpaddw ymm5, ymm0, ymm5
+ vpaddd ymm6, ymm5, ymm6
+ vpmaddubsw ymm2, ymm7, ymm2 # s2 partial sums
+ vpmaddubsw ymm3, ymm12, ymm3
+ prefetcht0 [rdi+384] # prefetch 6 cachelines ahead.
+ vpaddw ymm3, ymm2, ymm3
+ vpsrldq ymm2, ymm3, 2
+ vpaddd ymm3, ymm2, ymm3
+ vpaddd ymm1, ymm1, ymm3
+
+#if CHAR_OFFSET != 0
+ vpaddd ymm6, ymm10, ymm6 # 32*CHAR_OFFSET
+ vpaddd ymm1, ymm13, ymm1 # 528*CHAR_OFFSET
+#endif
+ vmovdqa ymm2, ymm8 # move the next 64 bytes
+ vmovdqa ymm3, ymm9 # into the right registers
+ sub esi, 1
+ jnz .loop
+
+ # now we reduce the partial sums.
+ vpslld ymm3, ymm4, 6
+ vpsrldq ymm2, ymm6, 4
+
+ vpaddd ymm0, ymm3, ymm1
+ vpaddd ymm6, ymm2, ymm6
+ vpsrlq ymm3, ymm0, 32
+
+ vpsrldq ymm2, ymm6, 8
+ vpaddd ymm0, ymm3, ymm0
+ vpsrldq ymm3, ymm0, 8
+ vpaddd ymm6, ymm2, ymm6
+ vpaddd ymm0, ymm3, ymm0
+ vextracti128 xmm2, ymm6, 0x1
+ vextracti128 xmm1, ymm0, 0x1
+ vpaddd xmm6, xmm2, xmm6
+ vmovd [rcx], xmm6
+ vpaddd xmm1, xmm1, xmm0
+ vmovd ecx, xmm1
+ add eax, ecx
+ mov [r8], eax
+.exit:
+ vzeroupper
+ mov eax, edx
+ ret
+
+#ifdef __APPLE__
+.data
+ .align 6
+#else
+.section .rodata
+ .p2align 6
+#endif
+.mul_T2:
+ .byte 64
+ .byte 63
+ .byte 62
+ .byte 61
+ .byte 60
+ .byte 59
+ .byte 58
+ .byte 57
+ .byte 56
+ .byte 55
+ .byte 54
+ .byte 53
+ .byte 52
+ .byte 51
+ .byte 50
+ .byte 49
+ .byte 48
+ .byte 47
+ .byte 46
+ .byte 45
+ .byte 44
+ .byte 43
+ .byte 42
+ .byte 41
+ .byte 40
+ .byte 39
+ .byte 38
+ .byte 37
+ .byte 36
+ .byte 35
+ .byte 34
+ .byte 33
+ .byte 32
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 7
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 3
+ .byte 2
+ .byte 1
+
+#endif /* } USE_ROLL_ASM */
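
As a reading aid (not part of the commit above): the routine maintains rsync's rolling-checksum recurrence over signed bytes, s1 += buf[i] + CHAR_OFFSET and s2 += s1, but consumes the buffer 64 bytes per loop iteration. The C sketch below was written for this note and is only an illustration; the names checksum1_block_ref and checksum1_block_weighted are hypothetical, not rsync APIs, and CHAR_OFFSET is pinned to 0 as in the header above. It shows why .mul_T2 holds the weights 64..1: over one 64-byte block the byte-at-a-time recurrence collapses to s1' = s1 + sum(b[j]) and s2' = s2 + 64*s1 + sum((64-j)*b[j]), which is what the vpmaddubsw "s2 partial sums" compute; roughly speaking, the 64*s1 carry term shows up as the shift-left-by-6 (x64) step in the final reduction.

#include <stdint.h>

#define CHAR_OFFSET 0   /* kept equal to rsync.h, as the asm comment notes */

/* Byte-at-a-time reference for one 64-byte block (the definition the
 * vector code has to match). */
static void checksum1_block_ref(const signed char *b, uint32_t *ps1, uint32_t *ps2)
{
    uint32_t s1 = *ps1, s2 = *ps2;
    for (int j = 0; j < 64; j++) {
        s1 += b[j] + CHAR_OFFSET;   /* running byte sum                    */
        s2 += s1;                   /* sum of all intermediate s1 values   */
    }
    *ps1 = s1;
    *ps2 = s2;
}

/* The same block in the weighted form the AVX2 code uses (assumes
 * CHAR_OFFSET == 0, as configured above). */
static void checksum1_block_weighted(const signed char *b, uint32_t *ps1, uint32_t *ps2)
{
    uint32_t sum = 0, wsum = 0;
    for (int j = 0; j < 64; j++) {
        sum  += b[j];               /* plain byte sum feeds s1             */
        wsum += (64 - j) * b[j];    /* .mul_T2 weights 64..1 feed s2       */
    }
    *ps2 += 64 * *ps1 + wsum;       /* 64*s1_old plus the weighted sum     */
    *ps1 += sum;
}

For any 64-byte block and identical starting sums, both functions leave *ps1 and *ps2 equal (all arithmetic wraps mod 2^32), which makes the scalar reference a convenient cross-check for the vectorized path.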