Diffstat (limited to 'simd-checksum-avx2.S')
-rw-r--r--  simd-checksum-avx2.S  177
1 file changed, 177 insertions, 0 deletions
diff --git a/simd-checksum-avx2.S b/simd-checksum-avx2.S
new file mode 100644
index 0000000..549cc3e
--- /dev/null
+++ b/simd-checksum-avx2.S
@@ -0,0 +1,177 @@
+#include "config.h"
+
+#ifdef USE_ROLL_ASM /* { */
+
+#define CHAR_OFFSET 0 /* Keep this the same as rsync.h, which isn't likely to change. */
+
+#ifdef __APPLE__
+#define get_checksum1_avx2_asm _get_checksum1_avx2_asm
+#endif
+
+.intel_syntax noprefix
+.text
+
+        .p2align 5
+        .globl get_checksum1_avx2_asm
+
+# rdi=*buf, esi=len, edx=i, rcx= *ps1, r8= *ps2
+get_checksum1_avx2_asm:
+        vmovd   xmm6, [rcx]             # load *ps1
+        lea     eax, [rsi-128]          # at least 128 bytes to process?
+        cmp     edx, eax
+        jg      .exit
+        lea     rax, .mul_T2[rip]
+        vmovntdqa ymm7, [rax]           # load T2 multiplication constants
+        vmovntdqa ymm12, [rax+32]       # from memory.
+        vpcmpeqd ymm15, ymm15, ymm15    # set all elements to -1.
+
+#if CHAR_OFFSET != 0
+        mov     eax, 32*CHAR_OFFSET
+        vmovd   xmm10, eax
+        vpbroadcastd ymm10, xmm10
+        mov     eax, 528*CHAR_OFFSET
+        vmovd   xmm13, eax
+        vpbroadcastd ymm13, xmm13
+#endif
+        vpabsb  ymm15, ymm15            # set all byte size elements to 1.
+        add     rdi, rdx
+        vmovdqu ymm2, [rdi]             # preload the first 64 bytes.
+        vmovdqu ymm3, [rdi+32]
+        and     esi, ~63                # only needed during final reduction,
+                                        # done here to avoid a longer nop for
+                                        # alignment below.
+        add     edx, esi
+        shr     rsi, 6                  # longer opcode for alignment
+        add     rdi, 64
+        vpxor   xmm1, xmm1, xmm1        # reset both partial sums accumulators.
+        vpxor   xmm4, xmm4, xmm4
+        mov     eax, [r8]
+        .p2align 4                      # should fit into the LSD allocation queue.
+.loop:
+        vpmaddubsw ymm0, ymm15, ymm2    # s1 partial sums
+        vpmaddubsw ymm5, ymm15, ymm3
+        vmovdqu ymm8, [rdi]             # preload the next
+        vmovdqu ymm9, [rdi+32]          # 64 bytes.
+        add     rdi, 64
+        vpaddd  ymm4, ymm4, ymm6
+        vpaddw  ymm5, ymm5, ymm0
+        vpsrld  ymm0, ymm5, 16
+        vpaddw  ymm5, ymm0, ymm5
+        vpaddd  ymm6, ymm5, ymm6
+        vpmaddubsw ymm2, ymm7, ymm2     # s2 partial sums
+        vpmaddubsw ymm3, ymm12, ymm3
+        prefetcht0 [rdi+384]            # prefetch 6 cachelines ahead.
+        vpaddw  ymm3, ymm2, ymm3
+        vpsrldq ymm2, ymm3, 2
+        vpaddd  ymm3, ymm2, ymm3
+        vpaddd  ymm1, ymm1, ymm3
+
+#if CHAR_OFFSET != 0
+        vpaddd  ymm6, ymm10, ymm6       # 32*CHAR_OFFSET
+        vpaddd  ymm1, ymm13, ymm1       # 528*CHAR_OFFSET
+#endif
+        vmovdqa ymm2, ymm8              # move the next 64 bytes
+        vmovdqa ymm3, ymm9              # into the right registers
+        sub     esi, 1
+        jnz     .loop
+
+        # now we reduce the partial sums.
+        vpslld  ymm3, ymm4, 6
+        vpsrldq ymm2, ymm6, 4
+
+        vpaddd  ymm0, ymm3, ymm1
+        vpaddd  ymm6, ymm2, ymm6
+        vpsrlq  ymm3, ymm0, 32
+
+        vpsrldq ymm2, ymm6, 8
+        vpaddd  ymm0, ymm3, ymm0
+        vpsrldq ymm3, ymm0, 8
+        vpaddd  ymm6, ymm2, ymm6
+        vpaddd  ymm0, ymm3, ymm0
+        vextracti128 xmm2, ymm6, 0x1
+        vextracti128 xmm1, ymm0, 0x1
+        vpaddd  xmm6, xmm2, xmm6
+        vmovd   [rcx], xmm6
+        vpaddd  xmm1, xmm1, xmm0
+        vmovd   ecx, xmm1
+        add     eax, ecx
+        mov     [r8], eax
+.exit:
+        vzeroupper
+        mov     eax, edx
+        ret
+
+#ifdef __APPLE__
+.data
+        .align 6
+#else
+.section .rodata
+        .p2align 6
+#endif
+.mul_T2:
+        .byte 64
+        .byte 63
+        .byte 62
+        .byte 61
+        .byte 60
+        .byte 59
+        .byte 58
+        .byte 57
+        .byte 56
+        .byte 55
+        .byte 54
+        .byte 53
+        .byte 52
+        .byte 51
+        .byte 50
+        .byte 49
+        .byte 48
+        .byte 47
+        .byte 46
+        .byte 45
+        .byte 44
+        .byte 43
+        .byte 42
+        .byte 41
+        .byte 40
+        .byte 39
+        .byte 38
+        .byte 37
+        .byte 36
+        .byte 35
+        .byte 34
+        .byte 33
+        .byte 32
+        .byte 31
+        .byte 30
+        .byte 29
+        .byte 28
+        .byte 27
+        .byte 26
+        .byte 25
+        .byte 24
+        .byte 23
+        .byte 22
+        .byte 21
+        .byte 20
+        .byte 19
+        .byte 18
+        .byte 17
+        .byte 16
+        .byte 15
+        .byte 14
+        .byte 13
+        .byte 12
+        .byte 11
+        .byte 10
+        .byte 9
+        .byte 8
+        .byte 7
+        .byte 6
+        .byte 5
+        .byte 4
+        .byte 3
+        .byte 2
+        .byte 1
+
+#endif /* } USE_ROLL_ASM */
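
For readers mapping the vector code back to what it computes: below is a minimal scalar sketch of the s1/s2 running sums that the main loop above vectorizes 64 bytes per iteration (the all-ones vector in ymm15 feeds the s1 partial sums, the descending 64..1 weights in .mul_T2 feed the s2 partial sums). This is an illustrative reference written for this page, not rsync's actual checksum code; the function name and the simplified bounds handling are assumptions.

    /* Illustrative scalar reference for the AVX2 routine above (assumed name,
     * simplified bounds; not the actual rsync source). Per input byte the
     * rolling checksum does: s1 += byte + CHAR_OFFSET; s2 += s1. The assembly
     * produces the same totals by consuming 64-byte blocks per iteration. */
    #include <stdint.h>

    #define CHAR_OFFSET 0   /* matches the #define in the .S file */

    static int32_t checksum1_blocks_ref(const signed char *buf, int32_t len,
                                        int32_t i, uint32_t *ps1, uint32_t *ps2)
    {
        uint32_t s1 = *ps1, s2 = *ps2;

        /* Whole 64-byte blocks only; the caller finishes the tail bytes. */
        for (; i + 64 <= len; i += 64) {
            for (int j = 0; j < 64; j++) {
                s1 += (uint32_t)(buf[i + j] + CHAR_OFFSET);
                s2 += s1;
            }
        }
        *ps1 = s1;
        *ps2 = s2;
        return i;   /* updated offset, analogous to the value returned in eax */
    }

Written out for one block b0..b63 entering with s1 = S, s2 grows by 64*S + 64*b0 + 63*b1 + ... + 1*b63 (plus the CHAR_OFFSET terms), which is why .mul_T2 holds the weights 64 down to 1 and why the reduction above shifts the per-block s1 accumulator left by 6 (a multiply by 64).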
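
The register comment at the top of the routine (rdi=*buf, esi=len, edx=i, rcx=*ps1, r8=*ps2, result in eax) suggests a C-level interface along the following lines. The prototype and the caller shown here are assumptions for illustration; in rsync the real declaration and dispatch live in the SIMD wrapper source, not in this file.

    #include <stdint.h>

    #define CHAR_OFFSET 0   /* as in the .S file */

    /* Assumed prototype, inferred from the SysV argument registers noted above. */
    extern int32_t get_checksum1_avx2_asm(const char *buf, int32_t len, int32_t i,
                                          uint32_t *ps1, uint32_t *ps2);

    /* Hypothetical caller: let the assembly consume whole 64-byte blocks, then
     * finish the remaining bytes with the plain per-byte loop. */
    static uint32_t checksum1_via_avx2(const char *buf, int32_t len)
    {
        uint32_t s1 = 0, s2 = 0;
        int32_t i = get_checksum1_avx2_asm(buf, len, 0, &s1, &s2);

        for (; i < len; i++) {
            s1 += (uint32_t)((signed char)buf[i] + CHAR_OFFSET);
            s2 += s1;
        }
        return (s1 & 0xffff) + (s2 << 16);   /* combine as rsync's scalar routine does */
    }

Splitting the work this way matches what the assembly itself enforces: it bails out early unless at least 128 bytes remain past i, and it only consumes multiples of 64 bytes, so a scalar tail loop is always needed.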