diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
commit | 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch) | |
tree | 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/isa-l/igzip/igzip_gen_icf_map_lh1_04.asm | |
parent | Initial commit. (diff) | |
download | ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip |
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/isa-l/igzip/igzip_gen_icf_map_lh1_04.asm')
-rw-r--r-- | src/isa-l/igzip/igzip_gen_icf_map_lh1_04.asm | 741 |
1 files changed, 741 insertions, 0 deletions
diff --git a/src/isa-l/igzip/igzip_gen_icf_map_lh1_04.asm b/src/isa-l/igzip/igzip_gen_icf_map_lh1_04.asm new file mode 100644 index 000000000..9b5e85ae3 --- /dev/null +++ b/src/isa-l/igzip/igzip_gen_icf_map_lh1_04.asm @@ -0,0 +1,741 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" +%include "lz0a_const.asm" +%include "data_struct2.asm" +%include "huffman.asm" + + +%define USE_HSWNI +%define ARCH 04 + +%ifidn __OUTPUT_FORMAT__, win64 +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define hash rsi +%define next_in rdi +%else +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define hash r8 +%define next_in rcx +%endif + +%define stream arg1 +%define level_buf arg1 +%define matches_next arg2 +%define f_i_end arg3 + +%define f_i rax +%define file_start rbp +%define tmp r9 +%define tmp2 r10 +%define prev_len r11 +%define prev_dist r12 +%define f_i_orig r13 + +%define hash_table level_buf + _hash_map_hash_table + +%define datas ymm0 +%define datas_lookup ymm1 +%define yhashes ymm2 +%define ydists ymm3 +%define ydists_lookup ymm4 + +%define ydownconvert_qd ymm5 +%define ydists2 ymm5 +%define yscatter ymm5 +%define ytmp2 ymm5 +%define ynull_syms ymm5 + +%define ylens1 ymm6 +%define ylens2 ymm7 +%define ylookup ymm8 +%define ylookup2 ymm9 +%define yindex ymm10 + +%define yrot_left ymm11 +%define yshift_finish ymm11 +%define yqword_shuf ymm11 +%define yhash_prod ymm11 +%define ycode ymm11 +%define ytmp3 ymm11 + +%define yones ymm12 +%define ydatas_perm2 ymm13 +%define yincrement ymm14 + +%define ytmp ymm15 +%define ydist_extra ymm15 +%define yhash_mask ymm15 +%define ydist_mask ymm15 + +%ifidn __OUTPUT_FORMAT__, win64 +%define stack_size 10*16 + 6 * 8 + 3 * 8 +%define local_storage_offset (stack_size - 16) +%define func(x) proc_frame x + +%macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm12 + vmovdqa [rsp + 7*16], xmm13 + vmovdqu [rsp + 8*16], xmm14 + vmovdqa [rsp + 9*16], xmm15 + save_reg rsi, 10*16 + 0*8 + save_reg rdi, 10*16 + 1*8 + save_reg rbp, 10*16 + 2*8 + save_reg r12, 10*16 + 3*8 + save_reg r13, 10*16 + 4*8 + end_prolog +%endm + +%macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm12, [rsp + 6*16] + vmovdqa xmm13, [rsp + 7*16] + vmovdqa xmm14, [rsp + 8*16] + vmovdqa xmm15, [rsp + 9*16] + + mov rsi, [rsp + 10*16 + 0*8] + mov rdi, [rsp + 10*16 + 1*8] + mov rbp, [rsp + 10*16 + 2*8] + mov r12, [rsp + 10*16 + 3*8] + mov r13, [rsp + 10*16 + 4*8] + add rsp, stack_size +%endm +%else +%define stack_size 16 +%define local_storage_offset 0 + +%define func(x) x: +%macro FUNC_SAVE 0 + push rbp + push r12 + push r13 + sub rsp, stack_size +%endm + +%macro FUNC_RESTORE 0 + add rsp, stack_size + pop r13 + pop r12 + pop rbp +%endm +%endif + +%define dist_mask_offset local_storage_offset +%define hash_mask_offset local_storage_offset + 8 + +%define VECT_SIZE 8 +%define HASH_BYTES 2 + +global gen_icf_map_lh1_04 +func(gen_icf_map_lh1_04) + FUNC_SAVE + + mov file_start, [stream + _next_in] + mov f_i %+ d, dword [stream + _total_in] + mov f_i_orig, f_i + + sub file_start, f_i + add f_i_end, f_i + cmp f_i, f_i_end + jge end_main + +;; Prep for main loop + mov tmp %+ d, dword [stream + _internal_state_dist_mask] + mov [rsp + dist_mask_offset], tmp + mov tmp %+ d, dword [stream + _internal_state_hash_mask] + mov [rsp + hash_mask_offset], tmp + mov tmp, stream + mov level_buf, [stream + _level_buf] + sub f_i_end, LA + vmovdqu yincrement, [increment] + vpbroadcastd yones, [ones] + vmovdqu ydatas_perm2, [datas_perm2] + +;; Process first byte + vpbroadcastd yhash_prod, [hash_prod] + vpbroadcastd yhash_mask, [rsp + hash_mask_offset] + vmovd yhashes %+ x, dword [f_i + file_start] + vpmaddwd yhashes, yhashes, yhash_prod + vpmaddwd yhashes, yhashes, yhash_prod + vpand yhashes, yhashes, yhash_mask + vmovd hash %+ d, yhashes %+ x + cmp byte [tmp + _internal_state_has_hist], IGZIP_NO_HIST + jne .has_hist + ;; No history, the byte is a literal + xor prev_len, prev_len + xor prev_dist, prev_dist + mov byte [tmp + _internal_state_has_hist], IGZIP_HIST + jmp .byte_processed + +.has_hist: + ;; History exists, need to set prev_len and prev_dist accordingly + lea next_in, [f_i + file_start] + + ;; Determine match lookback distance + xor tmp, tmp + mov tmp %+ w, f_i %+ w + dec tmp + sub tmp %+ w, word [hash_table + HASH_BYTES * hash] + + and tmp %+ d, [rsp + dist_mask_offset] + neg tmp + + ;; Check first 8 bytes of match + mov prev_len, [next_in] + xor prev_len, [next_in + tmp - 1] + neg tmp + + ;; Set prev_dist +%ifidn arg1, rcx + mov tmp2, rcx +%endif + ;; The third register is unused on Haswell and later, + ;; This line will not work on previous architectures + get_dist_icf_code tmp, prev_dist, tmp + +%ifidn arg1, rcx + mov rcx, tmp2 +%endif + + ;; Set prev_len + xor tmp2, tmp2 + tzcnt prev_len, prev_len + shr prev_len, 3 + cmp prev_len, MIN_DEF_MATCH + cmovl prev_len, tmp2 + +.byte_processed: + mov word [hash_table + HASH_BYTES * hash], f_i %+ w + + add f_i, 1 + +;;hash + vmovdqu datas, [f_i + file_start] + vpermq yhashes, datas, 0x44 + vpshufb yhashes, yhashes, [datas_shuf] + vpmaddwd yhashes, yhashes, yhash_prod + vpmaddwd yhashes, yhashes, yhash_prod + vpand yhashes, yhashes, yhash_mask + + vpermq ylookup, datas, 0x44 + vmovdqu yqword_shuf, [qword_shuf] + vpshufb ylookup, ylookup, yqword_shuf + vpermd ylookup2, ydatas_perm2, datas + vpshufb ylookup2, ylookup2, yqword_shuf + +;;gather/scatter hashes + vpcmpeqq ytmp, ytmp, ytmp + vpgatherdd ydists_lookup, [hash_table + HASH_BYTES * yhashes], ytmp + + vpbroadcastd ytmp2, [upper_word] + vpbroadcastd ytmp, [low_word] + vmovd yindex %+ x, f_i %+ d + vpbroadcastd yindex, yindex %+ x + vpaddd yindex, yindex, yincrement + vpand yscatter, ydists_lookup, ytmp2 + vpand ytmp, yindex, ytmp + vpor yscatter, yscatter, ytmp + + vmovd tmp %+ d, yhashes %+ x + vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x + vpextrd tmp %+ d, yhashes %+ x, 1 + vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1 + vpextrd tmp %+ d, yhashes %+ x, 2 + vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2 + vpextrd tmp %+ d,yhashes %+ x, 3 + vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3 + + vextracti128 yscatter %+ x, yscatter, 1 + vextracti128 yhashes %+ x, yhashes, 1 + + vmovd tmp %+ d, yhashes %+ x + vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x + vpextrd tmp %+ d, yhashes %+ x, 1 + vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1 + vpextrd tmp %+ d, yhashes %+ x, 2 + vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2 + vpextrd tmp %+ d,yhashes %+ x, 3 + vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3 + +;; Compute hash for next loop + vpbroadcastd yhash_prod, [hash_prod] + vpbroadcastd yhash_mask, [rsp + hash_mask_offset] + vmovdqu datas, [f_i + file_start + VECT_SIZE] + vpermq yhashes, datas, 0x44 + vpshufb yhashes, yhashes, [datas_shuf] + vpmaddwd yhashes, yhashes, yhash_prod + vpmaddwd yhashes, yhashes, yhash_prod + vpand yhashes, yhashes, yhash_mask + + vmovdqu datas_lookup, [f_i + file_start + 2 * VECT_SIZE] + + sub f_i_end, VECT_SIZE + cmp f_i, f_i_end + jg .loop1_end + +.loop1: + lea next_in, [f_i + file_start] + +;; Calculate look back dists + vpbroadcastd ydist_mask, [rsp + dist_mask_offset] + vpaddd ydists, ydists_lookup, yones + vpsubd ydists, yindex, ydists + vpand ydists, ydists, ydist_mask + vpaddd ydists, ydists, yones + vpsubd ydists, yincrement, ydists + +;;gather/scatter hashes + add f_i, VECT_SIZE + + vpcmpeqq ytmp, ytmp, ytmp + vpgatherdd ydists_lookup, [hash_table + HASH_BYTES * yhashes], ytmp + + vpbroadcastd ytmp2, [upper_word] + vpbroadcastd ytmp, [low_word] + vmovd yindex %+ x, f_i %+ d + vpbroadcastd yindex, yindex %+ x + vpaddd yindex, yindex, yincrement + vpand yscatter, ydists_lookup, ytmp2 + vpand ytmp, yindex, ytmp + vpor yscatter, yscatter, ytmp + + vmovd tmp %+ d, yhashes %+ x + vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x + vpextrd tmp %+ d, yhashes %+ x, 1 + vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1 + vpextrd tmp %+ d, yhashes %+ x, 2 + vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2 + vpextrd tmp %+ d,yhashes %+ x, 3 + vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3 + + vextracti128 yscatter %+ x, yscatter, 1 + vextracti128 yhashes %+ x, yhashes, 1 + + vmovd tmp %+ d, yhashes %+ x + vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x + vpextrd tmp %+ d, yhashes %+ x, 1 + vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1 + vpextrd tmp %+ d, yhashes %+ x, 2 + vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2 + vpextrd tmp %+ d,yhashes %+ x, 3 + vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3 + +;; Compute hash for next loop + vpbroadcastd yhash_prod, [hash_prod] + vpbroadcastd yhash_mask, [rsp + hash_mask_offset] + vpermq yhashes, datas_lookup, 0x44 + vpshufb yhashes, yhashes, [datas_shuf] + vpmaddwd yhashes, yhashes, yhash_prod + vpmaddwd yhashes, yhashes, yhash_prod + vpand yhashes, yhashes, yhash_mask + +;;lookup old codes + vextracti128 ydists2 %+ x, ydists, 1 + + vpcmpeqq ytmp, ytmp, ytmp + vpgatherdq ylens1, [next_in + ydists %+ x], ytmp + vpcmpeqq ytmp, ytmp, ytmp + vpgatherdq ylens2, [next_in + ydists2 %+ x], ytmp + +;; Calculate dist_icf_code + vpaddd ydists, ydists, yones + vpsubd ydists, yincrement, ydists + + vpbroadcastd ytmp2, [low_nibble] + vbroadcasti128 ytmp3, [nibble_order] + vpslld ydist_extra, ydists, 12 + vpor ydist_extra, ydists, ydist_extra + vpand ydist_extra, ydist_extra, ytmp2 + vpshufb ydist_extra, ydist_extra, ytmp3 + vbroadcasti128 ytmp2, [bit_index] + vpshufb ydist_extra, ytmp2, ydist_extra + vpxor ytmp2, ytmp2, ytmp2 + vpcmpgtb ytmp2, ydist_extra, ytmp2 + vpsrld ytmp3, ytmp2, 8 + vpandn ytmp2, ytmp3, ytmp2 + vpsrld ytmp3, ytmp2, 16 + vpandn ytmp2, ytmp3, ytmp2 + vpsrld ytmp3, ytmp2, 24 + vpandn ytmp2, ytmp3, ytmp2 + vpbroadcastd ytmp3, [base_offset] + vpaddb ydist_extra, ytmp3 + vpand ydist_extra, ydist_extra, ytmp2 + vpsrlq ytmp2, ydist_extra, 32 + vpxor ytmp3, ytmp3, ytmp3 + vpsadbw ydist_extra, ydist_extra, ytmp3 + vpsadbw ytmp2, ytmp2, ytmp3 + vpsubd ydist_extra, ydist_extra, ytmp2 + vpsllq ytmp2, ytmp2, 32 + vpor ydist_extra, ydist_extra, ytmp2 + vpcmpgtb ytmp3, ydist_extra, ytmp3 + vpand ydist_extra, ydist_extra, ytmp3 + + vpsllvd ycode, yones, ydist_extra + vpsubd ycode, ycode, yones + vpcmpgtd ytmp2, ydists, yones + vpand ycode, ydists, ycode + vpand ycode, ycode, ytmp2 + vpsrlvd ydists, ydists, ydist_extra + vpslld ydist_extra, ydist_extra, 1 + vpaddd ydists, ydists, ydist_extra + vpslld ycode, ycode, EXTRA_BITS_OFFSET - DIST_OFFSET + vpaddd ydists, ydists, ycode + +;; Setup ydists for combining with ylens + vpslld ydists, ydists, DIST_OFFSET + +;; xor current data with lookback dist + vpxor ylens1, ylens1, ylookup + vpxor ylens2, ylens2, ylookup2 + +;; Setup registers for next loop + vpermq ylookup, datas, 0x44 + vmovdqu yqword_shuf, [qword_shuf] + vpshufb ylookup, ylookup, yqword_shuf + vpermd ylookup2, ydatas_perm2, datas + vpshufb ylookup2, ylookup2, yqword_shuf + +;; Compute match length + vpxor ytmp, ytmp, ytmp + vpcmpeqb ylens1, ylens1, ytmp + vpcmpeqb ylens2, ylens2, ytmp + vpbroadcastq yshift_finish, [shift_finish] + vpand ylens1, ylens1, yshift_finish + vpand ylens2, ylens2, yshift_finish + vpsadbw ylens1, ylens1, ytmp + vpsadbw ylens2, ylens2, ytmp + vmovdqu ydownconvert_qd, [downconvert_qd] + vpshufb ylens1, ylens1, ydownconvert_qd + vextracti128 ytmp %+ x, ylens1, 1 + vpor ylens1, ylens1, ytmp + vpshufb ylens2, ylens2, ydownconvert_qd + vextracti128 ytmp %+ x, ylens2, 1 + vpor ylens2, ylens2, ytmp + vinserti128 ylens1, ylens1, ylens2 %+ x, 1 + vpbroadcastd ytmp, [low_nibble] + vpsrld ylens2, ylens1, 4 + vpand ylens1, ylens1, ytmp + vbroadcasti128 ytmp, [match_cnt_perm] + vpbroadcastd ytmp2, [match_cnt_low_max] + vpshufb ylens1, ytmp, ylens1 + vpshufb ylens2, ytmp, ylens2 + vpcmpeqb ytmp, ylens1, ytmp2 + vpand ylens2, ylens2, ytmp + vpaddd ylens1, ylens1, ylens2 + +;; Preload for next loops + vmovdqu datas, datas_lookup + vmovdqu datas_lookup, [f_i + file_start + 2 * VECT_SIZE] + +;; Zero out matches which should not be taken + vmovdqu yrot_left, [drot_left] + vpermd ylens2, yrot_left, ylens1 + vpermd ydists, yrot_left, ydists + + vpinsrd ytmp %+ x, ylens2 %+ x, prev_len %+ d, 0 + vmovd prev_len %+ d, ylens2 %+ x + vinserti128 ylens2, ylens2, ytmp %+ x, 0 + + vpinsrd ytmp %+ x, ydists %+ x, prev_dist %+ d, 0 + vmovd prev_dist %+ d, ydists %+ x + vinserti128 ydists, ydists, ytmp %+ x, 0 + + vpbroadcastd ytmp, [shortest_matches] + vpcmpgtd ytmp, ylens2, ytmp + vpcmpgtd ytmp2, ylens1, ylens2 + + vpcmpeqd ytmp3, ytmp3, ytmp3 + vpxor ytmp, ytmp, ytmp3 + vpor ytmp, ytmp, ytmp2 + + vpandn ylens1, ytmp, ylens2 + +;; Update zdists to match ylens1 + vpbroadcastd ytmp2, [twofiftyfour] + vpaddd ydists, ydists, ylens1 + vpaddd ydists, ydists, ytmp2 + + vpbroadcastd ynull_syms, [null_dist_syms] + vpmovzxbd ytmp3, [f_i + file_start - VECT_SIZE - 1] + vpaddd ytmp3, ynull_syms + vpand ytmp3, ytmp3, ytmp + vpandn ydists, ytmp, ydists + vpor ydists, ydists, ytmp3 + +;;Store ydists + vmovdqu [matches_next], ydists + add matches_next, ICF_CODE_BYTES * VECT_SIZE + + cmp f_i, f_i_end + jle .loop1 + +.loop1_end: + lea next_in, [f_i + file_start] + +;; Calculate look back dists + vpbroadcastd ydist_mask, [rsp + dist_mask_offset] + vpaddd ydists, ydists_lookup, yones + vpsubd ydists, yindex, ydists + vpand ydists, ydists, ydist_mask + vpaddd ydists, ydists, yones + vpsubd ydists, yincrement, ydists + +;;lookup old codes + vextracti128 ydists2 %+ x, ydists, 1 + vpcmpeqq ytmp, ytmp, ytmp + vpgatherdq ylens1, [next_in + ydists %+ x], ytmp + vpcmpeqq ytmp, ytmp, ytmp + vpgatherdq ylens2, [next_in + ydists2 %+ x], ytmp + +;; Restore last update hash value + vpextrd tmp %+ d, ydists2 %+ x, 3 + add tmp %+ d, f_i %+ d + + vpbroadcastd yhash_prod %+ x, [hash_prod] + vpbroadcastd yhash_mask %+ x, [rsp + hash_mask_offset] + + vmovd yhashes %+ x, dword [f_i + file_start + VECT_SIZE - 1] + vpmaddwd yhashes %+ x, yhashes %+ x, yhash_prod %+ x + vpmaddwd yhashes %+ x, yhashes %+ x, yhash_prod %+ x + vpand yhashes %+ x, yhashes %+ x, yhash_mask %+ x + vmovd hash %+ d, yhashes %+ x + + mov word [hash_table + HASH_BYTES * hash], tmp %+ w + +;; Calculate dist_icf_code + vpaddd ydists, ydists, yones + vpsubd ydists, yincrement, ydists + + vpbroadcastd ytmp2, [low_nibble] + vbroadcasti128 ytmp3, [nibble_order] + vpslld ydist_extra, ydists, 12 + vpor ydist_extra, ydists, ydist_extra + vpand ydist_extra, ydist_extra, ytmp2 + vpshufb ydist_extra, ydist_extra, ytmp3 + vbroadcasti128 ytmp2, [bit_index] + vpshufb ydist_extra, ytmp2, ydist_extra + vpxor ytmp2, ytmp2, ytmp2 + vpcmpgtb ytmp2, ydist_extra, ytmp2 + vpsrld ytmp3, ytmp2, 8 + vpandn ytmp2, ytmp3, ytmp2 + vpsrld ytmp3, ytmp2, 16 + vpandn ytmp2, ytmp3, ytmp2 + vpsrld ytmp3, ytmp2, 24 + vpandn ytmp2, ytmp3, ytmp2 + vpbroadcastd ytmp3, [base_offset] + vpaddb ydist_extra, ytmp3 + vpand ydist_extra, ydist_extra, ytmp2 + vpsrlq ytmp2, ydist_extra, 32 + vpxor ytmp3, ytmp3, ytmp3 + vpsadbw ydist_extra, ydist_extra, ytmp3 + vpsadbw ytmp2, ytmp2, ytmp3 + vpsubd ydist_extra, ydist_extra, ytmp2 + vpsllq ytmp2, ytmp2, 32 + vpor ydist_extra, ydist_extra, ytmp2 + vpcmpgtb ytmp3, ydist_extra, ytmp3 + vpand ydist_extra, ydist_extra, ytmp3 + + vpsllvd ycode, yones, ydist_extra + vpsubd ycode, ycode, yones + vpcmpgtd ytmp2, ydists, yones + vpand ycode, ydists, ycode + vpand ycode, ycode, ytmp2 + vpsrlvd ydists, ydists, ydist_extra + vpslld ydist_extra, ydist_extra, 1 + vpaddd ydists, ydists, ydist_extra + vpslld ycode, ycode, EXTRA_BITS_OFFSET - DIST_OFFSET + vpaddd ydists, ydists, ycode + +;; Setup ydists for combining with ylens + vpslld ydists, ydists, DIST_OFFSET + +;; xor current data with lookback dist + vpxor ylens1, ylens1, ylookup + vpxor ylens2, ylens2, ylookup2 + +;; Compute match length + vpxor ytmp, ytmp, ytmp + vpcmpeqb ylens1, ylens1, ytmp + vpcmpeqb ylens2, ylens2, ytmp + vpbroadcastq yshift_finish, [shift_finish] + vpand ylens1, ylens1, yshift_finish + vpand ylens2, ylens2, yshift_finish + vpsadbw ylens1, ylens1, ytmp + vpsadbw ylens2, ylens2, ytmp + vmovdqu ydownconvert_qd, [downconvert_qd] + vpshufb ylens1, ylens1, ydownconvert_qd + vextracti128 ytmp %+ x, ylens1, 1 + vpor ylens1, ylens1, ytmp + vpshufb ylens2, ylens2, ydownconvert_qd + vextracti128 ytmp %+ x, ylens2, 1 + vpor ylens2, ylens2, ytmp + vinserti128 ylens1, ylens1, ylens2 %+ x, 1 + vpbroadcastd ytmp, [low_nibble] + vpsrld ylens2, ylens1, 4 + vpand ylens1, ylens1, ytmp + vbroadcasti128 ytmp, [match_cnt_perm] + vpbroadcastd ytmp2, [match_cnt_low_max] + vpshufb ylens1, ytmp, ylens1 + vpshufb ylens2, ytmp, ylens2 + vpcmpeqb ytmp, ylens1, ytmp2 + vpand ylens2, ylens2, ytmp + vpaddd ylens1, ylens1, ylens2 + +;; Zero out matches which should not be taken + vmovdqu yrot_left, [drot_left] + vpermd ylens2, yrot_left, ylens1 + vpermd ydists, yrot_left, ydists + + vpinsrd ytmp %+ x, ylens2 %+ x, prev_len %+ d, 0 + vinserti128 ylens2, ylens2, ytmp %+ x, 0 + + vpinsrd ytmp %+ x, ydists %+ x, prev_dist %+ d, 0 + vinserti128 ydists, ydists, ytmp %+ x, 0 + + vpbroadcastd ytmp, [shortest_matches] + vpcmpgtd ytmp, ylens2, ytmp + vpcmpgtd ytmp2, ylens1, ylens2 + + vpcmpeqd ytmp3, ytmp3, ytmp3 + vpxor ytmp, ytmp, ytmp3 + vpor ytmp, ytmp, ytmp2 + + vpandn ylens1, ytmp, ylens2 + +;; Update zdists to match ylens1 + vpbroadcastd ytmp2, [twofiftyfour] + vpaddd ydists, ydists, ylens1 + vpaddd ydists, ydists, ytmp2 + + vpbroadcastd ynull_syms, [null_dist_syms] + vpmovzxbd ytmp3, [f_i + file_start - 1] + vpaddd ytmp3, ynull_syms + vpand ytmp3, ytmp3, ytmp + vpandn ydists, ytmp, ydists + vpor ydists, ydists, ytmp3 + +;;Store ydists + vmovdqu [matches_next], ydists + add f_i, VECT_SIZE + +end_main: + sub f_i, f_i_orig + sub f_i, 1 + +%ifnidn f_i, rax + mov rax, f_i +%endif + FUNC_RESTORE + ret + +endproc_frame + +section .data +align 32 +;; 32 byte data +datas_perm2: + dd 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4 +drot_left: + dd 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6 +datas_shuf: + db 0x0, 0x1, 0x2, 0x3 + db 0x1, 0x2, 0x3, 0x4 + db 0x2, 0x3, 0x4, 0x5 + db 0x3, 0x4, 0x5, 0x6 + db 0x4, 0x5, 0x6, 0x7 + db 0x5, 0x6, 0x7, 0x8 + db 0x6, 0x7, 0x8, 0x9 + db 0x7, 0x8, 0x9, 0xa +qword_shuf: + db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + db 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8 + db 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9 + db 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa +increment: + dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 +downconvert_qd: + db 0x00, 0xff, 0xff, 0xff, 0x08, 0xff, 0xff, 0xff + db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + db 0x00, 0xff, 0xff, 0xff, 0x08, 0xff, 0xff, 0xff + +;; 16 byte data +match_cnt_perm: + db 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x3, 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x4 +bit_index: + db 0x0, 0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3 + db 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4 +nibble_order: + db 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7 + db 0x8, 0xa, 0x9, 0xb, 0xc, 0xe, 0xd, 0xf + +;; 8 byte data +shift_finish: + db 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 + +;; 4 byte data +ones: + dd 0x1 +%define PROD1 0xE84B +%define PROD2 0x97B1 +hash_prod: + dw PROD1, PROD2 +null_dist_syms: + dd LIT +twofiftyfour: + dd 0xfe +shortest_matches: + dd MIN_DEF_MATCH +upper_word: + dw 0x0000, 0xffff +low_word: + dw 0xffff, 0x0000 +low_nibble: + db 0x0f, 0x0f, 0x0f, 0x0f +match_cnt_low_max: + dd 0x4 +base_offset: + db -0x2, 0x2, 0x6, 0xa |