summaryrefslogtreecommitdiffstats
path: root/src/isa-l/igzip/igzip_set_long_icf_fg_04.asm
diff options
context:
space:
mode:
Diffstat (limited to 'src/isa-l/igzip/igzip_set_long_icf_fg_04.asm')
-rw-r--r--  src/isa-l/igzip/igzip_set_long_icf_fg_04.asm  295
1 file changed, 295 insertions, 0 deletions
diff --git a/src/isa-l/igzip/igzip_set_long_icf_fg_04.asm b/src/isa-l/igzip/igzip_set_long_icf_fg_04.asm
new file mode 100644
index 000000000..f5c2b9803
--- /dev/null
+++ b/src/isa-l/igzip/igzip_set_long_icf_fg_04.asm
@@ -0,0 +1,295 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "lz0a_const.asm"
+%include "data_struct2.asm"
+%include "igzip_compare_types.asm"
+%define NEQ 4
+
+default rel
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define len rdi
+%define tmp2 rdi
+%define dist rsi
+%else
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define len r8
+%define tmp2 r8
+%define dist r9
+%endif
+
+%define next_in arg1
+%define end_processed arg2
+%define end_in arg3
+%define match_lookup arg4
+%define match_in rax
+%define match_offset r10
+%define tmp1 r11
+%define end_processed_orig r12
+%define dist_code r13
+%define tmp3 r13
+
+%define ymatch_lookup ymm0
+%define ymatch_lookup2 ymm1
+%define ylens ymm2
+%define ycmp2 ymm3
+%define ylens1 ymm4
+%define ylens2 ymm5
+%define ycmp ymm6
+%define ytmp1 ymm7
+%define ytmp2 ymm8
+%define yvect_size ymm9
+%define ymax_len ymm10
+%define ytwofiftysix ymm11
+%define ynlen_mask ymm12
+%define ydists_mask ymm13
+%define ylong_lens ymm14
+%define ylens_mask ymm15
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define stack_size 10*16 + 4 * 8 + 8
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
+ save_reg rsi, 10*16 + 0*8
+ save_reg rdi, 10*16 + 1*8
+ save_reg r12, 10*16 + 2*8
+ save_reg r13, 10*16 + 3*8
+ end_prolog
+%endm
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16]
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ vmovdqa xmm14, [rsp + 8*16]
+ vmovdqa xmm15, [rsp + 9*16]
+
+ mov rsi, [rsp + 10*16 + 0*8]
+ mov rdi, [rsp + 10*16 + 1*8]
+ mov r12, [rsp + 10*16 + 2*8]
+ mov r13, [rsp + 10*16 + 3*8]
+ add rsp, stack_size
+%endm
+%else
+%define func(x) x:
+%macro FUNC_SAVE 0
+ push r12
+ push r13
+%endm
+
+%macro FUNC_RESTORE 0
+ pop r13
+ pop r12
+%endm
+%endif
+%define VECT_SIZE 8
+
+global set_long_icf_fg_04
+func(set_long_icf_fg_04)
+ FUNC_SAVE
+
+ lea end_in, [next_in + arg3]
+ add end_processed, next_in
+ mov end_processed_orig, end_processed
+ lea tmp1, [end_processed + LA_STATELESS]
+ cmp end_in, tmp1
+ cmovg end_in, tmp1
+ sub end_processed, VECT_SIZE - 1
+ vmovdqu ylong_lens, [long_len]
+ vmovdqu ylens_mask, [len_mask]
+ vmovdqu ydists_mask, [dists_mask]
+ vmovdqu ynlen_mask, [nlen_mask]
+ vmovdqu yvect_size, [vect_size]
+ vmovdqu ymax_len, [max_len]
+ vmovdqu ytwofiftysix, [twofiftysix]
+ vmovdqu ymatch_lookup, [match_lookup]
+
+.fill_loop: ; Tahiti is a magical place
+ vmovdqu ymatch_lookup2, ymatch_lookup
+ vmovdqu ymatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE]
+
+ cmp next_in, end_processed
+ jae .end_fill
+
+.finish_entry:
+ vpand ylens, ymatch_lookup2, ylens_mask
+ vpcmpgtd ycmp, ylens, ylong_lens
+ vpmovmskb tmp1, ycmp
+
+;; Speculatively increment
+ add next_in, VECT_SIZE
+ add match_lookup, ICF_CODE_BYTES * VECT_SIZE
+
+ test tmp1, tmp1
+ jz .fill_loop
+
+ tzcnt match_offset, tmp1
+ shr match_offset, 2
+
+ lea next_in, [next_in + match_offset - VECT_SIZE]
+ lea match_lookup, [match_lookup + ICF_CODE_BYTES * (match_offset - VECT_SIZE)]
+ mov dist %+ d, [match_lookup]
+ vmovd ymatch_lookup2 %+ x, dist %+ d
+
+ mov tmp1, dist
+ shr dist, DIST_OFFSET
+ and dist, LIT_DIST_MASK
+ shr tmp1, EXTRA_BITS_OFFSET
+ lea tmp2, [dist_start]
+ mov dist %+ w, [tmp2 + 2 * dist]
+ add dist, tmp1
+
+ mov match_in, next_in
+ sub match_in, dist
+
+ mov len, 8
+ mov tmp3, end_in
+ sub tmp3, next_in
+
+ compare_y next_in, match_in, len, tmp3, tmp1, ytmp1, ytmp2
+
+ vmovd ylens1 %+ x, len %+ d
+ vpbroadcastd ylens1, ylens1 %+ x
+ vpsubd ylens1, ylens1, [increment]
+ vpaddd ylens1, ylens1, [twofiftyfour]
+
+ mov tmp3, end_processed
+ sub tmp3, next_in
+ cmp len, tmp3
+ cmovg len, tmp3
+
+ add next_in, len
+ lea match_lookup, [match_lookup + ICF_CODE_BYTES * len]
+ vmovdqu ymatch_lookup, [match_lookup]
+
+ vpbroadcastd ymatch_lookup2, ymatch_lookup2 %+ x
+ vpand ymatch_lookup2, ymatch_lookup2, ynlen_mask
+
+ neg len
+
+.update_match_lookup:
+ vpand ylens2, ylens_mask, [match_lookup + ICF_CODE_BYTES * len]
+
+ vpcmpgtd ycmp, ylens1, ylens2
+ vpcmpgtd ytmp1, ylens1, ytwofiftysix
+ vpand ycmp, ycmp, ytmp1
+ vpmovmskb tmp1, ycmp
+
+ vpcmpgtd ycmp2, ylens1, ymax_len
+ vpandn ylens, ycmp2, ylens1
+ vpand ycmp2, ymax_len, ycmp2
+ vpor ylens, ycmp2
+
+ vpaddd ylens2, ylens, ymatch_lookup2
+ vpand ylens2, ylens2, ycmp
+
+ vpmaskmovd [match_lookup + ICF_CODE_BYTES * len], ycmp, ylens2
+
+ test tmp1 %+ d, tmp1 %+ d
+ jz .fill_loop
+
+ add len, VECT_SIZE
+ vpsubd ylens1, ylens1, yvect_size
+
+ jmp .update_match_lookup
+
+.end_fill:
+ mov end_processed, end_processed_orig
+ cmp next_in, end_processed
+ jge .finish
+
+ mov tmp1, end_processed
+ sub tmp1, next_in
+ vmovd ytmp1 %+ x, tmp1 %+ d
+ vpbroadcastd ytmp1, ytmp1 %+ x
+ vpcmpgtd ytmp1, ytmp1, [increment]
+ vpand ymatch_lookup2, ymatch_lookup2, ytmp1
+ jmp .finish_entry
+
+.finish:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+align 64
+dist_start:
+ dw 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000d
+ dw 0x0011, 0x0019, 0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00c1
+ dw 0x0101, 0x0181, 0x0201, 0x0301, 0x0401, 0x0601, 0x0801, 0x0c01
+ dw 0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001, 0x0000, 0x0000
+len_mask:
+ dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
+ dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
+dists_mask:
+ dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
+ dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
+long_len:
+ dd 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105
+increment:
+ dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+vect_size:
+ dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
+ dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
+twofiftyfour:
+ dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
+twofiftysix:
+ dd 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100
+nlen_mask:
+ dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
+ dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
+max_len:
+ dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102
+ dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102