summary | refs | log | tree | commit | diff | stats
path: root/src/isa-l/igzip/igzip_set_long_icf_fg_06.asm
diff options
context:
space:
mode:
Diffstat (limited to 'src/isa-l/igzip/igzip_set_long_icf_fg_06.asm')
-rw-r--r-- src/isa-l/igzip/igzip_set_long_icf_fg_06.asm | 367
1 files changed, 367 insertions, 0 deletions
diff --git a/src/isa-l/igzip/igzip_set_long_icf_fg_06.asm b/src/isa-l/igzip/igzip_set_long_icf_fg_06.asm
new file mode 100644
index 000000000..39708eda7
--- /dev/null
+++ b/src/isa-l/igzip/igzip_set_long_icf_fg_06.asm
@@ -0,0 +1,367 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "lz0a_const.asm"
+%include "data_struct2.asm"
+%include "igzip_compare_types.asm"
+%define NEQ 4 ; not referenced in this file; presumably a compare predicate used by macros from the includes -- TODO confirm
+
+%ifdef HAVE_AS_KNOWS_AVX512
+%ifidn __OUTPUT_FORMAT__, win64
+%define arg1 rcx ; win64 output: the four arguments arrive in rcx, rdx, r8, r9
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define len rdi ; rdi and rsi are saved/restored by the win64 FUNC_SAVE/FUNC_RESTORE
+%define dist rsi
+%else
+%define arg1 rdi ; other output formats: the four arguments arrive in rdi, rsi, rdx, rcx
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define len r8
+%define dist r9
+%endif
+
+%define next_in arg1 ; current input position; advanced as lookup entries are processed
+%define end_processed arg2 ; passed in as an offset; turned into a pointer by adding next_in
+%define end_in arg3 ; passed in as an offset; turned into a pointer by adding next_in
+%define match_lookup arg4 ; pointer to the lookup entries, ICF_CODE_BYTES per input position
+%define match_in rax ; next_in - dist: the earlier data a full compare runs against
+%define match_offset r10 ; index of the first lane selected for a full compare
+%define tmp1 r11 ; scratch
+%define end_processed_orig r12 ; copy of end_processed taken before it is reduced by 15
+%define dist_code r13 ; not referenced in this file; shares r13 with tmp2
+%define tmp2 r13 ; scratch (r12 and r13 are preserved by FUNC_SAVE/FUNC_RESTORE)
+
+%define zmatch_lookup zmm0 ; lookup entries preloaded for the next pass of the loop
+%define zmatch_lookup2 zmm1 ; lookup entries handled in the current pass
+%define zlens zmm2 ; length fields (entry AND len_mask) / clamped lengths
+%define zdist_codes zmm3 ; entries shifted right by DIST_OFFSET
+%define zdist_extras zmm4 ; entries shifted right by EXTRA_BITS_OFFSET
+%define zdists zmm5 ; per-lane distance values for all 16 lanes
+%define zdists2 zmm6 ; upper 8 lanes of zdists, later the compressed distances
+%define zlens1 zmm7 ; gathered qwords for lanes 0-7, later per-lane length values
+%define zlens2 zmm8 ; gathered qwords for lanes 8-15, later stored length fields
+%define zlookup zmm9 ; input qwords compared against zlens1
+%define zlookup2 zmm10 ; input qwords compared against zlens2
+%define datas zmm11 ; 32 input bytes loaded from next_in - 8
+%define ztmp1 zmm12 ; scratch (also handed to compare_z)
+%define ztmp2 zmm13 ; scratch handed to compare_z
+%define zvect_size zmm16 ; zmm16-zmm31: values loaded once at function entry; this one is vect_size broadcast
+%define zmax_len zmm17 ; max_len broadcast
+%define ztwofiftyfour zmm18 ; 0xfe broadcast
+%define ztwofiftysix zmm19 ; 0x100 broadcast
+%define ztwosixtytwo zmm20 ; 0x106 broadcast
+%define znlen_mask zmm21 ; nlen_mask (0xfffffc00) broadcast
+%define zbswap zmm22 ; bswap_shuf repeated in every 128-bit lane
+%define zqword_shuf zmm23 ; first 64 bytes of qword_shuf
+%define zdatas_perm3 zmm24 ; datas_perm3 repeated in every 128-bit lane
+%define zdatas_perm2 zmm25 ; datas_perm2 repeated in every 128-bit lane
+%define zincrement zmm26 ; dword lane indices 0..15
+%define zdists_mask zmm27 ; dists_mask broadcast; loaded but not otherwise used in this file
+%define zdists_start zmm28 ; the 32-word dist_start table
+%define zlong_lens2 zmm29 ; long_len2 (0x7) broadcast
+%define zlong_lens zmm30 ; long_len (0x105) broadcast
+%define zlens_mask zmm31 ; len_mask broadcast
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define stack_size 8*16 + 4 * 8 + 8 ; 8 xmm slots + 4 GPR slots + 8 bytes (presumably padding that keeps rsp 16-byte aligned for vmovdqa -- confirm)
+%define func(x) proc_frame x ; win64: the function is opened with proc_frame and closed by endproc_frame
+%macro FUNC_SAVE 0
+ alloc_stack stack_size ; reserve the save area
+ vmovdqa [rsp + 0*16], xmm6 ; xmm6-xmm13 are stored because the function uses zmm6-zmm13
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ save_reg rsi, 8*16 + 0*8 ; rsi = dist, rdi = len on win64
+ save_reg rdi, 8*16 + 1*8
+ save_reg r12, 8*16 + 2*8 ; r12 = end_processed_orig
+ save_reg r13, 8*16 + 3*8 ; r13 = tmp2
+ end_prolog
+%endm
+; FUNC_RESTORE (win64): reload everything FUNC_SAVE stored, then release the save area.
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16]
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+
+ mov rsi, [rsp + 8*16 + 0*8]
+ mov rdi, [rsp + 8*16 + 1*8]
+ mov r12, [rsp + 8*16 + 2*8]
+ mov r13, [rsp + 8*16 + 3*8]
+ add rsp, stack_size ; undo alloc_stack
+%endm
+%else
+%define func(x) x: ; other output formats: the function is a plain label
+%macro FUNC_SAVE 0
+ push r12 ; r12 = end_processed_orig and r13 = tmp2 are overwritten by the function
+ push r13
+%endm
+; FUNC_RESTORE (non-win64): pop in the reverse order of the pushes in FUNC_SAVE.
+%macro FUNC_RESTORE 0
+ pop r13
+ pop r12
+%endm
+%endif
+%define VECT_SIZE 16 ; input positions (and lookup entries) handled per pass of the vector loop
+
+; Address the constants below (long_len, dist_start, increment, ...) relative
+; to RIP. Without this, displacement-only operands such as [long_len] are
+; assembled as absolute addresses, which need absolute relocations and do not
+; link into position-independent executables or shared libraries. The
+; directive is harmless if an include has already selected it.
+default rel
+
+global set_long_icf_fg_06 ; arguments: arg1 = next_in, arg2 = end_processed offset, arg3 = end_in offset, arg4 = match_lookup
+func(set_long_icf_fg_06)
+ FUNC_SAVE ; preserve the registers this function overwrites
+; Turn the two offset arguments into end pointers, then load the loop constants.
+ lea end_in, [next_in + arg3] ; end_in = next_in + arg3
+ add end_processed, next_in ; end_processed = next_in + arg2
+ mov end_processed_orig, end_processed
+ lea tmp1, [end_processed + LA_STATELESS]
+ cmp end_in, tmp1
+ cmovg end_in, tmp1 ; end_in = min(end_in, end_processed + LA_STATELESS), signed compare
+ sub end_processed, 15 ; the vector loop runs only while next_in + 15 < end_processed_orig
+ vpbroadcastd zlong_lens, [long_len] ; from here to the loop: load constants from the data section
+ vpbroadcastd zlong_lens2, [long_len2]
+ vpbroadcastd zlens_mask, [len_mask]
+ vmovdqu16 zdists_start, [dist_start]
+ vpbroadcastd zdists_mask, [dists_mask]
+ vmovdqu32 zincrement, [increment]
+ vbroadcasti64x2 zdatas_perm2, [datas_perm2]
+ vbroadcasti64x2 zdatas_perm3, [datas_perm3]
+ vmovdqu64 zqword_shuf, [qword_shuf]
+ vbroadcasti64x2 zbswap, [bswap_shuf]
+ vpbroadcastd znlen_mask, [nlen_mask]
+ vpbroadcastd zvect_size, [vect_size]
+ vpbroadcastd zmax_len, [max_len]
+ vpbroadcastd ztwofiftyfour, [twofiftyfour]
+ vpbroadcastd ztwofiftysix, [twofiftysix]
+ vpbroadcastd ztwosixtytwo, [twosixtytwo]
+ vmovdqu32 zmatch_lookup, [match_lookup] ; preload the first VECT_SIZE lookup entries
+
+.fill_loop: ; main loop: one vector of VECT_SIZE lookup entries per pass
+ vmovdqu32 zmatch_lookup2, zmatch_lookup ; current entries = the vector preloaded earlier
+ vmovdqu32 zmatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE] ; preload the following vector
+
+ cmp next_in, end_processed
+ jae .end_fill ; reached end_processed: tail / exit handling
+
+.finish_entry:
+ vpandd zlens, zmatch_lookup2, zlens_mask ; isolate the length field of each entry
+ vpcmpgtd k3, zlens, zlong_lens ; k3 = lanes whose length field exceeds long_len (0x105)
+
+;; Speculatively increment
+ add next_in, VECT_SIZE
+ add match_lookup, ICF_CODE_BYTES * VECT_SIZE
+
+ ktestw k3, k3
+ jz .fill_loop ; no lane selected: go on with the next vector
+; Rebuild a per-lane displacement from the two shifted fields of each entry.
+ vpsrld zdist_codes, zmatch_lookup2, DIST_OFFSET
+ vpmovdw zdists %+ y, zdist_codes ; Relies on perm working mod 32
+ vpermw zdists, zdists, zdists_start ; table lookup: dist_start[code mod 32] for each word
+ vpmovzxwd zdists, zdists %+ y ; widen the 16 words back to dwords
+
+ vpsrld zdist_extras, zmatch_lookup2, EXTRA_BITS_OFFSET
+ vpsubd zdist_extras, zincrement, zdist_extras ; lane index - extra
+
+ vpsubd zdists, zdist_extras, zdists ; zdists = lane index - extra - dist_start[code]
+ vextracti32x8 zdists2 %+ y, zdists, 1 ; upper 8 lanes
+ kmovb k6, k3 ; gather mask for lanes 0-7
+ kshiftrw k7, k3, 8 ; gather mask for lanes 8-15
+ vpgatherdq zlens1 {k6}, [next_in + zdists %+ y - 8] ; 8 bytes per selected lane; next_in was already advanced by VECT_SIZE
+ vpgatherdq zlens2 {k7}, [next_in + zdists2 %+ y - 8]
+; Build, per lane, the 8 input bytes that the gathered bytes are compared against.
+ vmovdqu8 datas %+ y, [next_in - 8]
+ vpermq zlookup, zdatas_perm2, datas ; every 128-bit lane = bytes 0-15 of datas
+ vpshufb zlookup, zlookup, zqword_shuf ; qword j = datas bytes j .. j+7
+ vpermq zlookup2, zdatas_perm3, datas ; every 128-bit lane = bytes 8-23 of datas
+ vpshufb zlookup2, zlookup2, zqword_shuf ; qword j = datas bytes 8+j .. 15+j
+
+ vpxorq zlens1, zlens1, zlookup ; zero bytes mark positions where both sides agree
+ vpxorq zlens2, zlens2, zlookup2
+
+ vpshufb zlens1, zlens1, zbswap ; byte-reverse each qword so its first byte becomes the most significant
+ vpshufb zlens2, zlens2, zbswap
+ vplzcntq zlens1, zlens1 ; leading zero bits = 8 * (leading equal bytes) + leftover bits
+ vplzcntq zlens2, zlens2
+ vpmovqd zlens1 %+ y, zlens1
+ vpmovqd zlens2 %+ y, zlens2
+ vinserti32x8 zlens1, zlens2 %+ y, 1 ; merge both halves into 16 dword lanes
+ vpsrld zlens1 {k3}{z}, zlens1, 3 ; bits -> bytes (0..8); lanes outside k3 become 0
+; Rewrite the selected entries: keep the bits above the low 10 and set the low field to 0x106 + equal bytes.
+ vpandd zmatch_lookup2 {k3}{z}, zmatch_lookup2, znlen_mask
+ vpaddd zmatch_lookup2 {k3}{z}, zmatch_lookup2, ztwosixtytwo
+ vpaddd zmatch_lookup2 {k3}{z}, zmatch_lookup2, zlens1
+
+ vmovdqu32 [match_lookup - ICF_CODE_BYTES * VECT_SIZE] {k3}, zmatch_lookup2 ; store into the vector just processed
+
+ vpcmpgtd k3, zlens1, zlong_lens2 ; k3 = lanes where all 8 compared bytes were equal
+ ktestw k3, k3
+ jz .fill_loop
+; At least one lane matched all 8 bytes: run a full compare for the first such lane.
+ vpsubd zdists, zincrement, zdists ; remove the lane-index bias: extra + dist_start[code]
+
+ vpcompressd zdists2 {k3}, zdists ; lane 0 = value of the first selected lane
+ vpcompressd zmatch_lookup2 {k3}, zmatch_lookup2
+ kmovq match_offset, k3
+ tzcnt match_offset, match_offset ; index of the first selected lane
+
+ vmovd dist %+ d, zdists2 %+ x
+ lea next_in, [next_in + match_offset - VECT_SIZE] ; rewind to the input position of that lane
+ lea match_lookup, [match_lookup + ICF_CODE_BYTES * (match_offset - VECT_SIZE)]
+ mov match_in, next_in
+ sub match_in, dist ; match_in = next_in - dist
+
+ mov len, 16
+ mov tmp2, end_in
+ sub tmp2, next_in ; tmp2 = bytes from next_in to end_in
+; compare_z comes from an include; presumably it extends len over the equal bytes at next_in/match_in, limited by tmp2 -- TODO confirm
+ compare_z next_in, match_in, len, tmp2, tmp1, k3, ztmp1, ztmp2
+
+ vpbroadcastd zlens1, len %+ d
+ vpsubd zlens1, zlens1, zincrement
+ vpaddd zlens1, zlens1, ztwofiftyfour ; lane i = len - i + 0xfe
+
+ mov tmp2, end_processed
+ sub tmp2, next_in
+ cmp len, tmp2
+ cmovg len, tmp2 ; len = min(len, end_processed - next_in), signed compare
+
+ add next_in, len ; skip ahead by len positions
+ lea match_lookup, [match_lookup + ICF_CODE_BYTES * len]
+ vmovdqu32 zmatch_lookup, [match_lookup] ; preload the entries at the new position
+
+ vpbroadcastd zmatch_lookup2, zmatch_lookup2 %+ x ; replicate the selected entry to all lanes
+ vpandd zmatch_lookup2, zmatch_lookup2, znlen_mask ; keep only its bits above the low 10
+
+ neg len ; negative index: [match_lookup + ICF_CODE_BYTES * len] walks the skipped entries
+
+.update_match_lookup:
+ vpandd zlens2, zlens_mask, [match_lookup + ICF_CODE_BYTES * len] ; stored length fields of 16 entries
+ vpcmpgtd k3, zlens1, zlens2 ; new value larger than the stored one ...
+ vpcmpgtd k4, zlens1, ztwofiftysix ; ... and larger than 0x100
+ kandw k3, k3, k4
+
+ vpcmpgtd k4, zlens1, zmax_len
+ vmovdqu32 zlens, zlens1
+ vmovdqu32 zlens {k4}, zmax_len ; zlens = min(zlens1, max_len)
+
+ vpaddd zlens2 {k3}{z}, zlens, zmatch_lookup2 ; new entry = clamped value + upper bits of the selected entry
+
+ vmovdqu32 [match_lookup + ICF_CODE_BYTES * len] {k3}, zlens2 ; only the k3 lanes are written
+
+ knotw k3, k3
+ ktestw k3, k3
+ jnz .fill_loop ; at least one lane was left untouched: stop updating
+
+ add len, VECT_SIZE ; move on to the next 16 entries
+ vpsubd zlens1, zlens1, zvect_size ; their values are 16 smaller
+
+ jmp .update_match_lookup
+
+.end_fill:
+ mov end_processed, end_processed_orig ; drop the 15-position margin
+ cmp next_in, end_processed
+ jge .finish ; nothing left to process
+
+ mov tmp1, end_processed
+ sub tmp1, next_in ; positions remaining
+ vpbroadcastd ztmp1, tmp1 %+ d
+ vpcmpd k3, ztmp1, zincrement, 6 ; predicate 6 (signed greater-than): k3 = lanes with index < remaining
+ vmovdqu32 zmatch_lookup2 {k3}{z}, zmatch_lookup2 ; zero the entries past the end
+ jmp .finish_entry
+
+.finish:
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+align 64
+;; 64 byte data
+dist_start: ; 32 words indexed by distance code via vpermw; the 30 non-zero values equal the DEFLATE base distances
+ dw 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000d
+ dw 0x0011, 0x0019, 0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00c1
+ dw 0x0101, 0x0181, 0x0201, 0x0301, 0x0401, 0x0601, 0x0801, 0x0c01
+ dw 0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001, 0x0000, 0x0000
+qword_shuf: ; vpshufb pattern: row j takes source bytes j .. j+7; nine rows are stored, the 64-byte load uses the first eight
+ db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ db 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8
+ db 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9
+ db 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa
+ db 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb
+ db 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc
+ db 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd
+ db 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe
+ db 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+
+;; 64 byte data: the 16 dword lane indices (loaded whole into zincrement)
+increment:
+ dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ dd 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+;; 16 byte data (each is broadcast to every 128-bit lane of a zmm register)
+datas_perm2: ; vpermq indices selecting qwords 0 and 1
+ dq 0x0, 0x1
+datas_perm3: ; vpermq indices selecting qwords 1 and 2
+ dq 0x1, 0x2
+bswap_shuf: ; vpshufb pattern reversing the bytes of each qword
+ db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+ db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+
+;; 4 byte data
+len_mask:
+ dd LIT_LEN_MASK ; value comes from an include
+dists_mask:
+ dd LIT_DIST_MASK ; broadcast into zdists_mask, which is not used further in this file
+long_len:
+ dd 0x105 ; entries whose length field is larger get re-examined
+long_len2:
+ dd 0x7 ; lanes with more equal bytes than this (i.e. 8) trigger the full compare
+max_len:
+ dd 0xfe + 0x102 ; cap applied to the values written by .update_match_lookup
+vect_size:
+ dd VECT_SIZE
+twofiftyfour:
+ dd 0xfe ; added to (len - lane index) after the full compare
+twofiftysix:
+ dd 0x100 ; a new value must exceed this to be written
+twosixtytwo:
+ dd 0x106 ; base added to the number of equal bytes
+nlen_mask:
+ dd 0xfffffc00 ; clears the low 10 bits of an entry
+%endif