summaryrefslogtreecommitdiffstats
path: root/src/isa-l/igzip/encode_df_asm.asm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/isa-l/igzip/encode_df_asm.asm527
1 files changed, 527 insertions, 0 deletions
diff --git a/src/isa-l/igzip/encode_df_asm.asm b/src/isa-l/igzip/encode_df_asm.asm
new file mode 100644
index 00000000..62ada29c
--- /dev/null
+++ b/src/isa-l/igzip/encode_df_asm.asm
@@ -0,0 +1,527 @@
+%include "reg_sizes.asm"
+%include "lz0a_const.asm"
+%include "data_struct2.asm"
+%include "stdmac.asm"
+
+; tree entry is 4 bytes:
+; lit/len tree (513 entries)
+; | 3 | 2 | 1 | 0 |
+; | len | code |
+;
+; dist tree
+; | 3 | 2 | 1 | 0 |
+; |eblen:codlen| code |
+
+; token format:
+; DIST_OFFSET:0 : lit/len
+; 31:(DIST_OFFSET + 5) : dist Extra Bits
+; (DIST_OFFSET + 5):DIST_OFFSET : dist code
+; lit/len: 0-256 (literal)
+; 257-512 (len + 254)
+
+; Returns the final token pointer, which equals token_end on success.
+; The symbol exported below is encode_deflate_icf_<ARCH>:
+; uint32_t* encode_deflate_icf_<ARCH>(uint32_t *token_start, uint32_t *token_end,
+; BitBuf *out_buf, uint32_t *trees);
+
+; Map the four integer arguments and the scratch registers that depend on
+; the calling convention. Only the win64 output format selects the
+; Microsoft x64 mapping; every other format uses the System V one.
+%ifnidn __OUTPUT_FORMAT__, win64
+; System V AMD64 (Linux): arguments arrive in rdi, rsi, rdx, rcx
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define ptr rdi
+%define hufftables r11
+%define sym r9
+%define dsym r8
+%else
+; Microsoft x64: arguments arrive in rcx, rdx, r8, r9
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define ptr r11
+%define hufftables r9
+%define sym rsi
+%define dsym rdi
+%endif
+
+; ---- General-purpose register roles (shared by both ABIs) ----
+; in_buf_end : end of the token buffer (second argument)
+; bitbuf : BitBuf pointer on entry; the same register is reused as the
+; output write pointer (out_buf) once the BitBuf fields are loaded
+%define in_buf_end arg2
+%define bitbuf arg3
+%define out_buf bitbuf
+; bit_count is rcx
+%define bits rax
+%define data r12
+%define tmp rbx
+%define len dsym
+%define tmp2 r10
+%define end_ptr rbp
+
+; Field masks applied to a token to extract the lit/len and dist symbols
+%define LIT_MASK ((0x1 << LIT_LEN_BIT_COUNT) - 1)
+%define DIST_MASK ((0x1 << DIST_LIT_BIT_COUNT) - 1)
+
+; ---- Vector register roles ----
+%define codes1 ymm1
+%define code_lens1 ymm2
+%define codes2 ymm3
+%define code_lens2 ymm4
+%define codes3 ymm5
+%define code_lens3 ymm6
+; codes4 and syms deliberately share ymm7
+%define codes4 ymm7
+%define syms ymm7
+
+; code_lens4 and dsyms deliberately share ymm8
+%define code_lens4 ymm8
+%define dsyms ymm8
+
+%define ytmp ymm9
+%define codes_lookup1 ymm10
+%define codes_lookup2 ymm11
+%define datas ymm12
+%define ybits ymm13
+%define ybits_count ymm14
+%define yoffset_mask ymm15
+
+; ---- Vector loop geometry ----
+; VECTOR_SIZE is one ymm register (32 bytes, i.e. eight 4-byte tokens).
+; VECTOR_SLOP is parenthesised so that it expands safely inside larger
+; expressions, matching VECTOR_LOOP_PROCESSED.
+%define VECTOR_SIZE 0x20
+%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)
+%define VECTOR_SLOP (0x20 - 8)
+
+; Local stack frame, laid out upwards from rsp:
+; [gpr_save_mem_offset] six 8-byte slots for general-purpose registers
+; [xmm_save_mem_offset] 160 bytes, sized for ten 16-byte XMM registers
+; [bitbuf_mem_offset] one qword holding the BitBuf pointer
+gpr_save_mem_offset equ 0
+gpr_save_mem_size equ 6 * 8
+xmm_save_mem_offset equ gpr_save_mem_offset + gpr_save_mem_size
+xmm_save_mem_size equ 16 * 10
+bitbuf_mem_offset equ xmm_save_mem_offset + xmm_save_mem_size
+bitbuf_mem_size equ 8
+stack_size equ bitbuf_mem_offset + bitbuf_mem_size
+
+
+; FUNC_SAVE: allocate the local frame (stack_size bytes) and save the
+; callee-saved registers this routine modifies: rbx, rbp and r12 on every
+; target, plus rsi, rdi and xmm6-xmm15 when assembling for win64.
+;
+; Each XMM register occupies its own 16-byte slot. MOVDQU moves 16 bytes, so
+; a smaller stride would make neighbouring stores overlap and the upper half
+; of every saved register would be overwritten by the next store.
+; FUNC_RESTORE must use exactly the same offsets.
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ mov [rsp + gpr_save_mem_offset + 0*8], rbx
+ mov [rsp + gpr_save_mem_offset + 1*8], rbp
+ mov [rsp + gpr_save_mem_offset + 2*8], r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + gpr_save_mem_offset + 3*8], rsi
+ mov [rsp + gpr_save_mem_offset + 4*8], rdi
+
+ MOVDQU [rsp + xmm_save_mem_offset + 0*16], xmm6
+ MOVDQU [rsp + xmm_save_mem_offset + 1*16], xmm7
+ MOVDQU [rsp + xmm_save_mem_offset + 2*16], xmm8
+ MOVDQU [rsp + xmm_save_mem_offset + 3*16], xmm9
+ MOVDQU [rsp + xmm_save_mem_offset + 4*16], xmm10
+ MOVDQU [rsp + xmm_save_mem_offset + 5*16], xmm11
+ MOVDQU [rsp + xmm_save_mem_offset + 6*16], xmm12
+ MOVDQU [rsp + xmm_save_mem_offset + 7*16], xmm13
+ MOVDQU [rsp + xmm_save_mem_offset + 8*16], xmm14
+ MOVDQU [rsp + xmm_save_mem_offset + 9*16], xmm15
+%endif
+
+%endmacro
+
+; FUNC_RESTORE: reload every register saved by FUNC_SAVE from the same
+; slots (16-byte stride for the XMM registers) and release the local frame.
+%macro FUNC_RESTORE 0
+ mov rbx, [rsp + gpr_save_mem_offset + 0*8]
+ mov rbp, [rsp + gpr_save_mem_offset + 1*8]
+ mov r12, [rsp + gpr_save_mem_offset + 2*8]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rsi, [rsp + gpr_save_mem_offset + 3*8]
+ mov rdi, [rsp + gpr_save_mem_offset + 4*8]
+
+ MOVDQU xmm6, [rsp + xmm_save_mem_offset + 0*16]
+ MOVDQU xmm7, [rsp + xmm_save_mem_offset + 1*16]
+ MOVDQU xmm8, [rsp + xmm_save_mem_offset + 2*16]
+ MOVDQU xmm9, [rsp + xmm_save_mem_offset + 3*16]
+ MOVDQU xmm10, [rsp + xmm_save_mem_offset + 4*16]
+ MOVDQU xmm11, [rsp + xmm_save_mem_offset + 5*16]
+ MOVDQU xmm12, [rsp + xmm_save_mem_offset + 6*16]
+ MOVDQU xmm13, [rsp + xmm_save_mem_offset + 7*16]
+ MOVDQU xmm14, [rsp + xmm_save_mem_offset + 8*16]
+ MOVDQU xmm15, [rsp + xmm_save_mem_offset + 9*16]
+%endif
+ add rsp, stack_size
+
+%endmacro
+
+global encode_deflate_icf_ %+ ARCH ; exported entry point; ARCH is expected to be defined by the build or an including file
+encode_deflate_icf_ %+ ARCH: ; in: arg1 = first token, arg2 = token end, arg3 = BitBuf *, arg4 = code tables; out: rax = first token that was not encoded
+ FUNC_SAVE ; reserve the local frame and save callee-saved registers
+
+%ifnidn ptr, arg1
+ mov ptr, arg1 ; ptr walks the 4-byte tokens
+%endif
+%ifnidn hufftables, arg4
+ mov hufftables, arg4 ; move the table pointer out of arg4 where the two registers differ
+%endif
+
+ mov [rsp + bitbuf_mem_offset], bitbuf ; keep the BitBuf pointer; its register becomes out_buf below
+ mov bits, [bitbuf + _m_bits] ; pending bits not yet written out
+ mov ecx, [bitbuf + _m_bit_count] ; number of pending bits (rcx is the bit count throughout)
+ mov end_ptr, [bitbuf + _m_out_end]
+ mov out_buf, [bitbuf + _m_out_buf] ; clobbers bitbuf
+
+ sub end_ptr, VECTOR_SLOP ; the vector loop compares out_buf against m_out_end - VECTOR_SLOP
+ sub in_buf_end, VECTOR_LOOP_PROCESSED ; the vector loop reads the current vector and the one after it
+ cmp ptr, in_buf_end
+ jge .finish ; not enough tokens for the vector loop: use the scalar loop
+ ;; Prime the pipeline: gather the table entries for the first 8 tokens
+ vpcmpeqq ytmp, ytmp, ytmp ; all-ones gather mask
+ vmovdqu datas, [ptr]
+ vpand syms, datas, [lit_mask]
+ vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp
+
+ vpcmpeqq ytmp, ytmp, ytmp ; re-arm the mask (vpgatherdd zeroes it)
+ vpsrld dsyms, datas, DIST_OFFSET
+ vpand dsyms, dsyms, [dist_mask]
+ vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp
+
+ vmovq ybits %+ x, bits ; carry the pending bits into the vector path
+ vmovq ybits_count %+ x, rcx ; and their count
+ vmovdqa yoffset_mask, [offset_mask]
+
+.main_loop:
+ ;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
+ vpsrld code_lens1, codes_lookup1, 24
+ vpand codes1, codes_lookup1, [lit_icr_mask]
+
+ ;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
+ ;; and code_lens3 the extra bit counts
+ vpblendw codes2, ybits, codes_lookup2, 0x55 ;Bits 8 and above of ybits are 0
+ vpsrld code_lens2, codes_lookup2, 24
+ vpsrld code_lens3, codes_lookup2, 16
+ vpand code_lens3, [eb_icr_mask]
+
+ ;; Set codes3 to contain the extra bits
+ vpsrld codes3, datas, EXTRA_BITS_OFFSET
+
+ cmp out_buf, end_ptr
+ ja .main_loop_exit ; output is within the slop of its end: leave before ptr is advanced
+
+ ;; Start code lookups for next iteration
+ add ptr, VECTOR_SIZE
+ vpcmpeqq ytmp, ytmp, ytmp
+ vmovdqu datas, [ptr]
+ vpand syms, datas, [lit_mask]
+ vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp
+
+ vpcmpeqq ytmp, ytmp, ytmp
+ vpsrld dsyms, datas, DIST_OFFSET
+ vpand dsyms, dsyms, [dist_mask]
+ vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp
+
+ ;; Merge dist code with extra bits
+ vpsllvd codes3, codes3, code_lens2
+ vpxor codes2, codes2, codes3
+ vpaddd code_lens2, code_lens2, code_lens3
+
+ ;; Check for long codes
+ vpaddd code_lens3, code_lens1, code_lens2 ; total bits per token (lit/len + dist + extra)
+ vpcmpgtd ytmp, code_lens3, [max_write_d] ; compare against the per-lane limits
+ vptest ytmp, ytmp
+ jnz .long_codes ; some lane exceeds its limit: encode this vector with the scalar fallback
+
+ ;; Merge dist and len codes
+ vpsllvd codes2, codes2, code_lens1
+ vpxor codes1, codes1, codes2
+
+ ;; Split buffer data into qwords, ytmp is 0 after last branch
+ vpblendd codes3, ytmp, codes1, 0x55
+ vpsrlq codes1, codes1, 32
+ vpsrlq code_lens1, code_lens3, 32
+ vpblendd code_lens3, ytmp, code_lens3, 0x55
+
+ ;; Merge bitbuf bits
+ vpsllvq codes3, codes3, ybits_count
+ vpxor codes3, codes3, ybits
+ vpaddq code_lens3, code_lens3, ybits_count
+
+ ;; Merge two symbols into qwords
+ vpsllvq codes1, codes1, code_lens3
+ vpxor codes1, codes1, codes3
+ vpaddq code_lens1, code_lens1, code_lens3
+
+ ;; Split buffer data into dqwords, ytmp is 0 after last branch
+ vpblendd codes2, ytmp, codes1, 0x33
+ vpblendd code_lens2, ytmp, code_lens1, 0x33
+ vpsrldq codes1, 8
+ vpsrldq code_lens1, 8
+
+ ;; Merge two qwords into dqwords
+ vmovdqa ytmp, [q_64]
+ vpsubq code_lens3, ytmp, code_lens2 ; 64 - (bit length of the low qword)
+ vpsrlvq codes3, codes1, code_lens3 ; bits of the high qword that spill past bit 63
+ vpslldq codes3, codes3, 8
+
+ vpsllvq codes1, codes1, code_lens2
+
+ vpxor codes1, codes1, codes3
+ vpxor codes1, codes1, codes2
+ vpaddq code_lens1, code_lens1, code_lens2
+
+ vmovq tmp, code_lens1 %+ x ;Number of bytes
+ shr tmp, 3
+ vpand ybits_count, code_lens1, yoffset_mask ;Extra bits
+
+ ;; bit shift upper dqword combined bits to line up with lower dqword
+ vextracti128 codes2 %+ x, codes1, 1
+ vextracti128 code_lens2 %+ x, code_lens1, 1
+
+ vpbroadcastq ybits_count, ybits_count %+ x
+ vpsrldq codes3, codes2, 1
+ vpsllvq codes2, codes2, ybits_count
+ vpsllvq codes3, codes3, ybits_count
+ vpslldq codes3, codes3, 1
+ vpor codes2, codes2, codes3
+
+ ; Write out lower dqword of combined bits
+ vmovdqu [out_buf], codes1
+ movzx bits, byte [out_buf + tmp] ; reload the partially filled byte that follows the whole bytes
+ vmovq codes1 %+ x, bits
+ vpaddq code_lens1, code_lens1, code_lens2
+
+ vmovq tmp2, code_lens1 %+ x ;Number of bytes
+ shr tmp2, 3
+ vpand ybits_count, code_lens1, yoffset_mask ;Extra bits
+
+ ; Write out upper dqword of combined bits
+ vpor codes1 %+ x, codes1 %+ x, codes2 %+ x
+ vmovdqu [out_buf + tmp], codes1 %+ x
+ add out_buf, tmp2
+ movzx bits, byte [out_buf] ; the trailing partial byte becomes the new pending bits
+ vmovq ybits %+ x, bits
+
+ cmp ptr, in_buf_end
+ jbe .main_loop ; continue while the current and the next vector are both in range
+
+.main_loop_exit:
+ vmovq rcx, ybits_count %+ x ; hand the bit state back to the scalar registers
+ vmovq bits, ybits %+ x
+ jmp .finish
+
+.long_codes: ; scalar fallback for one vector whose combined codes exceed max_write_d
+ add end_ptr, VECTOR_SLOP ; this path checks against the unadjusted m_out_end
+ sub ptr, VECTOR_SIZE ; undo the advance made for the next-iteration lookups
+
+ vpxor ytmp, ytmp, ytmp
+ vpblendd codes3, ytmp, codes1, 0x55
+ vpblendd code_lens3, ytmp, code_lens1, 0x55
+ vpblendd codes4, ytmp, codes2, 0x55
+
+ vpsllvq codes4, codes4, code_lens3
+ vpxor codes3, codes3, codes4
+ vpaddd code_lens3, code_lens1, code_lens2
+
+ vpsrlq codes1, codes1, 32
+ vpsrlq code_lens1, code_lens1, 32
+ vpsrlq codes2, codes2, 32
+
+ vpsllvq codes2, codes2, code_lens1
+ vpxor codes1, codes1, codes2
+
+ vpsrlq code_lens1, code_lens3, 32
+ vpblendd code_lens3, ytmp, code_lens3, 0x55
+
+ ;; Merge bitbuf bits
+ vpsllvq codes3, codes3, ybits_count
+ vpxor codes3, codes3, ybits
+ vpaddq code_lens3, code_lens3, ybits_count
+ vpaddq code_lens1, code_lens1, code_lens3
+
+ xor bits, bits ; pending bits were merged into codes3 above, so restart from empty
+ xor rcx, rcx
+ vpsubq code_lens1, code_lens1, code_lens3 ; cancels the vpaddq above; code_lens1 is left unchanged
+%rep 2 ; pass 1 handles the low 128-bit halves, pass 2 the high halves
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ cmp out_buf, end_ptr
+ ja .overflow
+ ;; insert combined lit/len + dist code: low qword of codes3 (1st token of this half)
+ vmovq sym, codes3 %+ x
+ vmovq tmp2, code_lens3 %+ x
+ SHLX sym, sym, rcx
+ or bits, sym
+ add rcx, tmp2
+
+ ; empty bits: store, advance by whole bytes, keep the remainder in bits/rcx
+ mov [out_buf], bits
+ mov tmp, rcx
+ shr tmp, 3 ; byte count
+ add out_buf, tmp
+ mov tmp, rcx
+ and rcx, ~7
+ SHRX bits, bits, rcx
+ mov rcx, tmp
+ and rcx, 7
+ add ptr, 4
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ cmp out_buf, end_ptr
+ ja .overflow
+ ;; insert combined lit/len + dist code: low qword of codes1 (2nd token of this half)
+ vmovq sym, codes1 %+ x
+ vmovq tmp2, code_lens1 %+ x
+ SHLX sym, sym, rcx
+ or bits, sym
+ add rcx, tmp2
+
+ ; empty bits: store, advance by whole bytes, keep the remainder in bits/rcx
+ mov [out_buf], bits
+ mov tmp, rcx
+ shr tmp, 3 ; byte count
+ add out_buf, tmp
+ mov tmp, rcx
+ and rcx, ~7
+ SHRX bits, bits, rcx
+ mov rcx, tmp
+ and rcx, 7
+ add ptr, 4
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ cmp out_buf, end_ptr
+ ja .overflow
+ ;; insert combined lit/len + dist code: high qword of codes3 (3rd token of this half)
+ vpextrq sym, codes3 %+ x, 1
+ vpextrq tmp2, code_lens3 %+ x, 1
+ SHLX sym, sym, rcx
+ or bits, sym
+ add rcx, tmp2
+
+ ; empty bits: store, advance by whole bytes, keep the remainder in bits/rcx
+ mov [out_buf], bits
+ mov tmp, rcx
+ shr tmp, 3 ; byte count
+ add out_buf, tmp
+ mov tmp, rcx
+ and rcx, ~7
+ SHRX bits, bits, rcx
+ mov rcx, tmp
+ and rcx, 7
+ add ptr, 4
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ cmp out_buf, end_ptr
+ ja .overflow
+ ;; insert combined lit/len + dist code: high qword of codes1 (4th token of this half)
+ vpextrq sym, codes1 %+ x, 1
+ vpextrq tmp2, code_lens1 %+ x, 1
+ SHLX sym, sym, rcx
+ or bits, sym
+ add rcx, tmp2
+
+ ; empty bits: store, advance by whole bytes, keep the remainder in bits/rcx
+ mov [out_buf], bits
+ mov tmp, rcx
+ shr tmp, 3 ; byte count
+ add out_buf, tmp
+ mov tmp, rcx
+ and rcx, ~7
+ SHRX bits, bits, rcx
+ mov rcx, tmp
+ and rcx, 7
+ add ptr, 4
+
+ vextracti128 codes3 %+ x, codes3, 1 ; bring the high halves down for the second pass
+ vextracti128 code_lens3 %+ x, code_lens3, 1
+ vextracti128 codes1 %+ x, codes1, 1
+ vextracti128 code_lens1 %+ x, code_lens1, 1
+%endrep
+ sub end_ptr, VECTOR_SLOP ; back to the slop-adjusted limit used by the vector loop
+
+ vmovq ybits %+ x, bits ; hand the bit state back to the vector registers
+ vmovq ybits_count %+ x, rcx
+ cmp ptr, in_buf_end
+ jbe .main_loop
+
+.finish: ; scalar tail: encode the remaining tokens one at a time
+ add in_buf_end, VECTOR_LOOP_PROCESSED ; undo the entry adjustments: real token end ...
+ add end_ptr, VECTOR_SLOP ; ... and real m_out_end
+
+ cmp ptr, in_buf_end
+ jge .overflow ; no tokens left: just store the state and return
+
+.finish_loop:
+ mov DWORD(data), [ptr]
+
+ cmp out_buf, end_ptr
+ ja .overflow ; output exhausted: ptr still addresses this unencoded token
+
+ mov sym, data
+ and sym, LIT_MASK ; sym has ll_code
+ mov DWORD(sym), [hufftables + _lit_len_table + sym * 4]
+
+ ; look up dist sym
+ mov dsym, data
+ shr dsym, DIST_OFFSET
+ and dsym, DIST_MASK
+ mov DWORD(dsym), [hufftables + _dist_table + dsym * 4]
+
+ ; insert LL code
+ ; sym: 31:24 length; 23:0 code
+ mov tmp2, sym
+ and sym, 0xFFFFFF
+ SHLX sym, sym, rcx
+ shr tmp2, 24
+ or bits, sym
+ add rcx, tmp2
+
+ ; insert dist code (dsym: 31:24 code length; 23:16 extra bit count; 15:0 code)
+ movzx tmp, WORD(dsym)
+ SHLX tmp, tmp, rcx
+ or bits, tmp
+ mov tmp, dsym
+ shr tmp, 24
+ add rcx, tmp
+
+ ; insert dist extra bits
+ shr data, EXTRA_BITS_OFFSET
+ add ptr, 4
+ SHLX data, data, rcx
+ or bits, data
+ shr dsym, 16
+ and dsym, 0xFF
+ add rcx, dsym
+
+ ; empty bits: store, advance by whole bytes, keep the remainder in bits/rcx
+ mov [out_buf], bits
+ mov tmp, rcx
+ shr tmp, 3 ; byte count
+ add out_buf, tmp
+ mov tmp, rcx
+ and rcx, ~7
+ SHRX bits, bits, rcx
+ mov rcx, tmp
+ and rcx, 7
+
+ cmp ptr, in_buf_end
+ jb .finish_loop
+
+.overflow: ; common exit, reached on normal completion as well as on output exhaustion
+ mov tmp, [rsp + bitbuf_mem_offset] ; BitBuf pointer saved at entry
+ mov [tmp + _m_bits], bits
+ mov [tmp + _m_bit_count], ecx
+ mov [tmp + _m_out_buf], out_buf
+
+ mov rax, ptr ; return the first token that was not encoded
+
+ FUNC_RESTORE
+
+ ret
+
+section .data
+ align 32
+; Per-dword limits on the combined (lit/len + dist + extra) bit length of a
+; token; a vector with any lane above its limit is sent to .long_codes.
+max_write_d:
+ dd 0x1c, 0x1d, 0x20, 0x20
+ dd 0x1e, 0x1e, 0x1e, 0x1e
+; Keeps the low three bits (bit count modulo 8) of the first qword only.
+offset_mask:
+ dq 0x7, 0x0, 0x0, 0x0
+; 64 in the low qword of each 128-bit half; used to form (64 - length).
+q_64:
+ dq 0x40, 0x0, 0x40, 0x0
+; Lit/len symbol field of each of the eight tokens in a vector.
+lit_mask:
+ times 8 dd LIT_MASK
+; Dist symbol field of each token (applied after shifting by DIST_OFFSET).
+dist_mask:
+ times 8 dd DIST_MASK
+; Low 24 bits of a table entry.
+lit_icr_mask:
+ times 8 dd 0x00FFFFFF
+; Low byte; isolates the extra-bit count after the entry is shifted right by 16.
+eb_icr_mask:
+ times 8 dd 0x000000FF