From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Sun, 7 Apr 2024 20:45:59 +0200
Subject: Adding upstream version 16.2.11+ds.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 src/isa-l/igzip/encode_df_06.asm | 620 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 620 insertions(+)
 create mode 100644 src/isa-l/igzip/encode_df_06.asm

(limited to 'src/isa-l/igzip/encode_df_06.asm')

diff --git a/src/isa-l/igzip/encode_df_06.asm b/src/isa-l/igzip/encode_df_06.asm
new file mode 100644
index 000000000..9fa516326
--- /dev/null
+++ b/src/isa-l/igzip/encode_df_06.asm
@@ -0,0 +1,620 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "lz0a_const.asm"
+%include "data_struct2.asm"
+%include "stdmac.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+; ARCH is pasted into the exported symbol name below, giving encode_deflate_icf_06
+%define ARCH 06
+%define USE_HSWNI
+
+; tree entry is 4 bytes:
+; lit/len tree (513 entries)
+; |  3  |  2   |  1 | 0 |
+; | len |       code    |
+;
+; dist tree
+; |   3    |   2   |  1 | 0 |
+; | codlen | eblen |   code |
+
+; token format:
+; DIST_OFFSET:0 : lit/len
+; 31:(DIST_OFFSET + 5) : dist Extra Bits
+; (DIST_OFFSET + 5):DIST_OFFSET : dist code
+; lit/len: 0-256 (literal)
+;          257-512 (len + 254)
+
+; returns the pointer to the first token that was not encoded;
+; this equals token_end if every token was written
+;    uint32_t* encode_deflate_icf_06(uint32_t *token_start, uint32_t *token_end,
+;                            BitBuf *out_buf, uint32_t *trees);
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define sym		rsi
+%define dsym		rdi
+%define hufftables	r9
+%define ptr		r11
+%else
+; Every other output format (e.g. Linux): System V AMD64 argument registers
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define sym		r9
+%define dsym		r8
+%define hufftables	r11
+%define ptr		rdi
+%endif
+
+%define in_buf_end	arg2
+%define bitbuf		arg3
+%define out_buf		bitbuf
+; bit_count lives in rcx: ecx is loaded from and stored back to _m_bit_count
+%define bits		rax
+%define data		r12
+%define tmp		rbx
+%define len 		dsym
+%define tmp2 		r10
+%define end_ptr		rbp
+; Masks applied to a token to isolate its lit/len symbol and (after shifting by DIST_OFFSET) its dist symbol
+%define LIT_MASK	((0x1 << LIT_LEN_BIT_COUNT) - 1)
+%define DIST_MASK	((0x1 << DIST_LIT_BIT_COUNT) - 1)
+; zmm roles. Names sharing a register: codes3/ztmp (zmm5), codes4/syms (zmm7), code_lens4/dsyms/zbits_count_q (zmm8)
+%define codes1		zmm1
+%define code_lens1	zmm2
+%define codes2		zmm3
+%define code_lens2	zmm4
+%define codes3		zmm5
+%define ztmp		zmm5
+%define	code_lens3	zmm6
+%define codes4		zmm7
+%define syms		zmm7
+
+%define code_lens4	zmm8
+%define dsyms		zmm8
+%define zbits_count_q	zmm8
+
+%define codes_lookup1	zmm9
+%define	codes_lookup2	zmm10
+%define datas		zmm11
+%define zbits		zmm12
+%define zbits_count	zmm13
+%define zoffset_mask	zmm14
+%define znotoffset_mask	zmm23
+; Loop-invariant vectors, loaded once before the main loop
+%define zq_64		zmm15
+%define zlit_mask	zmm16
+%define zdist_mask	zmm17
+%define zlit_icr_mask	zmm18
+%define zeb_icr_mask	zmm19
+%define zmax_write	zmm20
+%define zrot_perm	zmm21
+%define zq_8		zmm22
+
+%define VECTOR_SIZE 0x40	; bytes of tokens consumed per vector-loop iteration (one zmm load)
+%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)	; input margin: each iteration also preloads the following vector
+%define VECTOR_SLOP (0x40 - 8)	; output margin held back from _m_out_end; parenthesized so it expands as a single term
+; Stack frame, lowest address first: GPR save slots, XMM save slots (filled on Win64 only), saved bitbuf pointer
+gpr_save_mem_offset	equ	0
+gpr_save_mem_size	equ	8 * 6
+xmm_save_mem_offset	equ	gpr_save_mem_offset + gpr_save_mem_size
+xmm_save_mem_size	equ	10 * 16
+bitbuf_mem_offset	equ	xmm_save_mem_offset + xmm_save_mem_size
+bitbuf_mem_size		equ	8
+stack_size		equ	gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size
+
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size		; FUNC_SAVE: allocate the frame, then spill the callee-saved registers this file uses
+	mov	[rsp + gpr_save_mem_offset + 0*8], rbx
+	mov	[rsp + gpr_save_mem_offset + 1*8], rbp
+	mov	[rsp + gpr_save_mem_offset + 2*8], r12
+	;; On Win64, rsi, rdi and xmm6-xmm15 are callee-saved as well
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	[rsp + gpr_save_mem_offset + 3*8], rsi
+	mov	[rsp + gpr_save_mem_offset + 4*8], rdi
+	;; xmm registers are 16 bytes wide: slots are spaced 16 bytes apart so that no store overlaps its neighbour
+	MOVDQU	[rsp + xmm_save_mem_offset + 0*16], xmm6
+	MOVDQU	[rsp + xmm_save_mem_offset + 1*16], xmm7
+	MOVDQU	[rsp + xmm_save_mem_offset + 2*16], xmm8
+	MOVDQU	[rsp + xmm_save_mem_offset + 3*16], xmm9
+	MOVDQU	[rsp + xmm_save_mem_offset + 4*16], xmm10
+	MOVDQU	[rsp + xmm_save_mem_offset + 5*16], xmm11
+	MOVDQU	[rsp + xmm_save_mem_offset + 6*16], xmm12
+	MOVDQU	[rsp + xmm_save_mem_offset + 7*16], xmm13
+	MOVDQU	[rsp + xmm_save_mem_offset + 8*16], xmm14
+	MOVDQU	[rsp + xmm_save_mem_offset + 9*16], xmm15
+%endif
+
+%endm
+; FUNC_RESTORE: inverse of FUNC_SAVE; reloads the saved registers and releases the frame
+%macro FUNC_RESTORE 0
+	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
+	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
+	mov	r12, [rsp + gpr_save_mem_offset + 2*8]
+	;; Win64 only: reload the additional callee-saved registers
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	rsi, [rsp + gpr_save_mem_offset + 3*8]
+	mov	rdi, [rsp + gpr_save_mem_offset + 4*8]
+	;; Slot spacing must match FUNC_SAVE (16 bytes per xmm register)
+	MOVDQU	xmm6, [rsp + xmm_save_mem_offset + 0*16]
+	MOVDQU	xmm7, [rsp + xmm_save_mem_offset + 1*16]
+	MOVDQU	xmm8, [rsp + xmm_save_mem_offset + 2*16]
+	MOVDQU	xmm9, [rsp + xmm_save_mem_offset + 3*16]
+	MOVDQU	xmm10, [rsp + xmm_save_mem_offset + 4*16]
+	MOVDQU	xmm11, [rsp + xmm_save_mem_offset + 5*16]
+	MOVDQU	xmm12, [rsp + xmm_save_mem_offset + 6*16]
+	MOVDQU	xmm13, [rsp + xmm_save_mem_offset + 7*16]
+	MOVDQU	xmm14, [rsp + xmm_save_mem_offset + 8*16]
+	MOVDQU	xmm15, [rsp + xmm_save_mem_offset + 9*16]
+%endif
+	add	rsp, stack_size
+
+%endmacro
+
+global encode_deflate_icf_ %+ ARCH
+encode_deflate_icf_ %+ ARCH:
+	FUNC_SAVE
+	;; In: arg1 = first token, arg2 = end of tokens, arg3 = bit buffer state, arg4 = code tables. Out: rax = first token not encoded.
+%ifnidn ptr, arg1
+	mov	ptr, arg1
+%endif
+%ifnidn hufftables, arg4
+	mov	hufftables, arg4
+%endif
+
+	mov	[rsp + bitbuf_mem_offset], bitbuf	; reloaded at .overflow to write the state back
+	mov	bits, [bitbuf + _m_bits]
+	mov	ecx, [bitbuf + _m_bit_count]
+	mov	end_ptr, [bitbuf + _m_out_end]
+	mov	out_buf, [bitbuf + _m_out_buf]	; clobbers bitbuf
+
+	sub	end_ptr, VECTOR_SLOP		; vector loop limits: keep a margin before the output end...
+	sub	in_buf_end, VECTOR_LOOP_PROCESSED	; ...and before the end of the tokens
+	cmp	ptr, in_buf_end
+	jge	.finish				; not enough tokens for the vector loop (signed compare)
+
+	kxorq	k0, k0, k0			; k0 = 0; "knotq kN, k0" below yields an all-ones mask
+	kmovq	k1, [k_mask_1]
+	kmovq	k2, [k_mask_2]
+	kmovq	k3, [k_mask_3]
+	kmovq	k4, [k_mask_4]			; NOTE(review): k4 is not referenced again in this file
+	kmovq	k5, [k_mask_5]
+
+	vmovdqa64 zrot_perm, [rot_perm]
+
+	vbroadcasti64x2 zq_64, [q_64]		; NOTE(review): zq_64 is not referenced again in this file
+	vbroadcasti64x2 zq_8, [q_8]		; NOTE(review): zq_8 is not referenced again in this file
+
+	vpbroadcastq zoffset_mask, [offset_mask]
+	vpternlogd znotoffset_mask, znotoffset_mask, zoffset_mask, 0x55	; imm 0x55 = NOT of the third operand: ~zoffset_mask
+
+	vpbroadcastd zlit_mask, [lit_mask]
+	vpbroadcastd zdist_mask, [dist_mask]
+	vpbroadcastd zlit_icr_mask, [lit_icr_mask]
+	vpbroadcastd zeb_icr_mask, [eb_icr_mask]
+	vpbroadcastd zmax_write, [max_write_d]
+	;; Prime the pipeline: load the first 64 bytes of tokens and gather their table entries
+	knotq	k6, k0				; gathers clear their mask register, so it is re-armed before each one
+	vmovdqu64	datas, [ptr]
+	vpandd	syms, datas, zlit_mask
+	vpgatherdd codes_lookup1 {k6}, [hufftables + _lit_len_table + 4 * syms]
+
+	knotq	k7, k0
+	vpsrld	dsyms, datas, DIST_OFFSET
+	vpandd	dsyms, dsyms, zdist_mask
+	vpgatherdd codes_lookup2 {k7}, [hufftables + _dist_table + 4 * dsyms]
+	;; Move the pending bits and their count into the low qword of zbits / zbits_count
+	vmovq	zbits %+ x, bits
+	vmovq	zbits_count %+ x, rcx
+
+.main_loop:
+	;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
+	vpsrld	code_lens1, codes_lookup1, 24
+	vpandd	codes1, codes_lookup1, zlit_icr_mask
+
+	;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
+	;; and code_lens3 the extra bit counts
+	vmovdqu16	codes2 {k1}{z}, codes_lookup2 ; k1 keeps the even words: the low 16 bits (dist code) of each dword
+	vpsrld	code_lens2, codes_lookup2, 24
+	vpsrld	code_lens3, codes_lookup2, 16
+	vpandd	code_lens3, code_lens3, zeb_icr_mask
+
+	;; Set codes3 to contain the extra bits
+	vpsrld	codes3, datas, EXTRA_BITS_OFFSET
+
+	cmp	out_buf, end_ptr
+	ja	.main_loop_exit			; output margin reached; ptr still points at this (unencoded) vector
+
+	;; Start code lookups for next iteration
+	knotq	k6, k0
+	add	ptr, VECTOR_SIZE
+	vmovdqu64	datas, [ptr]
+	vpandd	syms, datas, zlit_mask
+	vpgatherdd codes_lookup1 {k6}, [hufftables + _lit_len_table + 4 * syms]
+
+	knotq	k7, k0
+	vpsrld	dsyms, datas, DIST_OFFSET
+	vpandd	dsyms, dsyms, zdist_mask
+	vpgatherdd codes_lookup2 {k7}, [hufftables + _dist_table + 4 * dsyms]
+
+	;; Merge dist code with extra bits
+	vpsllvd	codes3, codes3, code_lens2
+	vpxord	codes2, codes2, codes3
+	vpaddd	code_lens2, code_lens2, code_lens3
+
+	;; Check for long codes: tokens whose lit/len + dist + extra bits exceed max_write_d bits go to .long_codes
+	vpaddd	code_lens3, code_lens1, code_lens2
+	vpcmpgtd	k6, code_lens3, zmax_write
+	ktestd	k6, k6
+	jnz	.long_codes
+
+	;; Merge dist and len codes
+	vpsllvd	codes2, codes2, code_lens1
+	vpxord	codes1, codes1, codes2
+	;; Split into one token per qword: even tokens in codes3/code_lens3, odd tokens in codes1/code_lens1
+	vmovdqa32 codes3 {k1}{z}, codes1
+	vpsrlq	codes1, codes1, 32
+	vpsrlq	code_lens1, code_lens3, 32
+	vmovdqa32	code_lens3 {k1}{z}, code_lens3
+
+	;; Merge bitbuf bits
+	vpsllvq codes3, codes3, zbits_count
+	vpxord	codes3, codes3, zbits
+	vpaddq	code_lens3, code_lens3, zbits_count
+
+	;; Merge two symbols into qwords
+	vpsllvq	codes1, codes1, code_lens3
+	vpxord codes1, codes1, codes3
+	vpaddq code_lens1, code_lens1, code_lens3
+
+	;; Determine total bits at end of each qword
+	vpermq	zbits_count {k5}{z}, zrot_perm, code_lens1
+	vpaddq	code_lens2, zbits_count, code_lens1
+	vshufi64x2 zbits_count {k3}{z}, code_lens2, code_lens2, 0x90
+	vpaddq	code_lens2, code_lens2, zbits_count
+	vshufi64x2 zbits_count {k2}{z}, code_lens2, code_lens2, 0x40
+	vpaddq	code_lens2, code_lens2, zbits_count
+
+	;; Bit align quadwords
+	vpandd	zbits_count, code_lens2, zoffset_mask
+	vpermq	zbits_count_q {k5}{z}, zrot_perm, zbits_count
+	vpsllvq	codes1, codes1, zbits_count_q
+
+	;; Check whether any of the last bytes overlap
+	vpcmpq k6 {k5}, code_lens1, zbits_count, 1	; predicate 1 = less-than
+
+	;; Get last byte in each qword
+	vpsrlq	code_lens2, code_lens2, 3
+	vpaddq	code_lens1, code_lens1, zbits_count_q
+	vpandq	code_lens1, code_lens1, znotoffset_mask
+	vpsrlvq	codes3, codes1, code_lens1
+
+	;; Branch to handle overlapping last bytes
+	ktestd k6, k6
+	jnz .small_codes
+
+.small_codes_next:
+	;; Save off zbits and zbits_count for next loop
+	knotq	k7, k5
+	vpermq	zbits {k7}{z}, zrot_perm, codes3
+	vpermq	zbits_count {k7}{z}, zrot_perm, zbits_count
+
+	;; Merge last byte in each qword with the next qword
+	vpermq	codes3 {k5}{z}, zrot_perm, codes3
+	vpxord codes1, codes1, codes3
+
+	;; Determine total bytes written (the top qword of code_lens2, already shifted from bits to bytes)
+	vextracti64x2 code_lens1 %+ x, code_lens2, 3
+	vpextrq tmp2, code_lens1 %+ x, 1
+
+	;; Write out qwords, each at its own byte offset from out_buf
+	knotq	k6, k0
+	vpermq code_lens2 {k5}{z}, zrot_perm, code_lens2
+	vpscatterqq [out_buf + code_lens2] {k6}, codes1
+
+	add	out_buf, tmp2
+
+	cmp	ptr, in_buf_end
+	jbe	.main_loop
+
+.main_loop_exit:
+	vmovq	rcx, zbits_count %+ x
+	vmovq	bits, zbits %+ x
+	jmp	.finish				; pending bits and count are back in bits/rcx for the scalar loop
+
+.small_codes:
+	;; Merge overlapping last bytes
+	vpermq	codes4 {k6}{z}, zrot_perm, codes3
+	vporq codes3, codes3, codes4
+	kshiftlq k7, k6, 1
+	ktestd k6, k7
+	jz .small_codes_next
+
+	kandq k6, k6, k7
+	jmp .small_codes
+
+.long_codes:
+	add	end_ptr, VECTOR_SLOP		; this path stores one token at a time, so drop the vector margin
+	sub	ptr, VECTOR_SIZE		; undo the look-ahead advance: ptr is back at the vector being encoded
+
+	vmovdqa32 codes3 {k1}{z}, codes1
+	vmovdqa32 code_lens3 {k1}{z}, code_lens1
+	vmovdqa32 codes4 {k1}{z}, codes2
+
+	vpsllvq	codes4, codes4, code_lens3
+	vpxord	codes3, codes3, codes4
+	vpaddd	code_lens3, code_lens1, code_lens2
+
+	vpsrlq	codes1, codes1, 32
+	vpsrlq	code_lens1, code_lens1, 32
+	vpsrlq	codes2, codes2, 32
+
+	vpsllvq	codes2, codes2, code_lens1
+	vpxord codes1, codes1, codes2
+
+	vpsrlq code_lens1, code_lens3, 32
+	vmovdqa32	code_lens3 {k1}{z}, code_lens3
+
+	;; Merge bitbuf bits
+	vpsllvq codes3, codes3, zbits_count
+	vpxord	codes3, codes3, zbits
+	vpaddq	code_lens3, code_lens3, zbits_count
+	vpaddq code_lens1, code_lens1, code_lens3
+
+	xor	bits, bits
+	xor	rcx, rcx
+	vpsubq code_lens1, code_lens1, code_lens3	; NOTE(review): cancels the vpaddq above; code_lens1 is unchanged overall
+	;; Keep full copies: each %rep pass below handles one 128-bit lane and then extracts the next one
+	vmovdqu64 codes2, codes1
+	vmovdqu64 code_lens2, code_lens1
+	vmovdqu64 codes4, codes3
+	vmovdqu64 code_lens4, code_lens3
+%assign i 0
+%rep 4
+%assign i (i + 1)
+;;;; token 0 of this lane: low qword of codes3 / code_lens3 ;;;;
+	cmp	out_buf, end_ptr
+	ja	.overflow
+	;; insert the merged lit/len + dist + extra-bit code
+	vmovq	sym, codes3 %+ x
+	vmovq	tmp2, code_lens3 %+ x
+	SHLX	sym, sym, rcx
+	or	bits, sym
+	add	rcx, tmp2
+
+	; empty bits: store the whole accumulator, advance past the completed bytes only
+	mov	[out_buf], bits
+	mov	tmp, rcx
+	shr	tmp, 3		; byte count
+	add	out_buf, tmp
+	mov	tmp, rcx
+	and	rcx, ~7
+	SHRX	bits, bits, rcx
+	mov	rcx, tmp
+	and	rcx, 7
+	add	ptr, 4
+
+;;;; token 1 of this lane: low qword of codes1 / code_lens1 ;;;;
+	cmp	out_buf, end_ptr
+	ja	.overflow
+	;; insert the merged lit/len + dist + extra-bit code
+	vmovq	sym, codes1 %+ x
+	vmovq	tmp2, code_lens1 %+ x
+	SHLX	sym, sym, rcx
+	or	bits, sym
+	add	rcx, tmp2
+
+	; empty bits: store the whole accumulator, advance past the completed bytes only
+	mov	[out_buf], bits
+	mov	tmp, rcx
+	shr	tmp, 3		; byte count
+	add	out_buf, tmp
+	mov	tmp, rcx
+	and	rcx, ~7
+	SHRX	bits, bits, rcx
+	mov	rcx, tmp
+	and	rcx, 7
+	add	ptr, 4
+
+;;;; token 2 of this lane: high qword of codes3 / code_lens3 ;;;;
+	cmp	out_buf, end_ptr
+	ja	.overflow
+	;; insert the merged lit/len + dist + extra-bit code
+	vpextrq	sym, codes3 %+ x, 1
+	vpextrq	tmp2, code_lens3 %+ x, 1
+	SHLX	sym, sym, rcx
+	or	bits, sym
+	add	rcx, tmp2
+
+	; empty bits: store the whole accumulator, advance past the completed bytes only
+	mov	[out_buf], bits
+	mov	tmp, rcx
+	shr	tmp, 3		; byte count
+	add	out_buf, tmp
+	mov	tmp, rcx
+	and	rcx, ~7
+	SHRX	bits, bits, rcx
+	mov	rcx, tmp
+	and	rcx, 7
+	add	ptr, 4
+
+;;;; token 3 of this lane: high qword of codes1 / code_lens1 ;;;;
+	cmp	out_buf, end_ptr
+	ja	.overflow
+	;; insert the merged lit/len + dist + extra-bit code
+	vpextrq	sym, codes1 %+ x, 1
+	vpextrq	tmp2, code_lens1 %+ x, 1
+	SHLX	sym, sym, rcx
+	or	bits, sym
+	add	rcx, tmp2
+
+	; empty bits: store the whole accumulator, advance past the completed bytes only
+	mov	[out_buf], bits
+	mov	tmp, rcx
+	shr	tmp, 3		; byte count
+	add	out_buf, tmp
+	mov	tmp, rcx
+	and	rcx, ~7
+	SHRX	bits, bits, rcx
+	mov	rcx, tmp
+	and	rcx, 7
+	add	ptr, 4
+	;; Load 128-bit lane i of the saved copies for the next pass (the values extracted on the last pass are not used)
+	vextracti32x4 codes3 %+ x, codes4, i
+	vextracti32x4 code_lens3 %+ x, code_lens4, i
+	vextracti32x4 codes1 %+ x, codes2, i
+	vextracti32x4 code_lens1 %+ x, code_lens2, i
+%endrep
+	sub	end_ptr, VECTOR_SLOP		; re-apply the vector-loop output margin
+
+	vmovq	zbits %+ x, bits
+	vmovq	zbits_count %+ x, rcx
+	cmp	ptr, in_buf_end
+	jbe	.main_loop
+
+.finish:
+	add	in_buf_end, VECTOR_LOOP_PROCESSED	; scalar tail: restore the real input end...
+	add	end_ptr, VECTOR_SLOP			; ...and the real output end
+
+	cmp	ptr, in_buf_end
+	jge	.overflow			; no tokens left: just write the state back (signed compare)
+
+.finish_loop:
+	mov	DWORD(data), [ptr]
+
+	cmp	out_buf, end_ptr
+	ja	.overflow
+
+	mov	sym, data
+	and	sym, LIT_MASK	; sym has ll_code
+	mov	DWORD(sym), [hufftables + _lit_len_table + sym * 4]
+
+	; look up dist sym
+	mov	dsym, data
+	shr	dsym, DIST_OFFSET
+	and	dsym, DIST_MASK
+	mov	DWORD(dsym), [hufftables + _dist_table + dsym * 4]
+
+	; insert LL code
+	; sym: 31:24 length; 23:0 code
+	mov	tmp2, sym
+	and	sym, 0xFFFFFF
+	SHLX	sym, sym, rcx
+	shr	tmp2, 24
+	or	bits, sym
+	add	rcx, tmp2
+
+	; insert dist code (dsym: 31:24 code length; 23:16 extra bit count; 15:0 code)
+	movzx	tmp, WORD(dsym)
+	SHLX	tmp, tmp, rcx
+	or	bits, tmp
+	mov	tmp, dsym
+	shr	tmp, 24
+	add	rcx, tmp
+
+	; insert dist extra bits
+	shr	data, EXTRA_BITS_OFFSET
+	add	ptr, 4
+	SHLX	data, data, rcx
+	or	bits, data
+	shr	dsym, 16
+	and	dsym, 0xFF
+	add	rcx, dsym
+
+	; empty bits: store the whole accumulator, advance past the completed bytes only
+	mov	[out_buf], bits
+	mov	tmp, rcx
+	shr	tmp, 3		; byte count
+	add	out_buf, tmp
+	mov	tmp, rcx
+	and	rcx, ~7
+	SHRX	bits, bits, rcx
+	mov	rcx, tmp
+	and	rcx, 7
+
+	cmp	ptr, in_buf_end
+	jb	.finish_loop
+
+.overflow:
+	mov	tmp, [rsp + bitbuf_mem_offset]	; bit buffer struct pointer saved at entry
+	mov	[tmp + _m_bits], bits
+	mov	[tmp + _m_bit_count], ecx
+	mov	[tmp + _m_out_buf], out_buf
+
+	mov	rax, ptr			; return value: first token that was not encoded
+
+	FUNC_RESTORE
+
+	ret
+
+section .data
+	align 64
+;; 64 byte data: vpermq index vector; destination qword i takes source qword i-1 (qword 0 takes qword 7)
+rot_perm:
+	dq 0x00000007, 0x00000000, 0x00000001, 0x00000002
+	dq 0x00000003, 0x00000004, 0x00000005, 0x00000006
+
+;; 16 byte data (NOTE(review): q_64 and q_8 are broadcast at entry but not used afterwards in this file)
+q_64:
+	dq 0x0000000000000040, 0x0000000000000000
+q_8 :
+	dq 0x0000000000000000, 0x0000000000000008
+
+;; 8 byte data: selects the low three bits of a bit count (the bit offset within a byte)
+offset_mask:
+	dq 0x0000000000000007
+
+;; 4 byte data, each broadcast to every dword of a zmm register
+max_write_d:
+	dd 0x1c		; tokens whose combined code length exceeds this many bits take the .long_codes path
+lit_mask:
+	dd LIT_MASK
+dist_mask:
+	dd DIST_MASK
+lit_icr_mask:
+	dd 0x00ffffff	; low 24 bits of a lit/len table entry: the code
+eb_icr_mask:
+	dd 0x000000ff	; isolates the extra-bit count after the dist entry is shifted right by 16
+
+;; k mask constants (loaded with kmovq into k1-k5)
+k_mask_1: dq 0x55555555	; even elements; used both as a word mask and as a dword mask
+k_mask_2: dq 0xfffffff0	; as a qword mask: elements 4-7
+k_mask_3: dq 0xfffffffc	; as a qword mask: elements 2-7
+k_mask_4: dw 0x0101, 0x0101, 0x0101, 0x0101	; NOTE(review): loaded into k4, which is never used in this file
+k_mask_5: dq 0xfffffffe	; every element except element 0
+
+%endif
-- 
cgit v1.2.3