diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
commit | 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch) | |
tree | 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/spdk/isa-l/igzip/igzip_decode_block_stateless.asm | |
parent | Initial commit. (diff) | |
download | ceph-upstream.tar.xz ceph-upstream.zip |
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/isa-l/igzip/igzip_decode_block_stateless.asm')
-rw-r--r-- | src/spdk/isa-l/igzip/igzip_decode_block_stateless.asm | 795 |
1 files changed, 795 insertions, 0 deletions
diff --git a/src/spdk/isa-l/igzip/igzip_decode_block_stateless.asm b/src/spdk/isa-l/igzip/igzip_decode_block_stateless.asm new file mode 100644 index 000000000..f5e35cd68 --- /dev/null +++ b/src/spdk/isa-l/igzip/igzip_decode_block_stateless.asm @@ -0,0 +1,795 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +default rel + +%include "reg_sizes.asm" + +%define DECOMP_OK 0 +%define END_INPUT 1 +%define OUT_OVERFLOW 2 +%define INVALID_BLOCK -1 +%define INVALID_SYMBOL -2 +%define INVALID_LOOKBACK -3 + +%define ISAL_DECODE_LONG_BITS 12 +%define ISAL_DECODE_SHORT_BITS 10 + +%define COPY_SIZE 16 +%define COPY_LEN_MAX 258 + +%define IN_BUFFER_SLOP 8 +%define OUT_BUFFER_SLOP COPY_SIZE + COPY_LEN_MAX + +%include "inflate_data_structs.asm" +%include "stdmac.asm" + +extern rfc1951_lookup_table + + + +%define LARGE_SHORT_SYM_LEN 25 +%define LARGE_SHORT_SYM_MASK ((1 << LARGE_SHORT_SYM_LEN) - 1) +%define LARGE_LONG_SYM_LEN 10 +%define LARGE_LONG_SYM_MASK ((1 << LARGE_LONG_SYM_LEN) - 1) +%define LARGE_SHORT_CODE_LEN_OFFSET 28 +%define LARGE_LONG_CODE_LEN_OFFSET 10 +%define LARGE_FLAG_BIT_OFFSET 25 +%define LARGE_FLAG_BIT (1 << LARGE_FLAG_BIT_OFFSET) +%define LARGE_SYM_COUNT_OFFSET 26 +%define LARGE_SYM_COUNT_LEN 2 +%define LARGE_SYM_COUNT_MASK ((1 << LARGE_SYM_COUNT_LEN) - 1) +%define LARGE_SHORT_MAX_LEN_OFFSET 26 + +%define SMALL_SHORT_SYM_LEN 9 +%define SMALL_SHORT_SYM_MASK ((1 << SMALL_SHORT_SYM_LEN) - 1) +%define SMALL_LONG_SYM_LEN 9 +%define SMALL_LONG_SYM_MASK ((1 << SMALL_LONG_SYM_LEN) - 1) +%define SMALL_SHORT_CODE_LEN_OFFSET 11 +%define SMALL_LONG_CODE_LEN_OFFSET 10 +%define SMALL_FLAG_BIT_OFFSET 10 +%define SMALL_FLAG_BIT (1 << SMALL_FLAG_BIT_OFFSET) + +%define DIST_SYM_OFFSET 0 +%define DIST_SYM_LEN 5 +%define DIST_SYM_MASK ((1 << DIST_SYM_LEN) - 1) +%define DIST_SYM_EXTRA_OFFSET 5 +%define DIST_SYM_EXTRA_LEN 4 +%define DIST_SYM_EXTRA_MASK ((1 << DIST_SYM_EXTRA_LEN) - 1) + +;; rax +%define tmp3 rax +%define read_in_2 rax +%define look_back_dist rax + +;; rcx +;; rdx arg3 +%define next_sym2 rdx +%define copy_start rdx +%define tmp4 rdx + +;; rdi arg1 +%define tmp1 rdi +%define look_back_dist2 rdi +%define next_bits2 rdi +%define next_sym3 rdi + +;; rsi arg2 +%define tmp2 rsi +%define next_sym_num rsi +%define next_bits rsi + +;; rbx ; Saved +%define next_in rbx + +;; rbp ; Saved +%define end_in rbp + +;; r8 +%define repeat_length r8 + +;; r9 +%define read_in r9 + +;; r10 +%define read_in_length r10 + +;; r11 +%define state r11 + +;; r12 ; Saved +%define next_out r12 + +;; r13 ; Saved +%define end_out r13 + +;; r14 ; Saved +%define next_sym r14 + +;; r15 ; Saved +%define rfc_lookup r15 + +start_out_mem_offset equ 0 +read_in_mem_offset equ 8 +read_in_length_mem_offset equ 16 +next_out_mem_offset equ 24 +gpr_save_mem_offset equ 32 +stack_size equ 4 * 8 + 8 * 8 + +%define _dist_extra_bit_count 264 +%define _dist_start _dist_extra_bit_count + 1*32 +%define _len_extra_bit_count _dist_start + 4*32 +%define _len_start _len_extra_bit_count + 1*32 + +%ifidn __OUTPUT_FORMAT__, elf64 +%define arg0 rdi +%define arg1 rsi + +%macro FUNC_SAVE 0 +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, stack_size + and rsp, ~15 +%else + sub rsp, stack_size +%endif + + mov [rsp + gpr_save_mem_offset + 0*8], rbx + mov [rsp + gpr_save_mem_offset + 1*8], rbp + mov [rsp + gpr_save_mem_offset + 2*8], r12 + mov [rsp + gpr_save_mem_offset + 3*8], r13 + mov [rsp + gpr_save_mem_offset + 4*8], r14 + mov [rsp + gpr_save_mem_offset + 5*8], r15 +%endm + +%macro FUNC_RESTORE 0 + mov rbx, [rsp + gpr_save_mem_offset + 0*8] + mov rbp, [rsp + gpr_save_mem_offset + 1*8] + mov r12, [rsp + gpr_save_mem_offset + 2*8] + mov r13, [rsp + gpr_save_mem_offset + 3*8] + mov r14, [rsp + gpr_save_mem_offset + 4*8] + mov r15, [rsp + gpr_save_mem_offset + 5*8] + +%ifndef ALIGN_STACK + add rsp, stack_size +%else + mov rsp, rbp + pop rbp +%endif +%endm +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define arg0 rcx +%define arg1 rdx + +%macro FUNC_SAVE 0 +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, stack_size + and rsp, ~15 +%else + sub rsp, stack_size +%endif + + mov [rsp + gpr_save_mem_offset + 0*8], rbx + mov [rsp + gpr_save_mem_offset + 1*8], rsi + mov [rsp + gpr_save_mem_offset + 2*8], rdi + mov [rsp + gpr_save_mem_offset + 3*8], rbp + mov [rsp + gpr_save_mem_offset + 4*8], r12 + mov [rsp + gpr_save_mem_offset + 5*8], r13 + mov [rsp + gpr_save_mem_offset + 6*8], r14 + mov [rsp + gpr_save_mem_offset + 7*8], r15 +%endm + +%macro FUNC_RESTORE 0 + mov rbx, [rsp + gpr_save_mem_offset + 0*8] + mov rsi, [rsp + gpr_save_mem_offset + 1*8] + mov rdi, [rsp + gpr_save_mem_offset + 2*8] + mov rbp, [rsp + gpr_save_mem_offset + 3*8] + mov r12, [rsp + gpr_save_mem_offset + 4*8] + mov r13, [rsp + gpr_save_mem_offset + 5*8] + mov r14, [rsp + gpr_save_mem_offset + 6*8] + mov r15, [rsp + gpr_save_mem_offset + 7*8] + +%ifndef ALIGN_STACK + add rsp, stack_size +%else + mov rsp, rbp + pop rbp +%endif +%endm +%endif + +;; Load read_in and updated in_buffer accordingly +;; when there are at least 8 bytes in the in buffer +;; Clobbers rcx, unless rcx is %%read_in_length +%macro inflate_in_load 6 +%define %%next_in %1 +%define %%end_in %2 +%define %%read_in %3 +%define %%read_in_length %4 +%define %%tmp1 %5 ; Tmp registers +%define %%tmp2 %6 + + SHLX %%tmp1, [%%next_in], %%read_in_length + or %%read_in, %%tmp1 + + mov %%tmp1, 64 + sub %%tmp1, %%read_in_length + shr %%tmp1, 3 + + add %%next_in, %%tmp1 + lea %%read_in_length, [%%read_in_length + 8 * %%tmp1] +%%end: +%endm + +;; Load read_in and updated in_buffer accordingly +;; Clobbers rcx, unless rcx is %%read_in_length +%macro inflate_in_small_load 6 +%define %%next_in %1 +%define %%end_in %2 +%define %%read_in %3 +%define %%read_in_length %4 +%define %%avail_in %5 ; Tmp registers +%define %%tmp1 %5 +%define %%loop_count %6 + + mov %%avail_in, %%end_in + sub %%avail_in, %%next_in + +%ifnidn %%read_in_length, rcx + mov rcx, %%read_in_length +%endif + + mov %%loop_count, 64 + sub %%loop_count, %%read_in_length + shr %%loop_count, 3 + + cmp %%loop_count, %%avail_in + cmovg %%loop_count, %%avail_in + cmp %%loop_count, 0 + je %%end + +%%load_byte: + xor %%tmp1, %%tmp1 + mov %%tmp1 %+ b, byte [%%next_in] + SHLX %%tmp1, %%tmp1, rcx + or %%read_in, %%tmp1 + add rcx, 8 + add %%next_in, 1 + sub %%loop_count, 1 + jg %%load_byte +%ifnidn %%read_in_length, rcx + mov %%read_in_length, rcx +%endif +%%end: +%endm + +;; Clears all bits at index %%bit_count and above in %%next_bits +;; May clobber rcx and %%bit_count +%macro CLEAR_HIGH_BITS 3 +%define %%next_bits %1 +%define %%bit_count %2 +%define %%lookup_size %3 + + sub %%bit_count, 0x40 + %%lookup_size +;; Extract the 15-DECODE_LOOKUP_SIZE bits beyond the first DECODE_LOOKUP_SIZE bits. +%ifdef USE_HSWNI + and %%bit_count, 0x1F + bzhi %%next_bits, %%next_bits, %%bit_count +%else +%ifnidn %%bit_count, rcx + mov rcx, %%bit_count +%endif + neg rcx + shl %%next_bits, cl + shr %%next_bits, cl +%endif + +%endm + +;; Decode next symbol +;; Clobber rcx +%macro decode_next_lit_len 8 +%define %%state %1 ; State structure associated with compressed stream +%define %%lookup_size %2 ; Number of bits used for small lookup +%define %%state_offset %3 ; Type of huff code, should be either LIT or DIST +%define %%read_in %4 ; Bits read in from compressed stream +%define %%read_in_length %5 ; Number of valid bits in read_in +%define %%next_sym %6 ; Returned symbols +%define %%next_sym_num %7 ; Returned symbols count +%define %%next_bits %8 + + mov %%next_sym_num, %%next_sym + mov rcx, %%next_sym + shr rcx, LARGE_SHORT_CODE_LEN_OFFSET + jz invalid_symbol + + and %%next_sym_num, LARGE_SYM_COUNT_MASK << LARGE_SYM_COUNT_OFFSET + shr %%next_sym_num, LARGE_SYM_COUNT_OFFSET + + ;; Check if symbol or hint was looked up + and %%next_sym, LARGE_FLAG_BIT | LARGE_SHORT_SYM_MASK + test %%next_sym, LARGE_FLAG_BIT + jz %%end + + shl rcx, LARGE_SYM_COUNT_LEN + or rcx, %%next_sym_num + + ;; Save length associated with symbol + mov %%next_bits, %%read_in + shr %%next_bits, %%lookup_size + + ;; Extract the bits beyond the first %%lookup_size bits. + CLEAR_HIGH_BITS %%next_bits, rcx, %%lookup_size + + and %%next_sym, LARGE_SHORT_SYM_MASK + add %%next_sym, %%next_bits + + ;; Lookup actual next symbol + movzx %%next_sym, word [%%state + LARGE_LONG_CODE_SIZE * %%next_sym + %%state_offset + LARGE_SHORT_CODE_SIZE * (1 << %%lookup_size)] + mov %%next_sym_num, 1 + + ;; Save length associated with symbol + mov rcx, %%next_sym + shr rcx, LARGE_LONG_CODE_LEN_OFFSET + jz invalid_symbol + and %%next_sym, LARGE_LONG_SYM_MASK + +%%end: +;; Updated read_in to reflect the bits which were decoded + SHRX %%read_in, %%read_in, rcx + sub %%read_in_length, rcx +%endm + +;; Decode next symbol +;; Clobber rcx +%macro decode_next_lit_len_with_load 8 +%define %%state %1 ; State structure associated with compressed stream +%define %%lookup_size %2 ; Number of bits used for small lookup +%define %%state_offset %3 +%define %%read_in %4 ; Bits read in from compressed stream +%define %%read_in_length %5 ; Number of valid bits in read_in +%define %%next_sym %6 ; Returned symbols +%define %%next_sym_num %7 ; Returned symbols count +%define %%next_bits %8 + + ;; Lookup possible next symbol + mov %%next_bits, %%read_in + and %%next_bits, (1 << %%lookup_size) - 1 + mov %%next_sym %+ d, dword [%%state + %%state_offset + LARGE_SHORT_CODE_SIZE * %%next_bits] + + decode_next_lit_len %%state, %%lookup_size, %%state_offset, %%read_in, %%read_in_length, %%next_sym, %%next_sym_num, %%next_bits +%endm + +;; Decode next symbol +;; Clobber rcx +%macro decode_next_dist 8 +%define %%state %1 ; State structure associated with compressed stream +%define %%lookup_size %2 ; Number of bits used for small lookup +%define %%state_offset %3 ; Type of huff code, should be either LIT or DIST +%define %%read_in %4 ; Bits read in from compressed stream +%define %%read_in_length %5 ; Number of valid bits in read_in +%define %%next_sym %6 ; Returned symobl +%define %%next_extra_bits %7 +%define %%next_bits %8 + + mov rcx, %%next_sym + shr rcx, SMALL_SHORT_CODE_LEN_OFFSET + jz invalid_dist_symbol_ %+ %%next_sym + + ;; Check if symbol or hint was looked up + and %%next_sym, SMALL_FLAG_BIT | SMALL_SHORT_SYM_MASK + test %%next_sym, SMALL_FLAG_BIT + jz %%end + + ;; Save length associated with symbol + mov %%next_bits, %%read_in + shr %%next_bits, %%lookup_size + + ;; Extract the 15-DECODE_LOOKUP_SIZE bits beyond the first %%lookup_size bits. + lea %%next_sym, [%%state + SMALL_LONG_CODE_SIZE * %%next_sym] + + CLEAR_HIGH_BITS %%next_bits, rcx, %%lookup_size + + ;; Lookup actual next symbol + movzx %%next_sym, word [%%next_sym + %%state_offset + SMALL_LONG_CODE_SIZE * %%next_bits + SMALL_SHORT_CODE_SIZE * (1 << %%lookup_size) - SMALL_LONG_CODE_SIZE * SMALL_FLAG_BIT] + + ;; Save length associated with symbol + mov rcx, %%next_sym + shr rcx, SMALL_LONG_CODE_LEN_OFFSET + jz invalid_dist_symbol_ %+ %%next_sym + and %%next_sym, SMALL_SHORT_SYM_MASK + +%%end: + ;; Updated read_in to reflect the bits which were decoded + SHRX %%read_in, %%read_in, rcx + sub %%read_in_length, rcx + mov rcx, %%next_sym + shr rcx, DIST_SYM_EXTRA_OFFSET + and %%next_sym, DIST_SYM_MASK +%endm + +;; Decode next symbol +;; Clobber rcx +%macro decode_next_dist_with_load 8 +%define %%state %1 ; State structure associated with compressed stream +%define %%lookup_size %2 ; Number of bits used for small lookup +%define %%state_offset %3 +%define %%read_in %4 ; Bits read in from compressed stream +%define %%read_in_length %5 ; Number of valid bits in read_in +%define %%next_sym %6 ; Returned symobl +%define %%next_extra_bits %7 +%define %%next_bits %8 + + ;; Lookup possible next symbol + mov %%next_bits, %%read_in + and %%next_bits, (1 << %%lookup_size) - 1 + movzx %%next_sym, word [%%state + %%state_offset + SMALL_SHORT_CODE_SIZE * %%next_bits] + + decode_next_dist %%state, %%lookup_size, %%state_offset, %%read_in, %%read_in_length, %%next_sym, %%next_extra_bits, %%next_bits +%endm + +global decode_huffman_code_block_stateless_ %+ ARCH +decode_huffman_code_block_stateless_ %+ ARCH %+ : + + FUNC_SAVE + + mov state, arg0 + mov [rsp + start_out_mem_offset], arg1 + lea rfc_lookup, [rfc1951_lookup_table] + + mov read_in,[state + _read_in] + mov read_in_length %+ d, dword [state + _read_in_length] + mov next_out, [state + _next_out] + mov end_out %+ d, dword [state + _avail_out] + add end_out, next_out + mov next_in, [state + _next_in] + mov end_in %+ d, dword [state + _avail_in] + add end_in, next_in + + mov dword [state + _copy_overflow_len], 0 + mov dword [state + _copy_overflow_dist], 0 + + sub end_out, OUT_BUFFER_SLOP + sub end_in, IN_BUFFER_SLOP + + cmp next_in, end_in + jg end_loop_block_pre + + cmp read_in_length, 64 + je skip_load + + inflate_in_load next_in, end_in, read_in, read_in_length, tmp1, tmp2 + +skip_load: + mov tmp3, read_in + and tmp3, (1 << ISAL_DECODE_LONG_BITS) - 1 + mov next_sym %+ d, dword [state + _lit_huff_code + LARGE_SHORT_CODE_SIZE * tmp3] + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Main Loop +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +loop_block: + ;; Check if near end of in buffer or out buffer + cmp next_in, end_in + jg end_loop_block_pre + cmp next_out, end_out + jg end_loop_block_pre + + ;; Decode next symbol and reload the read_in buffer + decode_next_lit_len state, ISAL_DECODE_LONG_BITS, _lit_huff_code, read_in, read_in_length, next_sym, next_sym_num, tmp1 + + ;; Specutively write next_sym if it is a literal + mov [next_out], next_sym + add next_out, next_sym_num + lea next_sym2, [8 * next_sym_num - 8] + SHRX next_sym2, next_sym, next_sym2 + + ;; Find index to specutively preload next_sym from + mov tmp3, (1 << ISAL_DECODE_LONG_BITS) - 1 + and tmp3, read_in + + ;; Start reloading read_in + mov tmp1, [next_in] + SHLX tmp1, tmp1, read_in_length + or read_in, tmp1 + + ;; Specutively load data associated with length symbol + lea repeat_length, [next_sym2 - 254] + + ;; Test for end of block symbol + cmp next_sym2, 256 + je end_symbol_pre + + ;; Specutively load next_sym for next loop if a literal was decoded + mov next_sym %+ d, dword [state + _lit_huff_code + LARGE_SHORT_CODE_SIZE * tmp3] + + ;; Finish updating read_in_length for read_in + mov tmp1, 64 + sub tmp1, read_in_length + shr tmp1, 3 + add next_in, tmp1 + lea read_in_length, [read_in_length + 8 * tmp1] + + ;; Specultively load next dist code + mov next_bits2, (1 << ISAL_DECODE_SHORT_BITS) - 1 + and next_bits2, read_in + movzx next_sym3, word [state + _dist_huff_code + SMALL_SHORT_CODE_SIZE * next_bits2] + + ;; Check if next_sym2 is a literal, length, or end of block symbol + cmp next_sym2, 256 + jl loop_block + +decode_len_dist: + ;; Determine next_out after the copy is finished + lea next_out, [next_out + repeat_length - 1] + + ;; Decode distance code + decode_next_dist state, ISAL_DECODE_SHORT_BITS, _dist_huff_code, read_in, read_in_length, next_sym3, rcx, tmp2 + + mov look_back_dist2 %+ d, [rfc_lookup + _dist_start + 4 * next_sym3] + + ; ;; Load distance code extra bits + mov next_bits, read_in + + ;; Calculate the look back distance + BZHI next_bits, next_bits, rcx, tmp4 + SHRX read_in, read_in, rcx + + ;; Setup next_sym, read_in, and read_in_length for next loop + mov read_in_2, (1 << ISAL_DECODE_LONG_BITS) - 1 + and read_in_2, read_in + mov next_sym %+ d, dword [state + _lit_huff_code + LARGE_SHORT_CODE_SIZE * read_in_2] + sub read_in_length, rcx + + ;; Copy distance in len/dist pair + add look_back_dist2, next_bits + + ;; Find beginning of copy + mov copy_start, next_out + sub copy_start, repeat_length + sub copy_start, look_back_dist2 + + ;; Check if a valid look back distances was decoded + cmp copy_start, [rsp + start_out_mem_offset] + jl invalid_look_back_distance + MOVDQU xmm1, [copy_start] + + ;; Set tmp2 to be the minimum of COPY_SIZE and repeat_length + ;; This is to decrease use of small_byte_copy branch + mov tmp2, COPY_SIZE + cmp tmp2, repeat_length + cmovg tmp2, repeat_length + + ;; Check for overlapping memory in the copy + cmp look_back_dist2, tmp2 + jl small_byte_copy_pre + +large_byte_copy: + ;; Copy length distance pair when memory overlap is not an issue + MOVDQU [copy_start + look_back_dist2], xmm1 + + sub repeat_length, COPY_SIZE + jle loop_block + + add copy_start, COPY_SIZE + MOVDQU xmm1, [copy_start] + jmp large_byte_copy + +small_byte_copy_pre: + ;; Copy length distance pair when source and destination overlap + add repeat_length, look_back_dist2 +small_byte_copy: + MOVDQU [copy_start + look_back_dist2], xmm1 + + shl look_back_dist2, 1 + MOVDQU xmm1, [copy_start] + cmp look_back_dist2, COPY_SIZE + jl small_byte_copy + + sub repeat_length, look_back_dist2 + jge large_byte_copy + jmp loop_block + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Finish Main Loop +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +end_loop_block_pre: + ;; Fix up in buffer and out buffer to reflect the actual buffer end + add end_out, OUT_BUFFER_SLOP + add end_in, IN_BUFFER_SLOP + +end_loop_block: + ;; Load read in buffer and decode next lit/len symbol + inflate_in_small_load next_in, end_in, read_in, read_in_length, tmp1, tmp2 + mov [rsp + read_in_mem_offset], read_in + mov [rsp + read_in_length_mem_offset], read_in_length + mov [rsp + next_out_mem_offset], next_out + + decode_next_lit_len_with_load state, ISAL_DECODE_LONG_BITS, _lit_huff_code, read_in, read_in_length, next_sym, next_sym_num, tmp1 + + ;; Check that enough input was available to decode symbol + cmp read_in_length, 0 + jl end_of_input + +multi_symbol_start: + cmp next_sym_num, 1 + jg decode_literal + + cmp next_sym, 256 + jl decode_literal + je end_symbol + +decode_len_dist_2: + lea repeat_length, [next_sym - 254] + ;; Decode distance code + decode_next_dist_with_load state, ISAL_DECODE_SHORT_BITS, _dist_huff_code, read_in, read_in_length, next_sym, rcx, tmp1 + + ;; Load distance code extra bits + mov next_bits, read_in + mov look_back_dist %+ d, [rfc_lookup + _dist_start + 4 * next_sym] + + ;; Calculate the look back distance and check for enough input + BZHI next_bits, next_bits, rcx, tmp1 + SHRX read_in, read_in, rcx + add look_back_dist, next_bits + sub read_in_length, rcx + jl end_of_input + + ;; Setup code for byte copy using rep movsb + mov rsi, next_out + mov rdi, rsi + mov rcx, repeat_length + sub rsi, look_back_dist + + ;; Check if a valid look back distance was decoded + cmp rsi, [rsp + start_out_mem_offset] + jl invalid_look_back_distance + + ;; Check for out buffer overflow + add repeat_length, next_out + cmp repeat_length, end_out + jg out_buffer_overflow_repeat + + mov next_out, repeat_length + + rep movsb + jmp end_loop_block + +decode_literal: + ;; Store literal decoded from the input stream + cmp next_out, end_out + jge out_buffer_overflow_lit + add next_out, 1 + mov byte [next_out - 1], next_sym %+ b + sub next_sym_num, 1 + jz end_loop_block + shr next_sym, 8 + jmp multi_symbol_start + +;; Set exit codes +end_of_input: + mov read_in, [rsp + read_in_mem_offset] + mov read_in_length, [rsp + read_in_length_mem_offset] + mov next_out, [rsp + next_out_mem_offset] + xor tmp1, tmp1 + mov dword [state + _write_overflow_lits], tmp1 %+ d + mov dword [state + _write_overflow_len], tmp1 %+ d + mov rax, END_INPUT + jmp end + +out_buffer_overflow_repeat: + mov rcx, end_out + sub rcx, next_out + sub repeat_length, rcx + sub repeat_length, next_out + rep movsb + + mov [state + _copy_overflow_len], repeat_length %+ d + mov [state + _copy_overflow_dist], look_back_dist %+ d + + mov next_out, end_out + + mov rax, OUT_OVERFLOW + jmp end + +out_buffer_overflow_lit: + mov dword [state + _write_overflow_lits], next_sym %+ d + mov dword [state + _write_overflow_len], next_sym_num %+ d + sub next_sym_num, 1 + shl next_sym_num, 3 + SHRX next_sym, next_sym, next_sym_num + mov rax, OUT_OVERFLOW + shr next_sym_num, 3 + cmp next_sym, 256 + jl end + mov dword [state + _write_overflow_len], next_sym_num %+ d + jg decode_len_dist_2 + jmp end_state + +invalid_look_back_distance: + mov rax, INVALID_LOOKBACK + jmp end + +invalid_dist_symbol_ %+ next_sym: + cmp read_in_length, next_sym + jl end_of_input + jmp invalid_symbol +invalid_dist_symbol_ %+ next_sym3: + cmp read_in_length, next_sym3 + jl end_of_input +invalid_symbol: + mov rax, INVALID_SYMBOL + jmp end + +end_symbol_pre: + ;; Fix up in buffer and out buffer to reflect the actual buffer + sub next_out, 1 + add end_out, OUT_BUFFER_SLOP + add end_in, IN_BUFFER_SLOP +end_symbol: + xor rax, rax +end_state: + ;; Set flag identifying a new block is required + mov byte [state + _block_state], ISAL_BLOCK_NEW_HDR + cmp dword [state + _bfinal], 0 + je end + mov byte [state + _block_state], ISAL_BLOCK_INPUT_DONE + +end: + ;; Save current buffer states + mov [state + _read_in], read_in + mov [state + _read_in_length], read_in_length %+ d + + ;; Set avail_out + sub end_out, next_out + mov dword [state + _avail_out], end_out %+ d + + ;; Set total_out + mov tmp1, next_out + sub tmp1, [state + _next_out] + add [state + _total_out], tmp1 %+ d + + ;; Set next_out + mov [state + _next_out], next_out + + ;; Set next_in + mov [state + _next_in], next_in + + ;; Set avail_in + sub end_in, next_in + mov [state + _avail_in], end_in %+ d + + FUNC_RESTORE + + ret |