diff options
Diffstat (limited to 'src/isa-l/igzip/igzip_update_histogram.asm')
-rw-r--r-- | src/isa-l/igzip/igzip_update_histogram.asm | 557 |
1 files changed, 557 insertions, 0 deletions
diff --git a/src/isa-l/igzip/igzip_update_histogram.asm b/src/isa-l/igzip/igzip_update_histogram.asm new file mode 100644 index 00000000..4d91f7fb --- /dev/null +++ b/src/isa-l/igzip/igzip_update_histogram.asm @@ -0,0 +1,557 @@ + +%include "options.asm" + +%include "lz0a_const.asm" +%include "data_struct2.asm" +%include "bitbuf2.asm" +%include "huffman.asm" +%include "igzip_compare_types.asm" +%include "reg_sizes.asm" + +%include "stdmac.asm" + +extern rfc1951_lookup_table +_len_to_code_offset equ 0 + +%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds +%define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary +%define LIT_LEN 286 +%define DIST_LEN 30 +%define HIST_ELEM_SIZE 8 + +%ifdef DEBUG +%macro MARK 1 +global %1 +%1: +%endm +%else +%macro MARK 1 +%endm +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define file_start rdi +%define file_length rsi +%define histogram rdx +%define rfc_lookup r9 +%define f_i r10 + +%define curr_data rax + +%define tmp2 rcx + +%define dist rbx +%define dist_code2 rbx + +%define dist2 r12 +%define dist_code r12 + +%define len rbp +%define len_code rbp +%define hash3 rbp + +%define curr_data2 r8 +%define len2 r8 +%define tmp4 r8 + +%define tmp1 r11 + +%define tmp3 r13 + +%define hash r14 + +%define hash2 r15 + +%define xtmp0 xmm0 +%define xtmp1 xmm1 +%define xdata xmm2 + +%define ytmp0 ymm0 +%define ytmp1 ymm1 + +%if(ARCH == 01) +%define vtmp0 xtmp0 +%define vtmp1 xtmp1 +%define V_LENGTH 16 +%else +%define vtmp0 ytmp0 +%define vtmp1 ytmp1 +%define V_LENGTH 32 +%endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +_eob_count_offset equ 0 ; local variable (8 bytes) +f_end_i_mem_offset equ 8 +gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes) +xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned) +stack_size equ 2*8 + 8*8 + 4*16 + 8 +;;; 8 because stack address is odd multiple of 8 after a function call and +;;; we want it aligned to 16 bytes + +%ifidn __OUTPUT_FORMAT__, elf64 +%define arg0 rdi +%define arg1 rsi +%define arg2 rdx + +%macro FUNC_SAVE 0 +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, stack_size + and rsp, ~15 +%else + sub rsp, stack_size +%endif + + mov [rsp + gpr_save_mem_offset + 0*8], rbx + mov [rsp + gpr_save_mem_offset + 1*8], rbp + mov [rsp + gpr_save_mem_offset + 2*8], r12 + mov [rsp + gpr_save_mem_offset + 3*8], r13 + mov [rsp + gpr_save_mem_offset + 4*8], r14 + mov [rsp + gpr_save_mem_offset + 5*8], r15 +%endm + +%macro FUNC_RESTORE 0 + mov rbx, [rsp + gpr_save_mem_offset + 0*8] + mov rbp, [rsp + gpr_save_mem_offset + 1*8] + mov r12, [rsp + gpr_save_mem_offset + 2*8] + mov r13, [rsp + gpr_save_mem_offset + 3*8] + mov r14, [rsp + gpr_save_mem_offset + 4*8] + mov r15, [rsp + gpr_save_mem_offset + 5*8] + +%ifndef ALIGN_STACK + add rsp, stack_size +%else + mov rsp, rbp + pop rbp +%endif +%endm +%endif + +%ifidn __OUTPUT_FORMAT__, win64 +%define arg0 rcx +%define arg1 rdx +%define arg2 r8 + +%macro FUNC_SAVE 0 +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, stack_size + and rsp, ~15 +%else + sub rsp, stack_size +%endif + + mov [rsp + gpr_save_mem_offset + 0*8], rbx + mov [rsp + gpr_save_mem_offset + 1*8], rsi + mov [rsp + gpr_save_mem_offset + 2*8], rdi + mov [rsp + gpr_save_mem_offset + 3*8], rbp + mov [rsp + gpr_save_mem_offset + 4*8], r12 + mov [rsp + gpr_save_mem_offset + 5*8], r13 + mov [rsp + gpr_save_mem_offset + 6*8], r14 + mov [rsp + gpr_save_mem_offset + 7*8], r15 +%endm + +%macro FUNC_RESTORE 0 + mov rbx, [rsp + gpr_save_mem_offset + 0*8] + mov rsi, [rsp + gpr_save_mem_offset + 1*8] + mov rdi, [rsp + gpr_save_mem_offset + 2*8] + mov rbp, [rsp + gpr_save_mem_offset + 3*8] + mov r12, [rsp + gpr_save_mem_offset + 4*8] + mov r13, [rsp + gpr_save_mem_offset + 5*8] + mov r14, [rsp + gpr_save_mem_offset + 6*8] + mov r15, [rsp + gpr_save_mem_offset + 7*8] + +%ifndef ALIGN_STACK + add rsp, stack_size +%else + mov rsp, rbp + pop rbp +%endif +%endm +%endif + + +_lit_len_offset equ 0 +_dist_offset equ (8 * LIT_LEN) +_hash_offset equ (_dist_offset + 8 * DIST_LEN) + + +%macro len_to_len_code 3 +%define %%len_code %1 ; Output +%define %%len %2 ; Input +%define %%rfc_lookup %3 + movzx %%len_code, byte [%%rfc_lookup + _len_to_code_offset + %%len] + or %%len_code, 0x100 +%endm + +;;; Clobbers rcx and dist +%macro dist_to_dist_code 2 +%define %%dist_code %1 ; Output code associated with dist +%define %%dist_coded %1d +%define %%dist %2d ; Input dist + dec %%dist + mov %%dist_coded, %%dist + bsr ecx, %%dist_coded + dec ecx + SHRX %%dist_code, %%dist_code, rcx + lea %%dist_coded, [%%dist_coded + 2*ecx] + + cmp %%dist, 1 + cmovle %%dist_coded, %%dist +%endm + +;;; Clobbers rcx and dist +%macro dist_to_dist_code2 2 +%define %%dist_code %1 ; Output code associated with dist +%define %%dist_coded %1d +%define %%dist %2d ; Input -(dist - 1) + neg %%dist + mov %%dist_coded, %%dist + bsr ecx, %%dist_coded + dec ecx + SHRX %%dist_code, %%dist_code, rcx + lea %%dist_coded, [%%dist_coded + 2*ecx] + + cmp %%dist, 1 + cmovle %%dist_coded, %%dist +%endm + +; void isal_update_histogram +global isal_update_histogram_ %+ ARCH +isal_update_histogram_ %+ ARCH %+ : + FUNC_SAVE + +%ifnidn file_start, arg0 + mov file_start, arg0 +%endif +%ifnidn file_length, arg1 + mov file_length, arg1 +%endif +%ifnidn histogram, arg2 + mov histogram, arg2 +%endif + mov f_i, 0 + cmp file_length, 0 + je exit_ret ; If nothing to do then exit + + mov tmp1, qword [histogram + _lit_len_offset + 8*256] + inc tmp1 + mov [rsp + _eob_count_offset], tmp1 + + lea rfc_lookup, [rfc1951_lookup_table] + + ;; Init hash_table + PXOR vtmp0, vtmp0, vtmp0 + mov rcx, (IGZIP_HASH_SIZE - V_LENGTH) +init_hash_table: + MOVDQU [histogram + _hash_offset + 2 * rcx], vtmp0 + MOVDQU [histogram + _hash_offset + 2 * (rcx + V_LENGTH / 2)], vtmp0 + sub rcx, V_LENGTH + jge init_hash_table + + sub file_length, LA_STATELESS + cmp file_length, 0 + jle end_loop_2 + + + ;; Load first literal into histogram + mov curr_data, [file_start + f_i] + compute_hash hash, curr_data + and hash %+ d, HASH_MASK + mov [histogram + _hash_offset + 2 * hash], f_i %+ w + and curr_data, 0xff + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data] + inc f_i + + ;; Setup to begin loop 2 + MOVDQU xdata, [file_start + f_i] + mov curr_data, [file_start + f_i] + mov curr_data2, curr_data + compute_hash hash, curr_data + shr curr_data2, 8 + compute_hash hash2, curr_data2 + + and hash2 %+ d, HASH_MASK + and hash, HASH_MASK +loop2: + xor dist, dist + xor dist2, dist2 + xor tmp3, tmp3 + + lea tmp1, [file_start + f_i] + + MOVQ curr_data, xdata + PSRLDQ xdata, 1 + + ;; Load possible look back distances and update hash data + mov dist %+ w, f_i %+ w + sub dist, 1 + sub dist %+ w, word [histogram + _hash_offset + 2 * hash] + mov [histogram + _hash_offset + 2 * hash], f_i %+ w + + add f_i, 1 + + mov dist2 %+ w, f_i %+ w + sub dist2, 1 + sub dist2 %+ w, word [histogram + _hash_offset + 2 * hash2] + mov [histogram + _hash_offset + 2 * hash2], f_i %+ w + + ;; Start computing hashes to be used in either the next loop or + ;; for updating the hash if a match is found + MOVQ curr_data2, xdata + MOVQ tmp2, xdata + shr curr_data2, 8 + compute_hash hash, curr_data2 + + ;; Check if look back distances are valid. Load a junk distance of 1 + ;; if the look back distance is too long for speculative lookups. + and dist %+ d, (D-1) + neg dist + + and dist2 %+ d, (D-1) + neg dist2 + + shr tmp2, 16 + compute_hash hash2, tmp2 + + ;; Check for long len/dist matches (>7) + mov len, curr_data + xor len, [tmp1 + dist - 1] + jz compare_loop + + and hash %+ d, HASH_MASK + and hash2 %+ d, HASH_MASK + + MOVQ len2, xdata + xor len2, [tmp1 + dist2] + jz compare_loop2 + + ;; Specutively load the code for the first literal + movzx tmp1, curr_data %+ b + shr curr_data, 8 + + lea tmp3, [f_i + 1] + + ;; Check for len/dist match for first literal + test len %+ d, 0xFFFFFFFF + jz len_dist_huffman_pre + + ;; Store first literal + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * tmp1] + + ;; Check for len/dist match for second literal + test len2 %+ d, 0xFFFFFFFF + jnz lit_lit_huffman +len_dist_lit_huffman_pre: + ;; Calculate repeat length + tzcnt len2, len2 + shr len2, 3 + +len_dist_lit_huffman: + MOVQ curr_data, xdata + shr curr_data, 24 + compute_hash hash3, curr_data + + ;; Store updated hashes + mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w + add tmp3,1 + mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w + add tmp3, 1 + + add f_i, len2 + + MOVDQU xdata, [file_start + f_i] + mov curr_data, [file_start + f_i] + mov tmp1, curr_data + compute_hash hash, curr_data + + and hash3, HASH_MASK + mov [histogram + _hash_offset + 2 * hash3], tmp3 %+ w + + dist_to_dist_code2 dist_code2, dist2 + + len_to_len_code len_code, len2, rfc_lookup + + shr tmp1, 8 + compute_hash hash2, tmp1 + + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code] + inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code2] + + and hash2 %+ d, HASH_MASK + and hash, HASH_MASK + + cmp f_i, file_length + jl loop2 + jmp end_loop_2 + ;; encode as dist/len + +len_dist_huffman_pre: + tzcnt len, len + shr len, 3 + +len_dist_huffman: + mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w + add tmp3,1 + mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w + + dec f_i + add f_i, len + + MOVDQU xdata, [file_start + f_i] + mov curr_data, [file_start + f_i] + mov tmp1, curr_data + compute_hash hash, curr_data + + dist_to_dist_code2 dist_code, dist + + len_to_len_code len_code, len, rfc_lookup + + shr tmp1, 8 + compute_hash hash2, tmp1 + + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code] + inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code] + + and hash2 %+ d, HASH_MASK + and hash, HASH_MASK + + cmp f_i, file_length + jl loop2 + jmp end_loop_2 + +lit_lit_huffman: + MOVDQU xdata, [file_start + f_i + 1] + and curr_data, 0xff + add f_i, 1 + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data] + + cmp f_i, file_length + jl loop2 + +end_loop_2: + add file_length, LA_STATELESS - LAST_BYTES_COUNT + cmp f_i, file_length + jge final_bytes + +loop2_finish: + mov curr_data %+ d, dword [file_start + f_i] + compute_hash hash, curr_data + and hash %+ d, HASH_MASK + + ;; Calculate possible distance for length/dist pair. + xor dist, dist + mov dist %+ w, f_i %+ w + sub dist %+ w, word [histogram + _hash_offset + 2 * hash] + mov [histogram + _hash_offset + 2 * hash], f_i %+ w + + ;; Check if look back distance is valid (the dec is to handle when dist = 0) + dec dist + cmp dist %+ d, (D-1) + jae encode_literal_finish + inc dist + + ;; Check if look back distance is a match + lea tmp4, [file_length + LAST_BYTES_COUNT] + sub tmp4, f_i + lea tmp1, [file_start + f_i] + mov tmp2, tmp1 + sub tmp2, dist + compare tmp4, tmp1, tmp2, len, tmp3 + + ;; Limit len to maximum value of 258 + mov tmp2, 258 + cmp len, 258 + cmova len, tmp2 + cmp len, SHORTEST_MATCH + jb encode_literal_finish + + add f_i, len + + len_to_len_code len_code, len, rfc_lookup + dist_to_dist_code dist_code, dist + + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code] + inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code] + + cmp f_i, file_length + jl loop2_finish + jmp final_bytes + +encode_literal_finish: + ;; Encode literal + and curr_data %+ d, 0xFF + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data] + + ;; Setup for next loop + add f_i, 1 + cmp f_i, file_length + jl loop2_finish + +final_bytes: + add file_length, LAST_BYTES_COUNT +final_bytes_loop: + cmp f_i, file_length + jge end + movzx curr_data, byte [file_start + f_i] + inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data] + inc f_i + jmp final_bytes_loop + +end: + ;; Handle eob at end of stream + mov tmp1, [rsp + _eob_count_offset] + mov qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256], tmp1 + +exit_ret: + FUNC_RESTORE + ret + +compare_loop: + and hash %+ d, HASH_MASK + and hash2 %+ d, HASH_MASK + lea tmp2, [tmp1 + dist - 1] +%if (COMPARE_TYPE == 1) + compare250 tmp1, tmp2, len, tmp3 +%elif (COMPARE_TYPE == 2) + compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1 +%elif (COMPARE_TYPE == 3) + compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1 +%else + %error Unknown Compare type COMPARE_TYPE + % error +%endif + lea tmp3, [f_i + 1] + jmp len_dist_huffman + +compare_loop2: + add tmp1, 1 + lea tmp2, [tmp1 + dist2 - 1] + +%if (COMPARE_TYPE == 1) + compare250 tmp1, tmp2, len2, tmp3 +%elif (COMPARE_TYPE == 2) + compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1 +%elif (COMPARE_TYPE == 3) + compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1 +%else +%error Unknown Compare type COMPARE_TYPE + % error +%endif + and curr_data, 0xff + inc qword [histogram + _lit_len_offset + 8 * curr_data] + lea tmp3, [f_i + 1] + jmp len_dist_lit_huffman + +section .data + align 32 +D_vector: + dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF + dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF + dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF + dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF |