summaryrefslogtreecommitdiffstats
path: root/src/isa-l/igzip/igzip_finish.asm
blob: 2b539dd454892399e4ff7ccc9fbdb4c8fe5d416b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "options.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "bitbuf2.asm"
%include "huffman.asm"
%include "igzip_compare_types.asm"

%include "stdmac.asm"
%include "reg_sizes.asm"

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Symbolic names for the general-purpose registers used by the routine
; in this file.  Several names deliberately map to the same physical
; register: aliases that share a register can never be live at the same
; time, so check this table before extending the lifetime of any value.

; rax: input dword loaded for hashing / general scratch
%define curr_data	rax
%define tmp1		rax

; rbx: hash-table entry, length code, and several short-lived scratch values
%define f_index		rbx
%define code		rbx
%define tmp4		rbx
%define tmp5		rbx
%define tmp6		rbx

; rcx: hash value / scratch
%define tmp2		rcx
%define hash		rcx

; rdx: scratch
%define tmp3		rdx

; rsi: pointer to the stream structure
%define stream		rsi

; rdi: current input index
%define f_i		rdi

; rbp: bit length of the code being emitted, or the hash mask
%define code_len2	rbp
%define hmask1		rbp

; r8: current output write pointer
%define m_out_buf	r8

; r9: pending output bits
%define m_bits		r9

; r10: match distance, or a second copy of the hash mask
%define dist		r10
%define hmask2		r10

; r11: number of pending output bits
%define m_bit_count	r11

; r12: code being emitted, or the input end index
%define code2		r12
%define f_end_i		r12

; r13: base pointer such that [file_start + f_i] addresses the current input byte
%define file_start	r13

; r14: match length
%define len		r14

; r15: pointer to the Huffman tables taken from the stream
%define hufftables	r15

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Stack frame layout: a single 8-byte local at [rsp + f_end_i_mem_offset]
; that holds the input end index, because the f_end_i register (r12) is
; also used as code2 while codes are being emitted.
f_end_i_mem_offset	equ 0    ; local variable (8 bytes)
stack_size		equ 8    ; total bytes reserved below the saved registers

; 64-bit code; memory references without an explicit base default to
; RIP-relative addressing (used for the load of c258).
[bits 64]
default rel
section .text

;-----------------------------------------------------------------------
; void isal_deflate_finish ( isal_zstream *stream )
;
; Exported as isal_deflate_finish_01.
;
; Encodes the input still pending in the stream (next_in / avail_in)
; into the output bit buffer:
;   1. loop2       - for every position that is at least LAST_BYTES_COUNT
;                    bytes before the end of the input, look the position
;                    up in the hash table and emit either a dist/len pair
;                    or a literal.
;   2. final_bytes - emit whatever input remains as literals.
;   3. write_eob   - emit the code for symbol 256, set has_eob and move
;                    the state to ZSTATE_TRL (end_of_stream == 1) or
;                    ZSTATE_SYNC_FLUSH (otherwise).
; Every stage stops early (jumping to not_end or end_loop_2) once the
; output pointer has passed m_out_end.  On exit the stream's total_in,
; next_in, avail_in, next_out, avail_out, total_out and the bit-buffer
; state (m_bits, m_bit_count) are written back.
;
; arg 1: addr of stream
;        win64 output format : rcx
;        any other format    : rdi (System V AMD64), copied into rcx
; Return value: none is set up; rax is only used as scratch.
; Stack: stack_size bytes of locals below the registers saved by
;        PUSH_ALL (macro from an included file; POP_ALL is assumed to
;        restore the same set).
;-----------------------------------------------------------------------
global isal_deflate_finish_01
isal_deflate_finish_01:
	endbranch
	PUSH_ALL	rbx, rsi, rdi, rbp, r12, r13, r14, r15
	sub	rsp, stack_size

	; The code below expects the stream pointer in rcx.  Only the win64
	; calling convention delivers it there; every System V target
	; (elf64, macho64, ...) passes it in rdi, so move it for all
	; output formats other than win64.
%ifnidn __OUTPUT_FORMAT__, win64
	mov	rcx, rdi
%endif

	mov	stream, rcx

	; state->bitbuf.set_buf(stream->next_out, stream->avail_out);
	; m_out_end = next_out + avail_out - SLOP
	mov	m_out_buf, [stream + _next_out]
	mov	[stream + _internal_state_bitbuf_m_out_start], m_out_buf
	mov	tmp1 %+ d, [stream + _avail_out]
	add	tmp1, m_out_buf
	sub	tmp1, SLOP
skip_SLOP:	; NOTE(review): no jump to this label is visible in this file
	mov	[stream + _internal_state_bitbuf_m_out_end], tmp1

	; Reload the partially filled bit buffer saved by the previous call.
	mov	m_bits,           [stream + _internal_state_bitbuf_m_bits]
	mov	m_bit_count %+ d, [stream + _internal_state_bitbuf_m_bit_count]

	mov	hufftables, [stream + _hufftables]

	; file_start = next_in - total_in, so [file_start + f_i] is the
	; current input byte when f_i starts at total_in.
	mov	file_start, [stream + _next_in]

	mov	f_i %+ d, dword [stream + _total_in]
	sub	file_start, f_i

	; f_end_i = total_in + avail_in (index one past the last input byte)
	mov	f_end_i %+ d, dword [stream + _avail_in]
	add	f_end_i, f_i

	; The main loop stops LAST_BYTES_COUNT bytes early; the reduced end
	; index is kept on the stack because r12 is reused as code2.
	sub	f_end_i, LAST_BYTES_COUNT
	mov	[rsp + f_end_i_mem_offset], f_end_i
	; for (f_i = f_start_i; f_i < f_end_i; f_i++) {
	; Signed compare: the reduced end index can be below f_i (even
	; negative) when fewer than LAST_BYTES_COUNT bytes are available.
	cmp	f_i, f_end_i
	jge	end_loop_2

	mov	curr_data %+ d, [file_start + f_i]

	; With no history yet, only record this position in the hash table
	; and emit the byte at f_i as a literal.
	cmp	byte [stream + _internal_state_has_hist], IGZIP_NO_HIST
	jne	skip_write_first_byte

	cmp	m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
	ja	end_loop_2
	mov	hmask1 %+ d, dword [stream + _internal_state_hash_mask]
	compute_hash	hash, curr_data
	and	hash %+ d, hmask1 %+ d
	mov	[stream + _internal_state_head + 2 * hash], f_i %+ w
	mov	byte [stream + _internal_state_has_hist], IGZIP_HIST
	jmp	encode_literal

skip_write_first_byte:

loop2:
	; Both masks are reloaded on every iteration: hmask1 shares rbp with
	; code_len2, and tmp3 (rdx) is reused as scratch further down.
	mov	tmp3 %+ d, dword [stream + _internal_state_dist_mask]
	mov	hmask1 %+ d,  dword [stream + _internal_state_hash_mask]
	; if (state->bitbuf.is_full()) {
	cmp	m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
	ja	end_loop_2

	; hash = compute_hash(state->file_start + f_i) & hash_mask;
	mov	curr_data %+ d, [file_start + f_i]
	compute_hash	hash, curr_data
	and	hash %+ d, hmask1 %+ d

	; f_index = state->head[hash];
	movzx	f_index %+ d, word [stream + _internal_state_head + 2 * hash]

	; state->head[hash] = (uint16_t) f_i;
	mov	[stream + _internal_state_head + 2 * hash], f_i %+ w

	; dist = f_i - f_index; // mod 64k
	mov	dist %+ d, f_i %+ d
	sub	dist %+ d, f_index %+ d
	and	dist %+ d, 0xFFFF

	; if ((dist - 1) < dist_mask) {   (unsigned; dist == 0 wraps and fails)
	mov	tmp1 %+ d, dist %+ d
	sub	tmp1 %+ d, 1
	cmp	tmp1 %+ d, tmp3 %+ d
	jae	encode_literal

	; len = f_end_i - f_i;
	; (the stack slot holds the reduced end index, so add
	;  LAST_BYTES_COUNT back to get the real number of bytes left)
	mov	tmp4, [rsp + f_end_i_mem_offset]
	sub	tmp4, f_i
	add	tmp4, LAST_BYTES_COUNT

	; if (len > 258) len = 258;
	cmp	tmp4, 258
	cmovg	tmp4, [c258]

	; len = compare(state->file_start + f_i,
	;               state->file_start + f_i - dist, len);
	lea	tmp1, [file_start + f_i]
	mov	tmp2, tmp1
	sub	tmp2, dist
	compare	tmp4, tmp1, tmp2, len, tmp3

	; if (len >= SHORTEST_MATCH) {
	cmp	len, SHORTEST_MATCH
	jb	encode_literal

	;; encode as dist/len

	; get_dist_code(dist, &code2, &code_len2);
	dec	dist
	get_dist_code	dist, code2, code_len2, hufftables ;; clobbers dist, rcx

	; get_len_code(len, &code, &code_len);
	get_len_code	len, code, rcx, hufftables	;; rcx is code_len

	; dist is no longer needed, so r10 can now hold the hash mask
	; (hmask1/rbp is occupied by code_len2 at this point).
	mov	hmask2 %+ d,  dword [stream + _internal_state_hash_mask]
	; code2 <<= code_len
	; code2 |= code
	; code_len2 += code_len
	SHLX	code2, code2, rcx
	or	code2, code
	add	code_len2, rcx

	; k = f_i + 1; f_i += len;
	; Hash updates are skipped entirely when the new f_i is at or past
	; the reduced end index.
	lea	tmp3, [f_i + 1]	; tmp3 <= k
	add	f_i, len
	cmp	f_i, [rsp + f_end_i_mem_offset]
	jae	skip_hash_update

	; only update hash twice (positions k and k + 1), not for every
	; position covered by the match

	; hash = compute_hash(state->file_start + k) & hash_mask;
	mov	tmp6 %+ d, dword [file_start + tmp3]
	compute_hash	hash, tmp6
	and	hash %+ d, hmask2 %+ d
	; state->head[hash] = k;
	mov	[stream + _internal_state_head + 2 * hash], tmp3 %+ w

	add	tmp3, 1

	; hash = compute_hash(state->file_start + k) & hash_mask;
	mov	tmp6 %+ d, dword [file_start + tmp3]
	compute_hash	hash, tmp6
	and	hash %+ d, hmask2 %+ d
	; state->head[hash] = k;
	mov	[stream + _internal_state_head + 2 * hash], tmp3 %+ w

skip_hash_update:
	; Emit the combined length + distance code.
	write_bits	m_bits, m_bit_count, code2, code_len2, m_out_buf

	; continue
	cmp	f_i, [rsp + f_end_i_mem_offset]
	jl	loop2
	jmp	end_loop_2

encode_literal:
	; get_lit_code(state->file_start[f_i], &code2, &code_len2);
	movzx	tmp5, byte [file_start + f_i]
	get_lit_code	tmp5, code2, code_len2, hufftables

	write_bits	m_bits, m_bit_count, code2, code_len2, m_out_buf

	; continue
	add	f_i, 1
	cmp	f_i, [rsp + f_end_i_mem_offset]
	jl	loop2

end_loop_2:
	; Restore the real end index (undo the LAST_BYTES_COUNT reduction)
	; and keep it in the stack slot for final_bytes and not_end.
	mov	f_end_i, [rsp + f_end_i_mem_offset]
	add	f_end_i, LAST_BYTES_COUNT
	mov	[rsp + f_end_i_mem_offset], f_end_i
	; if ((f_i >= f_end_i) && ! state->bitbuf.is_full()) {
	cmp	f_i, f_end_i
	jge	write_eob

	xor	tmp5, tmp5	; tmp5 is reloaded by the movzx below before it is read
final_bytes:
	; Emit each remaining input byte as a literal while output room remains.
	cmp	m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
	ja	not_end
	movzx	tmp5, byte [file_start + f_i]
	get_lit_code	tmp5, code2, code_len2, hufftables
	write_bits	m_bits, m_bit_count, code2, code_len2, m_out_buf

	inc	f_i
	cmp	f_i, [rsp + f_end_i_mem_offset]
	jl	final_bytes

write_eob:
	; All input consumed: emit the end-of-block symbol if there is room.
	cmp	m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
	ja	not_end

	;	get_lit_code(256, &code2, &code_len2);
	get_lit_code	256, code2, code_len2, hufftables

	write_bits	m_bits, m_bit_count, code2, code_len2, m_out_buf

	mov	byte [stream + _internal_state_has_eob], 1
	cmp	word [stream + _end_of_stream], 1
	jne	sync_flush
	;	   state->state = ZSTATE_TRL;
	mov	dword [stream + _internal_state_state], ZSTATE_TRL
	jmp	not_end

sync_flush:
	;	   state->state = ZSTATE_SYNC_FLUSH;
	mov	dword [stream + _internal_state_state], ZSTATE_SYNC_FLUSH
	;    }
not_end:


	;; Update input buffer
	; total_in = f_i; next_in = file_start + f_i; avail_in = f_end_i - f_i
	mov	f_end_i, [rsp + f_end_i_mem_offset]
	mov	[stream + _total_in], f_i %+ d
	add	file_start, f_i
	mov	[stream + _next_in], file_start
	sub	f_end_i, f_i
	mov	[stream + _avail_in], f_end_i %+ d

	;; Update output buffer
	mov	[stream + _next_out], m_out_buf
	;    len = state->bitbuf.buffer_used();
	;    (m_out_buf now holds the byte count, no longer a pointer)
	sub	m_out_buf, [stream + _internal_state_bitbuf_m_out_start]

	;    stream->avail_out -= len;
	sub	[stream + _avail_out], m_out_buf %+ d
	;    stream->total_out += len;
	add	[stream + _total_out], m_out_buf %+ d

	; Save the partially filled bit buffer for the next call.
	mov	[stream + _internal_state_bitbuf_m_bits], m_bits
	mov	[stream + _internal_state_bitbuf_m_bit_count], m_bit_count %+ d
	add	rsp, stack_size
	POP_ALL
	ret

section .data
	; Maximum match length, kept in memory because cmov cannot take an
	; immediate operand.  It is read with a 64-bit load, so align it to
	; its 8-byte access size.
	align 8
c258:	dq	258