1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; uint32_t adler32_avx2(uint32_t init, const unsigned char *buf, uint64_t len)
%define LIMIT 5552
%define BASE 0xFFF1 ; 65521
%include "reg_sizes.asm"
default rel
[bits 64]
; need to keep free: eax, ecx, edx
%ifidn __OUTPUT_FORMAT__, elf64
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define init_d edi
%define data r9
%define size r10
%define s r11
%define a_d r12d
%define b_d r8d
%define end r13
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
%endmacro
%macro FUNC_RESTORE 0
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define init_d r12d
%define data r9
%define size r10
%define s r11
%define a_d esi
%define b_d edi
%define end r13
%define stack_size 5*8 ; must be an odd multiple of 8
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_reg rdi, 0*8
save_reg rsi, 1*8
save_reg r12, 2*8
save_reg r13, 3*8
end_prolog
mov init_d, ecx ; initalize init_d from arg1 to keep ecx free
%endmacro
%macro FUNC_RESTORE 0
mov rdi, [rsp + 0*8]
mov rsi, [rsp + 1*8]
mov r12, [rsp + 2*8]
mov r13, [rsp + 3*8]
add rsp, stack_size
%endmacro
%endif
%define xa xmm0
%define xb xmm1
%define xdata0 xmm2
%define xdata1 xmm3
%define xsa xmm4
global adler32_sse:ISAL_SYM_TYPE_FUNCTION
func(adler32_sse)
FUNC_SAVE
mov data, arg2
mov size, arg3
mov b_d, init_d
shr b_d, 16
and init_d, 0xFFFF
cmp size, 32
jb .lt64
movd xa, init_d
pxor xb, xb
.sloop1:
mov s, LIMIT
cmp s, size
cmova s, size ; s = min(size, LIMIT)
lea end, [data + s - 7]
cmp data, end
jae .skip_loop_1a
align 32
.sloop1a:
; do 8 adds
pmovzxbd xdata0, [data]
pmovzxbd xdata1, [data + 4]
add data, 8
paddd xa, xdata0
paddd xb, xa
paddd xa, xdata1
paddd xb, xa
cmp data, end
jb .sloop1a
.skip_loop_1a:
add end, 7
test s, 7
jnz .do_final
; either we're done, or we just did LIMIT
sub size, s
; reduce
pslld xb, 2 ; b is scaled by 4
movdqa xsa, xa ; scaled a
pmulld xsa, [A_SCALE]
phaddd xa, xa
phaddd xb, xb
phaddd xsa, xsa
phaddd xa, xa
phaddd xb, xb
phaddd xsa, xsa
movd eax, xa
xor edx, edx
mov ecx, BASE
div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
mov a_d, edx
psubd xb, xsa
movd eax, xb
add eax, b_d
xor edx, edx
mov ecx, BASE
div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
mov b_d, edx
test size, size
jz .finish
; continue loop
movd xa, a_d
pxor xb, xb
jmp .sloop1
.finish:
mov eax, b_d
shl eax, 16
or eax, a_d
jmp .end
.lt64:
mov a_d, init_d
lea end, [data + size]
test size, size
jnz .final_loop
jmp .zero_size
; handle remaining 1...15 bytes
.do_final:
; reduce
pslld xb, 2 ; b is scaled by 4
movdqa xsa, xa ; scaled a
pmulld xsa, [A_SCALE]
phaddd xa, xa
phaddd xb, xb
phaddd xsa, xsa
phaddd xa, xa
phaddd xb, xb
phaddd xsa, xsa
psubd xb, xsa
movd a_d, xa
movd eax, xb
add b_d, eax
align 32
.final_loop:
movzx eax, byte[data]
add a_d, eax
inc data
add b_d, a_d
cmp data, end
jb .final_loop
.zero_size:
mov eax, a_d
xor edx, edx
mov ecx, BASE
div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
mov a_d, edx
mov eax, b_d
xor edx, edx
mov ecx, BASE
div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
shl edx, 16
or edx, a_d
mov eax, edx
.end:
FUNC_RESTORE
ret
endproc_frame
section .data
align 32
A_SCALE:
dq 0x0000000100000000, 0x0000000300000002
|