; src/isa-l/raid/xor_gen_avx.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Optimized xor of N source vectors using AVX
;;; int xor_gen_avx(int vects, int len, void **array)

;;; Generates xor parity vector from N (vects-1) sources in array of pointers
;;; (**array).  Last pointer is the dest.
;;; Vectors must be aligned to 32 bytes.  Length can be any value.

%include "reg_sizes.asm"

;;; Calling-convention glue.  The output format selects which registers the
;;; argN / tmp names stand for and how the function prologue/epilogue look.
%ifidn __OUTPUT_FORMAT__, win64
 ;; Microsoft x64: integer arguments arrive in rcx, rdx, r8, r9
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9
 %define return rax
 ;; Scratch registers
 %define tmp   r11
 %define tmp3  r10
 ;; proc_frame / alloc_stack / end_prolog are not defined in this file
 ;; (presumably supplied by reg_sizes.asm or the assembler's win64 support)
 %define func(x) proc_frame x
 ;; Room for two 32-byte ymm saves plus 8 bytes of padding
 %define stack_size  2*32 + 8 	;must be an odd multiple of 8

 ;; ymm6/ymm7 are overwritten by the function body and xmm6+ are
 ;; callee-saved under this ABI, so spill them around the body.
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqu	[rsp + 1*32], ymm7
	vmovdqu	[rsp + 0*32], ymm6
	end_prolog
 %endmacro
 %macro FUNC_RESTORE 0
	vmovdqu	ymm7, [rsp + 1*32]
	vmovdqu	ymm6, [rsp + 0*32]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 ;; System V AMD64: integer arguments arrive in rdi, rsi, rdx, rcx, r8, r9
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9
 %define return rax
 ;; Scratch registers (tmp3 reuses the otherwise unused arg4 register)
 %define tmp   r11
 %define tmp3  arg4
 %define func(x) x:
 ;; Nothing to save or restore for this ABI: both hooks expand to nothing
 %define FUNC_SAVE
 %define FUNC_RESTORE

%endif	;output formats


;;; Symbolic names used by xor_gen_avx
%define PS	8		;Bytes per entry of the pointer array
;; Incoming arguments
%define vec	arg0		;Vector count on entry, then index of last source
%define len	arg1		;Length in bytes
;; Working registers
%define ptr	arg3		;Pointer loaded from the array
%define pos	tmp3		;Byte offset used by the 128-byte loop
%define tmp2	rax		;Pointer / 8-byte xor accumulator
%define tmp2.b	al		;Low byte of tmp2, 1-byte xor accumulator

;;; Vector load/store mnemonics.
;;; Loads always use the aligned form.  Stores are non-temporal unless
;;; NO_NT_LDST is defined, in which case they are ordinary aligned stores.
%define XLDR vmovdqa
%ifndef NO_NT_LDST
 %define XSTR vmovntdq
%else
 %define XSTR vmovdqa
%endif


;;; Assembler mode: RIP-relative addressing by default, 64-bit code
default rel
[bits 64]

;;; Code follows
section .text

;;; ----------------------------------------------------------------------
;;; int xor_gen_avx(int vects, int len, void **array)
;;;
;;; XORs array[0] .. array[vects-2] together and writes the result through
;;; array[vects-1].
;;;
;;; In:   arg0 (vec) = vects, arg1 (len) = length in bytes,
;;;       arg2 = array of vects pointers
;;; Out:  rax = 0 on success (including len == 0)
;;;       rax = 1 if vects < 3, or if the tail handling ends with a
;;;             non-zero len below 128
;;; Regs: tmp    = index of the next source to fold in (counts down to 0)
;;;       tmp2   = pointer to the last source (128-byte loop) or the
;;;                scalar xor accumulator (1-byte / 8-byte loops)
;;;       ptr    = pointer fetched from the array
;;;       pos    = byte offset in the 128-byte loop; the same register
;;;                (tmp3) counts tail bytes in the 8-byte loop
;;;       ymm0-3 = running parity, ymm4-7 = freshly loaded source data
;;;
;;; The 128-byte loop uses aligned vector loads/stores (XLDR/XSTR), so the
;;; vectors must be 32-byte aligned.  Bytes beyond a multiple of 128 are
;;; processed first, from the end of the vectors backwards, one byte at a
;;; time until len is a multiple of 8 and then 8 bytes at a time.
;;;
;;; NOTE(review): the prototype declares vects and len as int, but the
;;; code reads the full 64-bit argument registers -- confirm that callers
;;; always pass them extended to 64 bits.
;;; ----------------------------------------------------------------------
align 16
global xor_gen_avx:ISAL_SYM_TYPE_FUNCTION
func(xor_gen_avx)

	FUNC_SAVE
	sub	vec, 2			;Keep as offset to last source
	jng	return_fail		;Must have at least 2 sources
	cmp	len, 0
	je	return_pass
	test	len, (128-1)		;Check alignment of length
	jnz	len_not_aligned


	;; Here len is a non-zero multiple of 128
len_aligned_128bytes:
	sub	len, 128		;len = offset of the last 128-byte block
	mov	pos, 0

loop128:
	mov	tmp, vec		;Back to last vector
	mov	tmp2, [arg2+vec*PS]	;Fetch last pointer in array
	sub	tmp, 1			;Next vect
	XLDR	ymm0, [tmp2+pos]	;Start with end of array in last vector
	XLDR	ymm1, [tmp2+pos+32]	;Keep xor parity in ymm0-3
	XLDR	ymm2, [tmp2+pos+(2*32)]
	XLDR	ymm3, [tmp2+pos+(3*32)]

next_vect:
	mov	ptr, [arg2+tmp*PS]
	sub	tmp, 1			;Sets the flags tested by jge below; the
					; vector loads/xors in between leave
					; the flags untouched
	XLDR	ymm4, [ptr+pos]		;Get next vector (source)
	XLDR	ymm5, [ptr+pos+32]
	XLDR	ymm6, [ptr+pos+(2*32)]
	XLDR	ymm7, [ptr+pos+(3*32)]
	vxorpd	ymm0, ymm0, ymm4	;Add to xor parity
	vxorpd	ymm1, ymm1, ymm5
	vxorpd	ymm2, ymm2, ymm6
	vxorpd	ymm3, ymm3, ymm7
	jge	next_vect		;Loop for each source

	mov	ptr, [arg2+PS+vec*PS]	;Address of parity vector
	XSTR	[ptr+pos], ymm0		;Write parity xor vector
	XSTR	[ptr+pos+(1*32)], ymm1
	XSTR	[ptr+pos+(2*32)], ymm2
	XSTR	[ptr+pos+(3*32)], ymm3
	add	pos, 128
	cmp	pos, len
	jle	loop128			;Until the block at offset len is done

%ifndef NO_NT_LDST
	;; XSTR is a non-temporal store in this build.  Such stores are weakly
	;; ordered, so fence them before returning; otherwise a later ordinary
	;; store by the caller could become visible before the parity data.
	sfence
%endif

return_pass:
	FUNC_RESTORE
	vzeroupper			;Clear upper ymm state before returning
	mov	return, 0
	ret


;;; Do one byte at a time for no alignment case
;;; Handles the byte at offset len-1, then shrinks len, until len is a
;;; multiple of PS (8).
loop_1byte:
	mov	tmp, vec		;Back to last vector
	mov	ptr, [arg2+vec*PS] 	;Fetch last pointer in array
	mov	tmp2.b, [ptr+len-1]	;Get array n
	sub	tmp, 1
nextvect_1byte:
	mov	ptr, [arg2+tmp*PS]
	xor	tmp2.b, [ptr+len-1]
	sub	tmp, 1
	jge	nextvect_1byte		;Loop for each source

	mov	tmp, vec
	add	tmp, 1		  	;Add back to point to last vec
	mov	ptr, [arg2+tmp*PS]
	mov	[ptr+len-1], tmp2.b 	;Write parity
	sub	len, 1
	test	len, (PS-1)
	jnz	loop_1byte

	cmp	len, 0
	je	return_pass
	test	len, (128-1)		;If not 0 and 128-byte aligned
	jz	len_aligned_128bytes	; then do aligned case. len = y * 128

	;; else we are 8-byte aligned so fall through to recheck


	;; Unaligned length cases
len_not_aligned:
	test	len, (PS-1)
	jne	loop_1byte		;Not a multiple of 8: peel single bytes
	mov	tmp3, len
	and	tmp3, (128-1)		;Do the unaligned bytes 8 at a time

	;; Run backwards 8 bytes at a time for (tmp3) bytes
loop8_bytes:
	mov	tmp, vec		;Back to last vector
	mov	ptr, [arg2+vec*PS] 	;Fetch last pointer in array
	mov	tmp2, [ptr+len-PS]	;Get array n
	sub	tmp, 1
nextvect_8bytes:
	mov	ptr, [arg2+tmp*PS] 	;Get pointer to next vector
	xor	tmp2, [ptr+len-PS]
	sub	tmp, 1
	jge	nextvect_8bytes		;Loop for each source

	mov	tmp, vec
	add	tmp, 1		  	;Add back to point to last vec
	mov	ptr, [arg2+tmp*PS]
	mov	[ptr+len-PS], tmp2	;Write parity
	sub	len, PS
	sub	tmp3, PS
	jg	loop8_bytes

	cmp	len, 128		;Now len is aligned to 128B
	jge	len_aligned_128bytes	;We can do the rest aligned

	cmp	len, 0
	je	return_pass

	;; Fewer than 2 sources, or a non-zero len below 128 left over
return_fail:
	FUNC_RESTORE
	vzeroupper			;Clear upper ymm state before returning
	mov	return, 1
	ret

endproc_frame

section .data

;;; Version record for xor_gen_avx.  The slversion macro is not defined in
;;; this file (presumably it comes from reg_sizes.asm); the column labels
;;; below name its arguments.
;;;       func         core, ver, snum
slversion xor_gen_avx, 02,   05,  0037