path: root/src/isa-l/raid
Diffstat (limited to 'src/isa-l/raid')
-rw-r--r--  src/isa-l/raid/Makefile.am               |  64
-rw-r--r--  src/isa-l/raid/pq_check_sse.asm          | 277
-rw-r--r--  src/isa-l/raid/pq_check_sse_i32.asm      | 282
-rw-r--r--  src/isa-l/raid/pq_check_test.c           | 304
-rw-r--r--  src/isa-l/raid/pq_gen_avx.asm            | 254
-rw-r--r--  src/isa-l/raid/pq_gen_avx2.asm           | 256
-rw-r--r--  src/isa-l/raid/pq_gen_avx512.asm         | 235
-rw-r--r--  src/isa-l/raid/pq_gen_perf.c             |  97
-rw-r--r--  src/isa-l/raid/pq_gen_sse.asm            | 258
-rw-r--r--  src/isa-l/raid/pq_gen_sse_i32.asm        | 264
-rw-r--r--  src/isa-l/raid/pq_gen_test.c             | 194
-rw-r--r--  src/isa-l/raid/raid_base.c               | 147
-rw-r--r--  src/isa-l/raid/raid_base_aliases.c       |  50
-rw-r--r--  src/isa-l/raid/raid_multibinary.asm      | 149
-rw-r--r--  src/isa-l/raid/raid_multibinary_i32.asm  |  58
-rw-r--r--  src/isa-l/raid/xor_check_sse.asm         | 285
-rw-r--r--  src/isa-l/raid/xor_check_test.c          | 280
-rw-r--r--  src/isa-l/raid/xor_example.c             |  70
-rw-r--r--  src/isa-l/raid/xor_gen_avx.asm           | 228
-rw-r--r--  src/isa-l/raid/xor_gen_avx512.asm        | 217
-rw-r--r--  src/isa-l/raid/xor_gen_perf.c            |  98
-rw-r--r--  src/isa-l/raid/xor_gen_sse.asm           | 284
-rw-r--r--  src/isa-l/raid/xor_gen_test.c            | 165
23 files changed, 4516 insertions, 0 deletions
diff --git a/src/isa-l/raid/Makefile.am b/src/isa-l/raid/Makefile.am
new file mode 100644
index 00000000..95490e2c
--- /dev/null
+++ b/src/isa-l/raid/Makefile.am
@@ -0,0 +1,64 @@
+########################################################################
+# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc += raid/raid_base.c
+
+lsrc_base_aliases += raid/raid_base_aliases.c
+
+lsrc_x86_64 += \
+ raid/xor_gen_sse.asm \
+ raid/pq_gen_sse.asm \
+ raid/xor_check_sse.asm \
+ raid/pq_check_sse.asm \
+ raid/pq_gen_avx.asm \
+ raid/xor_gen_avx.asm \
+ raid/pq_gen_avx2.asm \
+ raid/xor_gen_avx512.asm \
+ raid/pq_gen_avx512.asm \
+ raid/raid_multibinary.asm
+
+lsrc_x86_32 += \
+ raid/xor_gen_sse.asm \
+ raid/pq_gen_sse_i32.asm \
+ raid/xor_check_sse.asm \
+ raid/pq_check_sse_i32.asm \
+ raid/raid_multibinary_i32.asm
+
+
+extern_hdrs += include/raid.h
+
+other_src += include/test.h include/types.h
+
+check_tests += raid/xor_gen_test raid/pq_gen_test raid/xor_check_test raid/pq_check_test
+
+perf_tests += raid/xor_gen_perf raid/pq_gen_perf
+
+examples += raid/xor_example
+
+lsrc32 += xor_gen_sse.asm pq_gen_sse_i32.asm xor_check_sse.asm pq_check_sse_i32.asm raid_base.c
diff --git a/src/isa-l/raid/pq_check_sse.asm b/src/isa-l/raid/pq_check_sse.asm
new file mode 100644
index 00000000..96a8177a
--- /dev/null
+++ b/src/isa-l/raid/pq_check_sse.asm
@@ -0,0 +1,277 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq check of N source vectors using SSE4.1
+;;; int pq_check_sse(int vects, int len, void **array)
+
+;;; Checks N (vects-2) source vectors against their P+Q parity in an array of
+;;; pointers (**array). Last two pointers are the P and Q parity vectors to
+;;; verify. Vectors must be aligned to 16 bytes. Length must be a multiple of 16 bytes.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 7*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm15, 6*16
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+	movdqa	xmm15, [rsp + 6*16]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos return
+
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+%define xp3 xmm8
+%define xq3 xmm9
+%define xtmp3 xmm10
+%define xs3 xmm11
+
+%define xpoly xmm15
+
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+	%define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_check_sse:function
+func(pq_check_sse)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+ movdqa xpoly, [poly]
+ cmp len, 48
+ jl loop16
+
+len_aligned_32bytes:
+ sub len, 48 ;Do end of vec first and run backward
+
+loop48:
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src
+ XLDR xp2, [ptr+pos+16] ;Initialize xp2 with P2 src + 16B ahead
+	XLDR	xp3, [ptr+pos+32]	;Initialize xp3 with P3 src + 32B ahead
+ pxor xq1, xq1 ;q1 = 0
+ pxor xq2, xq2 ;q2 = 0
+ pxor xq3, xq3 ;q3 = 0
+
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+ XLDR xs3, [ptr+pos+32] ;Preload last vector (source)
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ pxor xp1, xs1 ; p1 ^= s1
+ pxor xp2, xs2 ; p2 ^= s2
+	pxor	xp3, xs3	; p3 ^= s3
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xq2, xs2 ; q2 ^= s2
+ pxor xq3, xs3 ; q3 ^= s3
+ pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0
+ pxor xtmp2, xtmp2 ; xtmp2 = 0
+ pxor xtmp3, xtmp3 ; xtmp3 = 0
+ pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp3, xq3 ; xtmp3 = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp1 = poly or 0x00
+ pand xtmp2, xpoly ; xtmp2 = poly or 0x00
+ pand xtmp3, xpoly ; xtmp3 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ XLDR xs3, [ptr+pos+32] ; Get next vector (source data3)
+ paddb xq1, xq1 ; q1 = q1<<1
+ paddb xq2, xq2 ; q2 = q2<<1
+ paddb xq3, xq3 ; q3 = q3<<1
+ pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ pxor xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
+
+ pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ pxor xq1, xs1 ;q1 ^= 1 * s1[0]
+ pxor xp2, xs2 ;p2 ^= s2[0]
+ pxor xq2, xs2 ;q2 ^= 1 * s2[0]
+ pxor xp3, xs3 ;p3 ^= s3[0]
+ pxor xq3, xs3 ;q3 ^= 1 * s3[0]
+
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ XLDR xtmp1, [tmp+pos] ;re-init xq1 with Q1 src
+ XLDR xtmp2, [tmp+pos+16] ;re-init xq2 with Q2 src + 16B ahead
+	XLDR	xtmp3, [tmp+pos+32]	;re-init xq3 with Q3 src + 32B ahead
+
+ pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved
+ pxor xq2, xtmp2
+ pxor xq3, xtmp3
+
+ por xp1, xq1 ;Confirm that all P&Q parity are 0
+ por xp1, xp2
+ por xp1, xq2
+ por xp1, xp3
+ por xp1, xq3
+ ptest xp1, xp1
+ jnz return_fail
+ add pos, 48
+ cmp pos, len
+ jle loop48
+
+
+ ;; ------------------------------
+ ;; Do last 16 or 32 Bytes remaining
+ add len, 48
+ cmp pos, len
+ je return_pass
+
+loop16:
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src
+ pxor xq1, xq1 ;q = 0
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ pxor xq1, xs1 ; q ^= s
+ pxor xtmp1, xtmp1 ; xtmp = 0
+ pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp = poly or 0x00
+ pxor xp1, xs1 ; p ^= s
+ paddb xq1, xq1 ; q = q<<1
+ pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0
+
+ pxor xp1, xs1 ;p ^= s[0] - last source is already loaded
+ pxor xq1, xs1 ;q ^= 1 * s[0]
+
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ XLDR xtmp1, [tmp+pos] ;re-init tmp with Q1 src
+ pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved
+
+ por xp1, xq1 ;Confirm that all P&Q parity are = 0
+ ptest xp1, xp1
+ jnz return_fail
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_check_sse, 00, 06, 0033
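
The kernel above is normally reached through the pq_check() dispatcher declared in raid.h; the tests later in this patch call it the same way. A minimal caller sketch, with buffer count and fill values chosen only for illustration:

    /* Generate P+Q over four data buffers with pq_gen(), then verify with
     * pq_check(). 16-byte alignment and length satisfy the SSE kernels. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "raid.h"

    #define SOURCES 4
    #define LEN     1024                        /* multiple of 16 */

    int main(void)
    {
            void *vects[SOURCES + 2];           /* data[0..3], then P, then Q */
            int i;

            for (i = 0; i < SOURCES + 2; i++)
                    if (posix_memalign(&vects[i], 16, LEN))
                            return 1;
            for (i = 0; i < SOURCES; i++)
                    memset(vects[i], 0x5a + i, LEN);  /* arbitrary sample data */

            pq_gen(SOURCES + 2, LEN, vects);    /* fills vects[4] (P), vects[5] (Q) */
            printf("pq_check: %d (0 means parity ok)\n",
                   pq_check(SOURCES + 2, LEN, vects));
            return 0;
    }
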
diff --git a/src/isa-l/raid/pq_check_sse_i32.asm b/src/isa-l/raid/pq_check_sse_i32.asm
new file mode 100644
index 00000000..6c5915f9
--- /dev/null
+++ b/src/isa-l/raid/pq_check_sse_i32.asm
@@ -0,0 +1,282 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq check of N source vectors using SSE4.1
+;;; int pq_check_sse(int vects, int len, void **array)
+
+;;; Checks N (vects-2) source vectors against their P+Q parity in an array of
+;;; pointers (**array). Last two pointers are the P and Q parity vectors to
+;;; verify. Vectors must be aligned to 16 bytes. Length must be a multiple of 16 bytes.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define return rax
+ %define PS 8
+ %define tmp r11
+ %define stack_size 2*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0 edx
+ %define arg1 ecx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2 edi ; must sav/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg0, arg(0)
+ mov arg1, arg(1)
+ mov arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+	mov	esp, ebp	;restore stack pointer from frame pointer
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos return
+
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+ %define xpoly xmm15
+%elifidn PS,4 ; 32-bit code
+ %define xpoly [poly]
+%endif
+
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+align 16
+global pq_check_sse:function
+func(pq_check_sse)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+%ifidn PS,8
+ movdqa xpoly, [poly] ;For 64-bit, load poly into high xmm reg
+%endif
+ cmp len, 32
+ jl loop16
+
+len_aligned_32bytes:
+ sub len, 32 ;Do end of vec first and run backward
+
+loop32:
+ mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src
+ XLDR xp2, [ptr+pos+16] ;Initialize xp2 with P2 src + 16B ahead
+ pxor xq1, xq1 ;q1 = 0
+ pxor xq2, xq2 ;q2 = 0
+
+ mov ptr, [arg2+vec*PS] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*PS] ; get pointer to next vect
+ pxor xp1, xs1 ; p1 ^= s1
+ pxor xp2, xs2 ; p2 ^= s2
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xq2, xs2 ; q2 ^= s2
+ pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0
+ pxor xtmp2, xtmp2 ; xtmp2 = 0
+ pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp1 = poly or 0x00
+ pand xtmp2, xpoly ; xtmp2 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ paddb xq1, xq1 ; q1 = q1<<1
+ paddb xq2, xq2 ; q2 = q2<<1
+ pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
+
+ pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ pxor xq1, xs1 ;q1 ^= 1 * s1[0]
+ pxor xp2, xs2 ;p2 ^= s2[0]
+ pxor xq2, xs2 ;q2 ^= 1 * s2[0]
+
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ XLDR xtmp1, [tmp+pos] ;re-init xq1 with Q1 src
+ XLDR xtmp2, [tmp+pos+16] ;re-init xq2 with Q2 src + 16B ahead
+
+ pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved
+ pxor xq2, xtmp2
+
+ por xp1, xq1 ;Confirm that all P&Q parity are 0
+ por xp1, xp2
+ por xp1, xq2
+ ptest xp1, xp1
+ jnz return_fail
+ add pos, 32
+ cmp pos, len
+ jle loop32
+
+
+ ;; ------------------------------
+ ;; Do last 16 Bytes remaining
+ add len, 32
+ cmp pos, len
+ je return_pass
+
+loop16:
+ mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src
+ pxor xq1, xq1 ;q = 0
+ mov ptr, [arg2+vec*PS] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*PS] ; get pointer to next vect
+ pxor xq1, xs1 ; q ^= s
+ pxor xtmp1, xtmp1 ; xtmp = 0
+ pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp = poly or 0x00
+ pxor xp1, xs1 ; p ^= s
+ paddb xq1, xq1 ; q = q<<1
+ pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0
+
+ pxor xp1, xs1 ;p ^= s[0] - last source is already loaded
+ pxor xq1, xs1 ;q ^= 1 * s[0]
+
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ XLDR xtmp1, [tmp+pos] ;re-init tmp with Q1 src
+ pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved
+
+ por xp1, xq1 ;Confirm that all P&Q parity are = 0
+ ptest xp1, xp1
+ jnz return_fail
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_check_sse, 00, 06, 0033
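
Both check kernels update Q with the same four-instruction idiom (pcmpgtb, pand, paddb, pxor): a branch-free multiply-by-2 in GF(2^8) across 16 bytes at once, using the 0x1d constant from the poly table. One byte of that step in scalar C, for reference:

    #include <stdint.h>

    /* pcmpgtb builds a 0xff/0x00 mask from each byte's sign bit, pand keeps
     * 0x1d only where bit 7 was set, paddb doubles, pxor reduces. */
    static uint8_t gf_mul2(uint8_t q)
    {
            uint8_t mask = (q & 0x80) ? 0xff : 0x00;
            return (uint8_t)(q << 1) ^ (mask & 0x1d);
    }
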
diff --git a/src/isa-l/raid/pq_check_test.c b/src/isa-l/raid/pq_check_test.c
new file mode 100644
index 00000000..8b6d0a1f
--- /dev/null
+++ b/src/isa-l/raid/pq_check_test.c
@@ -0,0 +1,304 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN 1024
+#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+int ref_multi_pq(int vects, int len, void **array)
+{
+ int i, j;
+ unsigned char p, q, s;
+ unsigned char **src = (unsigned char **)array;
+
+ for (i = 0; i < len; i++) {
+ q = p = src[vects - 3][i];
+
+ for (j = vects - 4; j >= 0; j--) {
+ p ^= s = src[j][i];
+			q = s ^ ((q << 1) ^ ((q & 0x80) ? 0x1d : 0));	// multiply q by 2 in GF(2^8), poly 0x11d
+ }
+
+ src[vects - 2][i] = p; // second to last pointer is p
+ src[vects - 1][i] = q; // last pointer is q
+ }
+ return 0;
+}
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(int argc, char *argv[])
+{
+ int i, j, k, ret, fail = 0;
+ void *buffs[TEST_SOURCES + 2];
+ char c;
+ char *tmp_buf[TEST_SOURCES + 2];
+ int serr, lerr;
+
+ printf("Test pq_check_test %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ // Allocate the arrays
+ for (i = 0; i < TEST_SOURCES + 2; i++) {
+ void *buf;
+ if (posix_memalign(&buf, 16, TEST_LEN)) {
+			printf("alloc error: Fail\n");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Test of all zeros
+ for (i = 0; i < TEST_SOURCES + 2; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ ref_multi_pq(TEST_SOURCES + 2, TEST_LEN, buffs);
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret != 0) {
+ fail++;
+ printf("\nfail zero test %d\n", ret);
+ }
+
+ ((char *)(buffs[0]))[TEST_LEN - 2] = 0x7; // corrupt buffer
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nfail corrupt buffer test %d\n", ret);
+ }
+ ((char *)(buffs[0]))[TEST_LEN - 2] = 0; // un-corrupt buffer
+
+ // Test corrupted buffer any location on all sources
+ for (j = 0; j < TEST_SOURCES + 2; j++) {
+ for (i = TEST_LEN - 1; i >= 0; i--) {
+ ((char *)buffs[j])[i] = 0x5; // corrupt buffer
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nfail corrupt zero buffer test j=%d, i=%d\n", j, i);
+ return 1;
+ }
+ ((char *)buffs[j])[i] = 0; // un-corrupt buffer
+ }
+ putchar('.');
+ }
+
+ // Test rand1
+ for (i = 0; i < TEST_SOURCES + 2; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ ref_multi_pq(TEST_SOURCES + 2, TEST_LEN, buffs);
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret != 0) {
+ fail++;
+ printf("fail first rand test %d\n", ret);
+ }
+
+ c = ((char *)(buffs[0]))[TEST_LEN - 2];
+ ((char *)(buffs[0]))[TEST_LEN - 2] = c ^ 0x1;
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nFail corrupt buffer test, passed when should have failed\n");
+ }
+ ((char *)(buffs[0]))[TEST_LEN - 2] = c; // un-corrupt buffer
+
+ // Test corrupted buffer any location on all sources w/ random data
+ for (j = 0; j < TEST_SOURCES + 2; j++) {
+ for (i = TEST_LEN - 1; i >= 0; i--) {
+ // Check it still passes
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret != 0) { // should pass
+ fail++;
+ printf
+ ("\nFail rand test with un-corrupted buffer j=%d, i=%d\n",
+ j, i);
+ return 1;
+ }
+ c = ((char *)buffs[j])[i];
+ ((char *)buffs[j])[i] = c ^ 1; // corrupt buffer
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret == 0) { // Check it now fails
+ fail++;
+ printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i);
+ return 1;
+ }
+ ((char *)buffs[j])[i] = c; // un-corrupt buffer
+ }
+ putchar('.');
+ }
+
+ // Test various number of sources, full length
+ for (j = 4; j <= TEST_SOURCES + 2; j++) {
+ // New random data
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ // Generate p,q parity for this number of sources
+ ref_multi_pq(j, TEST_LEN, buffs);
+
+ // Set errors up in each source and len position
+ for (i = 0; i < j; i++) {
+ for (k = 0; k < TEST_LEN; k++) {
+ // See if it still passes
+ ret = pq_check(j, TEST_LEN, buffs);
+ if (ret != 0) { // Should pass
+ printf("\nfail rand fixed len test %d sources\n", j);
+ fail++;
+ return 1;
+ }
+
+ c = ((char *)buffs[i])[k];
+ ((char *)buffs[i])[k] = c ^ 1; // corrupt buffer
+
+ ret = pq_check(j, TEST_LEN, buffs);
+ if (ret == 0) { // Should fail
+ printf
+ ("\nfail rand fixed len test corrupted buffer %d sources\n",
+ j);
+ fail++;
+ return 1;
+ }
+ ((char *)buffs[i])[k] = c; // un-corrupt buffer
+ }
+ }
+ putchar('.');
+ }
+
+ fflush(0);
+
+ // Test various number of sources and len
+ k = 16;
+ while (k <= TEST_LEN) {
+ char *tmp;
+ for (j = 4; j <= TEST_SOURCES + 2; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], k);
+
+ // Generate p,q parity for this number of sources
+ ref_multi_pq(j, k, buffs);
+
+ // Inject errors at various source and len positions
+ for (lerr = 0; lerr < k; lerr++) {
+ for (serr = 0; serr < j; serr++) {
+ // See if it still passes
+ ret = pq_check(j, k, buffs);
+ if (ret != 0) { // Should pass
+ printf
+ ("\nfail rand var src, len test %d sources, len=%d\n",
+ j, k);
+ fail++;
+ return 1;
+ }
+
+ tmp = (char *)buffs[serr];
+ c = tmp[lerr];
+ ((char *)buffs[serr])[lerr] = c ^ 1; // corrupt buffer
+
+ ret = pq_check(j, k, buffs);
+ if (ret == 0) { // Should fail
+ printf
+ ("\nfail rand var src, len test corrupted buffer "
+ "%d sources, len=%d, ret=%d\n", j, k,
+ ret);
+ fail++;
+ return 1;
+ }
+ ((char *)buffs[serr])[lerr] = c; // un-corrupt buffer
+ }
+ }
+ putchar('.');
+ fflush(0);
+ }
+ k += 16;
+ }
+
+ // Test at the end of buffer
+ for (i = 0; i < TEST_LEN; i += 16) {
+ for (j = 0; j < TEST_SOURCES + 2; j++) {
+ rand_buffer(buffs[j], TEST_LEN - i);
+ tmp_buf[j] = (char *)buffs[j] + i;
+ }
+
+ pq_gen_base(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf);
+
+ // Test good data
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf);
+ if (ret != 0) {
+ printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i);
+ fail++;
+ return 1;
+ }
+ // Test bad data
+ for (serr = 0; serr < TEST_SOURCES + 2; serr++) {
+ for (lerr = 0; lerr < (TEST_LEN - i); lerr++) {
+ c = tmp_buf[serr][lerr];
+ tmp_buf[serr][lerr] = c ^ 1;
+
+ ret =
+ pq_check(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf);
+ if (ret == 0) {
+ printf("fail end test corrupted buffer - "
+ "offset: %d, len: %d, ret: %d\n", i,
+ TEST_LEN - i, ret);
+ fail++;
+ return 1;
+ }
+
+ tmp_buf[serr][lerr] = c;
+ }
+ }
+
+ putchar('.');
+ fflush(0);
+ }
+
+ if (fail == 0)
+ printf("Pass\n");
+
+ return fail;
+
+}
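
ref_multi_pq() above computes Q with Horner's rule: it starts from the highest-indexed source and folds in one multiply-by-2 per step, so source j ends up with coefficient 2^j in GF(2^8). The equivalent closed form, with gf8_mul and ref_q_byte as hypothetical helpers for illustration only:

    /* Q[i] = XOR over j of (2^j) * src[j][i] in GF(2^8) with poly 0x11d. */
    static unsigned char gf8_mul(unsigned char a, unsigned char b)
    {
            unsigned char r = 0;
            while (b) {
                    if (b & 1)
                            r ^= a;         /* accumulate a * current bit of b */
                    a = (unsigned char)(a << 1) ^ ((a & 0x80) ? 0x1d : 0);
                    b >>= 1;
            }
            return r;
    }

    static unsigned char ref_q_byte(unsigned char **src, int nsrc, int i)
    {
            unsigned char q = 0, coef = 1;  /* coef = 2^j, starting at 2^0 */
            int j;
            for (j = 0; j < nsrc; j++) {
                    q ^= gf8_mul(coef, src[j][i]);
                    coef = (unsigned char)(coef << 1) ^ ((coef & 0x80) ? 0x1d : 0);
            }
            return q;
    }
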
diff --git a/src/isa-l/raid/pq_gen_avx.asm b/src/isa-l/raid/pq_gen_avx.asm
new file mode 100644
index 00000000..43c31a52
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_avx.asm
@@ -0,0 +1,254 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX
+;;; int pq_gen_avx(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 8*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm14, 6*16
+ save_xmm128 xmm15, 7*16
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm14, [rsp + 6*16]
+ movdqa xmm15, [rsp + 7*16]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos rax
+
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+%define xp3 xmm8
+%define xq3 xmm9
+%define xtmp3 xmm10
+%define xs3 xmm11
+
+%define xzero xmm14
+%define xpoly xmm15
+
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_gen_avx:function
+func(pq_gen_avx)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+ vmovdqa xpoly, [poly]
+ vpxor xzero, xzero, xzero
+ cmp len, 48
+ jl loop16
+
+len_aligned_32bytes:
+ sub len, 48 ;Len points to last block
+
+loop48:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+ XLDR xs3, [ptr+pos+32] ;Preload last vector (source)
+ vpxor xp1, xp1, xp1 ;p1 = 0
+ vpxor xp2, xp2, xp2 ;p2 = 0
+ vpxor xp3, xp3, xp3 ;p3 = 0
+ vpxor xq1, xq1, xq1 ;q1 = 0
+ vpxor xq2, xq2, xq2 ;q2 = 0
+ vpxor xq3, xq3, xq3 ;q3 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxor xq1, xq1, xs1 ; q1 ^= s1
+ vpxor xq2, xq2, xs2 ; q2 ^= s2
+ vpxor xq3, xq3, xs3 ; q3 ^= s3
+ vpxor xp1, xp1, xs1 ; p1 ^= s1
+ vpxor xp2, xp2, xs2 ; p2 ^= s2
+	vpxor	xp3, xp3, xs3	; p3 ^= s3
+ vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+ vpblendvb xtmp2, xzero, xpoly, xq2 ; xtmp2 = poly or 0x00
+ vpblendvb xtmp3, xzero, xpoly, xq3 ; xtmp3 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ XLDR xs3, [ptr+pos+32] ; Get next vector (source data3)
+ vpaddb xq1, xq1, xq1 ; q1 = q1<<1
+ vpaddb xq2, xq2, xq2 ; q2 = q2<<1
+ vpaddb xq3, xq3, xq3 ; q3 = q3<<1
+ vpxor xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ vpxor xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ vpxor xq3, xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxor xp1, xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ vpxor xq1, xq1, xs1 ;q1 ^= 1 * s1[0]
+ vpxor xp2, xp2, xs2 ;p2 ^= s2[0]
+ vpxor xq2, xq2, xs2 ;q2 ^= 1 * s2[0]
+ vpxor xp3, xp3, xs3 ;p3 ^= s3[0]
+ vpxor xq3, xq3, xs3 ;q3 ^= 1 * s3[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+16], xp2 ;Write parity P2 vector
+ XSTR [ptr+pos+32], xp3 ;Write parity P3 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector
+ XSTR [tmp+pos+32], xq3 ;Write parity Q3 vector
+ add pos, 48
+ cmp pos, len
+ jle loop48
+
+ ;; ------------------------------
+ ;; Do last 16 or 32 Bytes remaining
+ add len, 48
+ cmp pos, len
+ je return_pass
+
+loop16:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ vpxor xp1, xp1, xp1 ;p = 0
+ vpxor xq1, xq1, xq1 ;q = 0
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxor xq1, xq1, xs1 ; q1 ^= s1
+ vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+ vpxor xp1, xp1, xs1 ; p ^= s
+ vpaddb xq1, xq1, xq1 ; q = q<<1
+ vpxor xq1, xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxor xp1, xp1, xs1 ;p ^= s[0] - last source is already loaded
+ vpxor xq1, xq1, xs1 ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1 ;Write parity P vector
+ XSTR [tmp+pos], xq1 ;Write parity Q vector
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_gen_avx, 02, 0a, 0039
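
The AVX kernel collapses the SSE compare-and-mask pair into a single vpblendvb, which selects the poly byte wherever the top bit of q is set. One 16-byte Q update in C intrinsics (a sketch of the idiom, not the shipped code path):

    #include <immintrin.h>

    /* q_new = 2 * (q ^ s) in GF(2^8), 16 bytes per call. */
    static __m128i q_step(__m128i q, __m128i s)
    {
            const __m128i poly = _mm_set1_epi8(0x1d);
            const __m128i zero = _mm_setzero_si128();
            q = _mm_xor_si128(q, s);                      /* vpxor: q ^= s */
            __m128i red = _mm_blendv_epi8(zero, poly, q); /* vpblendvb: poly where bit7 set */
            q = _mm_add_epi8(q, q);                       /* vpaddb: per-byte q << 1 */
            return _mm_xor_si128(q, red);                 /* reduce with 0x1d */
    }

The last source skips the doubling (it carries coefficient 1), which is why the kernel finishes with a plain vpxor after the loop.
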
diff --git a/src/isa-l/raid/pq_gen_avx2.asm b/src/isa-l/raid/pq_gen_avx2.asm
new file mode 100644
index 00000000..96797a62
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_avx2.asm
@@ -0,0 +1,256 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX2
+;;; int pq_gen_avx2(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 32 bytes. Length must be 32 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 8*32 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+	;; Until a save_ymm256 macro is defined
+ vmovdqu [rsp + 0*32], ymm6
+ vmovdqu [rsp + 1*32], ymm7
+ vmovdqu [rsp + 2*32], ymm8
+ vmovdqu [rsp + 3*32], ymm9
+ vmovdqu [rsp + 4*32], ymm10
+ vmovdqu [rsp + 5*32], ymm11
+ vmovdqu [rsp + 6*32], ymm14
+ vmovdqu [rsp + 7*32], ymm15
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqu ymm6, [rsp + 0*32]
+ vmovdqu ymm7, [rsp + 1*32]
+ vmovdqu ymm8, [rsp + 2*32]
+ vmovdqu ymm9, [rsp + 3*32]
+ vmovdqu ymm10, [rsp + 4*32]
+ vmovdqu ymm11, [rsp + 5*32]
+ vmovdqu ymm14, [rsp + 6*32]
+ vmovdqu ymm15, [rsp + 7*32]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos rax
+
+%define xp1 ymm0
+%define xq1 ymm1
+%define xtmp1 ymm2
+%define xs1 ymm3
+
+%define xp2 ymm4
+%define xq2 ymm5
+%define xtmp2 ymm6
+%define xs2 ymm7
+
+%define xp3 ymm8
+%define xq3 ymm9
+%define xtmp3 ymm10
+%define xs3 ymm11
+
+%define xzero ymm14
+%define xpoly ymm15
+
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_gen_avx2:function
+func(pq_gen_avx2)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (32-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+ vmovdqa xpoly, [poly]
+ vpxor xzero, xzero, xzero
+ cmp len, 96
+ jl loop32
+
+len_aligned_32bytes:
+ sub len, 3*32 ;Len points to last block
+
+loop96:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+32] ;Preload last vector (source)
+ XLDR xs3, [ptr+pos+64] ;Preload last vector (source)
+ vpxor xp1, xp1, xp1 ;p1 = 0
+ vpxor xp2, xp2, xp2 ;p2 = 0
+ vpxor xp3, xp3, xp3 ;p3 = 0
+ vpxor xq1, xq1, xq1 ;q1 = 0
+ vpxor xq2, xq2, xq2 ;q2 = 0
+ vpxor xq3, xq3, xq3 ;q3 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxor xq1, xq1, xs1 ; q1 ^= s1
+ vpxor xq2, xq2, xs2 ; q2 ^= s2
+ vpxor xq3, xq3, xs3 ; q3 ^= s3
+ vpxor xp1, xp1, xs1 ; p1 ^= s1
+ vpxor xp2, xp2, xs2 ; p2 ^= s2
+	vpxor	xp3, xp3, xs3	; p3 ^= s3
+ vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+ vpblendvb xtmp2, xzero, xpoly, xq2 ; xtmp2 = poly or 0x00
+ vpblendvb xtmp3, xzero, xpoly, xq3 ; xtmp3 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+32] ; Get next vector (source data2)
+ XLDR xs3, [ptr+pos+64] ; Get next vector (source data3)
+ vpaddb xq1, xq1, xq1 ; q1 = q1<<1
+ vpaddb xq2, xq2, xq2 ; q2 = q2<<1
+ vpaddb xq3, xq3, xq3 ; q3 = q3<<1
+ vpxor xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ vpxor xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ vpxor xq3, xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxor xp1, xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ vpxor xq1, xq1, xs1 ;q1 ^= 1 * s1[0]
+ vpxor xp2, xp2, xs2 ;p2 ^= s2[0]
+ vpxor xq2, xq2, xs2 ;q2 ^= 1 * s2[0]
+ vpxor xp3, xp3, xs3 ;p3 ^= s3[0]
+ vpxor xq3, xq3, xs3 ;q3 ^= 1 * s3[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+32], xp2 ;Write parity P2 vector
+ XSTR [ptr+pos+64], xp3 ;Write parity P3 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+32], xq2 ;Write parity Q2 vector
+ XSTR [tmp+pos+64], xq3 ;Write parity Q3 vector
+ add pos, 3*32
+ cmp pos, len
+ jle loop96
+
+ ;; ------------------------------
+	;; Do last 32 or 64 Bytes remaining
+ add len, 3*32
+ cmp pos, len
+ je return_pass
+
+loop32:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ vpxor xp1, xp1, xp1 ;p = 0
+ vpxor xq1, xq1, xq1 ;q = 0
+
+next_vect32:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxor xq1, xq1, xs1 ; q1 ^= s1
+ vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+ vpxor xp1, xp1, xs1 ; p ^= s
+ vpaddb xq1, xq1, xq1 ; q = q<<1
+ vpxor xq1, xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect32 ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxor xp1, xp1, xs1 ;p ^= s[0] - last source is already loaded
+ vpxor xq1, xq1, xs1 ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1 ;Write parity P vector
+ XSTR [tmp+pos], xq1 ;Write parity Q vector
+ add pos, 32
+ cmp pos, len
+ jl loop32
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 32
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_gen_avx2, 04, 03, 0041
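
Relative to the AVX version this kernel only widens each lane from 16 to 32 bytes, so the alignment and length requirements double. A caller-side guard for those constraints might look like this (hypothetical helper, illustrative only):

    #include <stdint.h>

    /* Reject arguments the 32-byte-lane kernel cannot handle. */
    static int pq_args_ok_32(int len, void **vects, int count)
    {
            int i;
            if (len < 0 || (len & 31))              /* length: multiple of 32 */
                    return 0;
            for (i = 0; i < count; i++)
                    if ((uintptr_t)vects[i] & 31)   /* buffers: 32-byte aligned */
                            return 0;
            return 1;
    }
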
diff --git a/src/isa-l/raid/pq_gen_avx512.asm b/src/isa-l/raid/pq_gen_avx512.asm
new file mode 100644
index 00000000..ac7b29f9
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_avx512.asm
@@ -0,0 +1,235 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX512
+;;; int pq_gen_avx512(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 64 bytes if NO_NT_LDST is not defined.
+;;; Length must be 32 byte multiple.
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 4*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm7
+ vmovdqu [rsp + 2*16], xmm8
+ vmovdqu [rsp + 3*16], xmm9
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqu xmm6, [rsp + 0*16]
+ vmovdqu xmm7, [rsp + 1*16]
+ vmovdqu xmm8, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 3*16]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos rax
+
+%define xp1 zmm0
+%define xq1 zmm1
+%define xtmp1 zmm2
+%define xs1 zmm3
+
+%define xp2 zmm4
+%define xq2 zmm5
+%define xtmp2 zmm6
+%define xs2 zmm7
+
+%define xzero zmm8
+%define xpoly zmm9
+
+%define xp1y ymm0
+%define xq1y ymm1
+%define xtmp1y ymm2
+%define xs1y ymm3
+%define xzeroy ymm8
+%define xpolyy ymm9
+
+%define NO_NT_LDST
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqu8 ;u8
+ %define XSTR vmovdqu8
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_gen_avx512:function
+func(pq_gen_avx512)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (32-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+ mov tmp, 0x1d
+ vpbroadcastb xpoly, tmp
+ vpxorq xzero, xzero, xzero
+ cmp len, 128
+ jl loop32
+
+len_aligned_32bytes:
+ sub len, 2*64 ;Len points to last block
+
+loop128:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+64] ;Preload last vector (source)
+ vpxorq xp1, xp1, xp1 ;p1 = 0
+ vpxorq xp2, xp2, xp2 ;p2 = 0
+ vpxorq xq1, xq1, xq1 ;q1 = 0
+ vpxorq xq2, xq2, xq2 ;q2 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxorq xq1, xq1, xs1 ; q1 ^= s1
+ vpxorq xq2, xq2, xs2 ; q2 ^= s2
+ vpxorq xp1, xp1, xs1 ; p1 ^= s1
+ vpxorq xp2, xp2, xs2 ; p2 ^= s2
+	vpcmpb	k1, xq1, xzero, 1	; k1 = mask of q1 bytes with bit7 set (signed < 0)
+	vpcmpb	k2, xq2, xzero, 1	; k2 = mask of q2 bytes with bit7 set
+	vpblendmb xtmp1 {k1}, xzero, xpoly ; xtmp1 = poly or 0x00
+	vpblendmb xtmp2 {k2}, xzero, xpoly ; xtmp2 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+64] ; Get next vector (source data2)
+ vpaddb xq1, xq1, xq1 ; q1 = q1<<1
+ vpaddb xq2, xq2, xq2 ; q2 = q2<<1
+ vpxorq xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ vpxorq xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxorq xp1, xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ vpxorq xq1, xq1, xs1 ;q1 ^= 1 * s1[0]
+ vpxorq xp2, xp2, xs2 ;p2 ^= s2[0]
+ vpxorq xq2, xq2, xs2 ;q2 ^= 1 * s2[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+64], xp2 ;Write parity P2 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+64], xq2 ;Write parity Q2 vector
+ add pos, 2*64
+ cmp pos, len
+ jle loop128
+
+ ;; ------------------------------
+	;; Do last 32, 64 or 96 Bytes remaining
+ add len, 2*64
+ cmp pos, len
+ je return_pass
+
+loop32:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1y, [ptr+pos] ;Preload last vector (source)
+ vpxorq xp1y, xp1y, xp1y ;p = 0
+ vpxorq xq1y, xq1y, xq1y ;q = 0
+
+next_vect32:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxorq xq1y, xq1y, xs1y ; q1 ^= s1
+ vpblendvb xtmp1y, xzeroy, xpolyy, xq1y ; xtmp1 = poly or 0x00
+ vpxorq xp1y, xp1y, xs1y ; p ^= s
+ vpaddb xq1y, xq1y, xq1y ; q = q<<1
+ vpxorq xq1y, xq1y, xtmp1y ; q = q<<1 ^ poly_masked
+ XLDR xs1y, [ptr+pos] ; Get next vector (source data)
+ jg next_vect32 ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxorq xp1y, xp1y, xs1y ;p ^= s[0] - last source is already loaded
+ vpxorq xq1y, xq1y, xs1y ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1y ;Write parity P vector
+ XSTR [tmp+pos], xq1y ;Write parity Q vector
+ add pos, 32
+ cmp pos, len
+ jl loop32
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+%endif ; ifdef HAVE_AS_KNOWS_AVX512
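
The AVX-512 path replaces the vector blend with mask registers: vpcmpb writes a 64-bit predicate and vpblendmb expands it back into poly/zero bytes. The same 64-byte Q step in intrinsics (a sketch; requires AVX512BW):

    #include <immintrin.h>

    static __m512i q_step512(__m512i q, __m512i s)
    {
            const __m512i poly = _mm512_set1_epi8(0x1d);
            const __m512i zero = _mm512_setzero_si512();
            q = _mm512_xor_si512(q, s);                          /* vpxorq */
            __mmask64 k = _mm512_cmplt_epi8_mask(q, zero);       /* vpcmpb ..., 1 */
            __m512i red = _mm512_mask_blend_epi8(k, zero, poly); /* vpblendmb */
            q = _mm512_add_epi8(q, q);                           /* vpaddb */
            return _mm512_xor_si512(q, red);
    }
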
diff --git a/src/isa-l/raid/pq_gen_perf.c b/src/isa-l/raid/pq_gen_perf.c
new file mode 100644
index 00000000..194f2604
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_perf.c
@@ -0,0 +1,97 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include<sys/time.h>
+#include "raid.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_SOURCES 10
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 800000
+# define TEST_TYPE_STR "_warm"
+#else
+# ifndef TEST_CUSTOM
+// Uncached test. Pull from large mem base.
+# define TEST_SOURCES 10
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
+# define TEST_LOOPS 1000
+# define TEST_TYPE_STR "_cold"
+# else
+# define TEST_TYPE_STR "_cus"
+# ifndef TEST_LOOPS
+# define TEST_LOOPS 1000
+# endif
+# endif
+#endif
+
+#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN))
+
+int main(int argc, char *argv[])
+{
+ int i;
+ void *buffs[TEST_SOURCES + 2];
+ struct perf start, stop;
+
+ printf("Test pq_gen_perf %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN);
+
+ // Allocate the arrays
+ for (i = 0; i < TEST_SOURCES + 2; i++) {
+ int ret;
+ void *buf;
+ ret = posix_memalign(&buf, 64, TEST_LEN);
+ if (ret) {
+			printf("alloc error: Fail\n");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Setup data
+ for (i = 0; i < TEST_SOURCES + 2; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ // Warm up
+ pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs);
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs);
+ perf_stop(&stop);
+ printf("pq_gen" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ return 0;
+}
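
perf_start/perf_stop/perf_print come from include/test.h, which this diff lists under other_src but does not show. The reported figure is total bytes processed over elapsed time; assuming a gettimeofday-based timer, the arithmetic is roughly:

    #include <sys/time.h>

    /* Approximate shape of the perf_print() math (sketch, not the real helper). */
    static double mb_per_sec(struct timeval t0, struct timeval t1,
                             long long total_bytes)
    {
            double secs = (t1.tv_sec - t0.tv_sec) +
                          (t1.tv_usec - t0.tv_usec) / 1e6;
            return total_bytes / secs / 1e6;    /* MB/s */
    }
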
diff --git a/src/isa-l/raid/pq_gen_sse.asm b/src/isa-l/raid/pq_gen_sse.asm
new file mode 100644
index 00000000..1426f3f5
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_sse.asm
@@ -0,0 +1,258 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using SSE3
+;;; int pq_gen_sse(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned.
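+;;;
+;;; Per-byte reference of what the vector loop computes (a sketch, in
+;;; GF(2^8) with polynomial 0x11d, sources numbered 0..N-1):
+;;;   P = s[0] ^ s[1] ^ ... ^ s[N-1]
+;;;   Q = s[0] ^ 2*s[1] ^ 4*s[2] ^ ... ^ 2^(N-1)*s[N-1]
+;;; evaluated Horner-style from the last source down, doubling Q between
+;;; sources, which matches the standard RAID-6 Q parity.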
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 7*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm15, 6*16
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm15, [rsp + 6*16]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos rax
+
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+%define xp3 xmm8
+%define xq3 xmm9
+%define xtmp3 xmm10
+%define xs3 xmm11
+
+%define xpoly xmm15
+
+;;; Use non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_gen_sse:function
+func(pq_gen_sse)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+ movdqa xpoly, [poly]
+ cmp len, 48
+ jl loop16
+
+len_aligned_48bytes:
+ sub len, 48 ;Len points to last block
+
+loop48:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+ XLDR xs3, [ptr+pos+32] ;Preload last vector (source)
+ pxor xp1, xp1 ;p1 = 0
+ pxor xp2, xp2 ;p2 = 0
+ pxor xp3, xp3 ;p3 = 0
+ pxor xq1, xq1 ;q1 = 0
+ pxor xq2, xq2 ;q2 = 0
+ pxor xq3, xq3 ;q3 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xq2, xs2 ; q2 ^= s2
+ pxor xq3, xs3 ; q3 ^= s3
+ pxor xp1, xs1 ; p1 ^= s1
+ pxor xp2, xs2 ; p2 ^= s2
+ pxor xp3, xs3 ; p3 ^= s3
+ pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0
+ pxor xtmp2, xtmp2 ; xtmp2 = 0
+ pxor xtmp3, xtmp3 ; xtmp3 = 0
+ pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp3, xq3 ; xtmp3 = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp1 = poly or 0x00
+ pand xtmp2, xpoly ; xtmp2 = poly or 0x00
+ pand xtmp3, xpoly ; xtmp3 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ XLDR xs3, [ptr+pos+32] ; Get next vector (source data3)
+ paddb xq1, xq1 ; q1 = q1<<1
+ paddb xq2, xq2 ; q2 = q2<<1
+ paddb xq3, xq3 ; q3 = q3<<1
+ pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ pxor xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
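+ ;; GF(2^8) doubling recap for the loop above: pcmpgtb against zero builds
+ ;; a 0xff mask for bytes with bit7 set, pand keeps poly 0x1d only for
+ ;; those bytes, and paddb q,q shifts each byte left by one. Worked byte:
+ ;; q=0x80 -> mask 0xff, (0x80+0x80)&0xff = 0x00, 0x00^0x1d = 0x1d, which
+ ;; is 2*0x80 mod 0x11d.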
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ pxor xq1, xs1 ;q1 ^= 1 * s1[0]
+ pxor xp2, xs2 ;p2 ^= s2[0]
+ pxor xq2, xs2 ;q2 ^= 1 * s2[0]
+ pxor xp3, xs3 ;p3 ^= s3[0]
+ pxor xq3, xs3 ;q3 ^= 1 * s3[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+16], xp2 ;Write parity P2 vector
+ XSTR [ptr+pos+32], xp3 ;Write parity P3 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector
+ XSTR [tmp+pos+32], xq3 ;Write parity Q3 vector
+ add pos, 48
+ cmp pos, len
+ jle loop48
+
+ ;; ------------------------------
+ ;; Do the last 16 or 32 bytes remaining
+ add len, 48
+ cmp pos, len
+ je return_pass
+
+loop16:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ pxor xp1, xp1 ;p = 0
+ pxor xq1, xq1 ;q = 0
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xtmp1, xtmp1 ; xtmp = 0
+ pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp = poly or 0x00
+ pxor xp1, xs1 ; p ^= s
+ paddb xq1, xq1 ; q = q<<1
+ pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ pxor xp1, xs1 ;p ^= s[0] - last source is already loaded
+ pxor xq1, xs1 ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1 ;Write parity P vector
+ XSTR [tmp+pos], xq1 ;Write parity Q vector
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_gen_sse, 00, 09, 0032
diff --git a/src/isa-l/raid/pq_gen_sse_i32.asm b/src/isa-l/raid/pq_gen_sse_i32.asm
new file mode 100644
index 00000000..16093d52
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_sse_i32.asm
@@ -0,0 +1,264 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using SSE3
+;;; int pq_gen_sse(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define return rax
+ %define PS 8
+ %define tmp r10
+ %define stack_size 2*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0 edx
+ %define arg1 ecx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2 edi ; must save/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg0, arg(0)
+ mov arg1, arg(1)
+ mov arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp ;restore esp (ebp holds the frame pointer)
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos return
+
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+ %define xpoly xmm15
+%elifidn PS,4 ; 32-bit code
+ %define xpoly [poly]
+%endif
+
+;;; Use non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+align 16
+global pq_gen_sse:function
+func(pq_gen_sse)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+%ifidn PS,8
+ movdqa xpoly, [poly] ;For 64-bit, load poly into high xmm reg
+%endif
+ cmp len, 32
+ jl loop16
+
+len_aligned_32bytes:
+ sub len, 32 ;Len points to last 32B block
+
+loop32:
+ mov ptr, [arg2+vec*PS] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+ pxor xp1, xp1 ;p1 = 0
+ pxor xq1, xq1 ;q1 = 0
+ pxor xp2, xp2 ;p2 = 0
+ pxor xq2, xq2 ;q2 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*PS] ; get pointer to next vect
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xq2, xs2 ; q2 ^= s2
+ pxor xp1, xs1 ; p1 ^= s1
+ pxor xp2, xs2 ; p2 ^= s2
+ pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0
+ pxor xtmp2, xtmp2 ; xtmp2 = 0
+ pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp1 = poly or 0x00
+ pand xtmp2, xpoly ; xtmp2 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ paddb xq1, xq1 ; q1 = q1<<1
+ paddb xq2, xq2 ; q2 = q2<<1
+ pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
+
+ mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ pxor xq1, xs1 ;q1 ^= 1 * s1[0]
+ pxor xp2, xs2 ;p2 ^= s2[0]
+ pxor xq2, xs2 ;q2 ^= 1 * s2[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+16], xp2 ;Write parity P2 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector
+ add pos, 32
+ cmp pos, len
+ jle loop32
+
+ ;; ------------------------------
+ ;; Do the last 16 bytes remaining
+ add len, 32
+ cmp pos, len
+ je return_pass
+
+loop16:
+ mov ptr, [arg2+vec*PS] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ pxor xp1, xp1 ;p = 0
+ pxor xq1, xq1 ;q = 0
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*PS] ; get pointer to next vect
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xtmp1, xtmp1 ; xtmp = 0
+ pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp = poly or 0x00
+ pxor xp1, xs1 ; p ^= s
+ paddb xq1, xq1 ; q = q<<1
+ pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0
+
+ mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ pxor xp1, xs1 ;p ^= s[0] - last source is already loaded
+ pxor xq1, xs1 ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1 ;Write parity P vector
+ XSTR [tmp+pos], xq1 ;Write parity Q vector
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_gen_sse, 00, 08, 0032
diff --git a/src/isa-l/raid/pq_gen_test.c b/src/isa-l/raid/pq_gen_test.c
new file mode 100644
index 00000000..d0844964
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_test.c
@@ -0,0 +1,194 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include<limits.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN 1024
+#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int dump(unsigned char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", buf[i++]);
+ if (i % 16 == 0)
+ printf("\n");
+ }
+ printf("\n");
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int i, j, k, ret, fail = 0;
+ void *buffs[TEST_SOURCES + 2]; // Pointers to src and dest
+ char *tmp_buf[TEST_SOURCES + 2];
+
+ printf("Test pq_gen_test ");
+
+ srand(TEST_SEED);
+
+ // Allocate the arrays
+ for (i = 0; i < TEST_SOURCES + 2; i++) {
+ void *buf;
+ ret = posix_memalign(&buf, 32, TEST_LEN);
+ if (ret) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Test of all zeros
+ for (i = 0; i < TEST_SOURCES + 2; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs);
+
+ for (i = 0; i < TEST_LEN; i++) {
+ if (((char *)buffs[TEST_SOURCES])[i] != 0)
+ fail++;
+ }
+
+ for (i = 0; i < TEST_LEN; i++) {
+ if (((char *)buffs[TEST_SOURCES + 1])[i] != 0)
+ fail++;
+ }
+
+ if (fail > 0) {
+ printf("fail zero test %d\n", fail);
+ return 1;
+ } else
+ putchar('.');
+
+ // Test rand1
+ for (i = 0; i < TEST_SOURCES + 2; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ ret = pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs);
+ fail |= pq_check_base(TEST_SOURCES + 2, TEST_LEN, buffs);
+
+ if (fail > 0) {
+ int t;
+ printf(" Fail rand test1 fail=%d, ret=%d\n", fail, ret);
+ for (t = 0; t < TEST_SOURCES + 2; t++)
+ dump(buffs[t], 15);
+
+ printf(" reference function p,q\n");
+ pq_gen_base(TEST_SOURCES + 2, TEST_LEN, buffs);
+ for (t = TEST_SOURCES; t < TEST_SOURCES + 2; t++)
+ dump(buffs[t], 15);
+
+ return 1;
+ } else
+ putchar('.');
+
+ // Test various number of sources
+ for (j = 4; j <= TEST_SOURCES + 2; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ pq_gen(j, TEST_LEN, buffs);
+ fail |= pq_check_base(j, TEST_LEN, buffs);
+
+ if (fail > 0) {
+ printf("fail rand test %d sources\n", j);
+ return 1;
+ } else
+ putchar('.');
+ }
+
+ fflush(0);
+
+ // Test various number of sources and len
+ k = 0;
+ while (k <= TEST_LEN) {
+ for (j = 4; j <= TEST_SOURCES + 2; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], k);
+
+ ret = pq_gen(j, k, buffs);
+ fail |= pq_check_base(j, k, buffs);
+
+ if (fail > 0) {
+ printf("fail rand test %d sources, len=%d, fail="
+ "%d, ret=%d\n", j, k, fail, ret);
+ return 1;
+ }
+ }
+ putchar('.');
+ k += 32;
+ }
+
+ // Test at the end of buffer
+ k = 0;
+ while (k <= TEST_LEN) {
+ for (j = 0; j < (TEST_SOURCES + 2); j++) {
+ rand_buffer(buffs[j], TEST_LEN - k);
+ tmp_buf[j] = (char *)buffs[j] + k;
+ }
+
+ ret = pq_gen(TEST_SOURCES + 2, TEST_LEN - k, (void *)tmp_buf);
+ fail |= pq_check_base(TEST_SOURCES + 2, TEST_LEN - k, (void *)tmp_buf);
+
+ if (fail > 0) {
+ printf("fail end test - offset: %d, len: %d, fail: %d, "
+ "ret: %d\n", k, TEST_LEN - k, fail, ret);
+ return 1;
+ }
+
+ putchar('.');
+ fflush(0);
+ k += 32;
+ }
+
+ if (!fail)
+ printf(" done: Pass\n");
+
+ return fail;
+}
diff --git a/src/isa-l/raid/raid_base.c b/src/isa-l/raid/raid_base.c
new file mode 100644
index 00000000..25c19331
--- /dev/null
+++ b/src/isa-l/raid/raid_base.c
@@ -0,0 +1,147 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <limits.h>
+#include <stdint.h>
+
+#if __WORDSIZE == 64 || _WIN64 || __x86_64__
+# define notbit0 0xfefefefefefefefeULL
+# define bit7 0x8080808080808080ULL
+# define gf8poly 0x1d1d1d1d1d1d1d1dULL
+#else
+# define notbit0 0xfefefefeUL
+# define bit7 0x80808080UL
+# define gf8poly 0x1d1d1d1dUL
+#endif
+
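+/*
+ * Word-parallel GF(2^8) doubling used in pq_gen_base (a sketch of the SWAR
+ * trick): (q << 1) & notbit0 shifts every byte left by one without leaking
+ * into its neighbor, and ((q & bit7) << 1) - ((q & bit7) >> 7) widens each
+ * byte's top bit into a 0xff mask (e.g. 0x80 -> 0x100 - 0x01 = 0xff), so
+ * gf8poly (0x1d per byte) is applied only to the bytes that overflowed.
+ */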
+int pq_gen_base(int vects, int len, void **array)
+{
+ int i, j;
+ unsigned long p, q, s;
+ unsigned long **src = (unsigned long **)array;
+ int blocks = len / sizeof(long);
+
+ for (i = 0; i < blocks; i++) {
+ q = p = src[vects - 3][i];
+
+ for (j = vects - 4; j >= 0; j--) {
+ p ^= s = src[j][i];
+ q = s ^ (((q << 1) & notbit0) ^ // shift each byte
+ ((((q & bit7) << 1) - ((q & bit7) >> 7)) // mask out bytes
+ & gf8poly)); // apply poly
+ }
+
+ src[vects - 2][i] = p; // second to last pointer is p
+ src[vects - 1][i] = q; // last pointer is q
+ }
+ return 0;
+}
+
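+/* Byte-wise P/Q verify: returns 0 when both parities match; at the first
+   mismatching byte i it returns i|1 (P wrong) or i|2 (Q wrong), so any
+   non-zero result signals failure. */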
+int pq_check_base(int vects, int len, void **array)
+{
+ int i, j;
+ unsigned char p, q, s;
+ unsigned char **src = (unsigned char **)array;
+
+ for (i = 0; i < len; i++) {
+ q = p = src[vects - 3][i];
+
+ for (j = vects - 4; j >= 0; j--) {
+ s = src[j][i];
+ p ^= s;
+
+ // mult by GF{2}
+ q = s ^ ((q << 1) ^ ((q & 0x80) ? 0x1d : 0));
+ }
+
+ if (src[vects - 2][i] != p) // second to last pointer is p
+ return i | 1;
+ if (src[vects - 1][i] != q) // last pointer is q
+ return i | 2;
+ }
+ return 0;
+}
+
+int xor_gen_base(int vects, int len, void **array)
+{
+ int i, j;
+ unsigned char parity;
+ unsigned char **src = (unsigned char **)array;
+
+ for (i = 0; i < len; i++) {
+ parity = src[0][i];
+ for (j = 1; j < vects - 1; j++)
+ parity ^= src[j][i];
+
+ src[vects - 1][i] = parity; // last pointer is dest
+
+ }
+
+ return 0;
+}
+
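+/* Byte-wise xor verify: returns 0 when every column xors to zero, and a
+   non-zero value (the buffer length) on the first mismatch. */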
+int xor_check_base(int vects, int len, void **array)
+{
+ int i, j, fail = 0;
+
+ unsigned char parity;
+ unsigned char **src = (unsigned char **)array;
+
+ for (i = 0; i < len; i++) {
+ parity = 0;
+ for (j = 0; j < vects; j++)
+ parity ^= src[j][i];
+
+ if (parity != 0) {
+ fail = 1;
+ break;
+ }
+ }
+ if (fail && len > 0)
+ return len;
+ return fail;
+}
+
+struct slver {
+ unsigned short snum;
+ unsigned char ver;
+ unsigned char core;
+};
+
+struct slver pq_gen_base_slver_0001012a;
+struct slver pq_gen_base_slver = { 0x012a, 0x01, 0x00 };
+
+struct slver xor_gen_base_slver_0001012b;
+struct slver xor_gen_base_slver = { 0x012b, 0x01, 0x00 };
+
+struct slver pq_check_base_slver_0001012c;
+struct slver pq_check_base_slver = { 0x012c, 0x01, 0x00 };
+
+struct slver xor_check_base_slver_0001012d;
+struct slver xor_check_base_slver = { 0x012d, 0x01, 0x00 };
diff --git a/src/isa-l/raid/raid_base_aliases.c b/src/isa-l/raid/raid_base_aliases.c
new file mode 100644
index 00000000..f81792a0
--- /dev/null
+++ b/src/isa-l/raid/raid_base_aliases.c
@@ -0,0 +1,50 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "raid.h"
+
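+/* Fallback aliases for builds that do not link the assembly multibinary
+   dispatchers (e.g. non-x86 targets): each public entry point resolves
+   directly to its base C implementation. */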
+int pq_gen(int vects, int len, void **array)
+{
+ return pq_gen_base(vects, len, array);
+}
+
+int pq_check(int vects, int len, void **array)
+{
+ return pq_check_base(vects, len, array);
+}
+
+int xor_gen(int vects, int len, void **array)
+{
+ return xor_gen_base(vects, len, array);
+}
+
+int xor_check(int vects, int len, void **array)
+{
+ return xor_check_base(vects, len, array);
+}
diff --git a/src/isa-l/raid/raid_multibinary.asm b/src/isa-l/raid/raid_multibinary.asm
new file mode 100644
index 00000000..72ef5d40
--- /dev/null
+++ b/src/isa-l/raid/raid_multibinary.asm
@@ -0,0 +1,149 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+default rel
+[bits 64]
+
+extern pq_gen_base
+extern pq_gen_sse
+extern pq_gen_avx
+extern pq_gen_avx2
+
+extern xor_gen_base
+extern xor_gen_sse
+extern xor_gen_avx
+
+extern pq_check_base
+extern pq_check_sse
+
+extern xor_check_base
+extern xor_check_sse
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern xor_gen_avx512
+ extern pq_gen_avx512
+%endif
+
+mbin_interface xor_gen
+mbin_interface pq_gen
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 xor_gen, xor_gen_base, xor_gen_sse, xor_gen_avx, xor_gen_avx, xor_gen_avx512
+ mbin_dispatch_init6 pq_gen, pq_gen_base, pq_gen_sse, pq_gen_avx, pq_gen_avx2, pq_gen_avx512
+%else
+ mbin_dispatch_init5 xor_gen, xor_gen_base, xor_gen_sse, xor_gen_avx, xor_gen_avx
+ mbin_dispatch_init5 pq_gen, pq_gen_base, pq_gen_sse, pq_gen_avx, pq_gen_avx2
+%endif
+
+section .data
+
+xor_check_dispatched:
+ dq xor_check_mbinit
+pq_check_dispatched:
+ dq pq_check_mbinit
+
+section .text
+
+;;;;
+; pq_check multibinary function
+;;;;
+global pq_check:function
+pq_check_mbinit:
+ call pq_check_dispatch_init
+pq_check:
+ jmp qword [pq_check_dispatched]
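+;; Lazy binding: pq_check_dispatched starts out pointing at pq_check_mbinit,
+;; so the first call runs pq_check_dispatch_init (which stores the best
+;; implementation for this CPU) and then falls through into pq_check to take
+;; the updated indirect jump; every later call is just the jmp above.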
+
+pq_check_dispatch_init:
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ lea rsi, [pq_check_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ lea rbx, [pq_check_sse WRT_OPT]
+ cmovne rsi, rbx
+
+ mov [pq_check_dispatched], rsi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+ ret
+
+
+;;;;
+; xor_check multibinary function
+;;;;
+global xor_check:function
+xor_check_mbinit:
+ call xor_check_dispatch_init
+xor_check:
+ jmp qword [xor_check_dispatched]
+
+xor_check_dispatch_init:
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ lea rsi, [xor_check_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ lea rbx, [xor_check_sse WRT_OPT]
+ cmovne rsi, rbx
+
+ mov [xor_check_dispatched], rsi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+ ret
+
+;;; func core, ver, snum
+slversion xor_gen, 00, 03, 0126
+slversion xor_check, 00, 03, 0127
+slversion pq_gen, 00, 03, 0128
+slversion pq_check, 00, 03, 0129
diff --git a/src/isa-l/raid/raid_multibinary_i32.asm b/src/isa-l/raid/raid_multibinary_i32.asm
new file mode 100644
index 00000000..6da4c9dc
--- /dev/null
+++ b/src/isa-l/raid/raid_multibinary_i32.asm
@@ -0,0 +1,58 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+[bits 32]
+
+extern xor_gen_base
+extern xor_gen_sse
+extern pq_gen_base
+extern pq_gen_sse
+extern xor_check_base
+extern xor_check_sse
+extern pq_check_base
+extern pq_check_sse
+
+mbin_interface xor_gen
+mbin_interface pq_gen
+mbin_interface xor_check
+mbin_interface pq_check
+
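+;; 32-bit builds carry no AVX/AVX2 variants, so the SSE routine fills every
+;; upper dispatch slot below and the base C version remains the fallback.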
+mbin_dispatch_init5 xor_gen, xor_gen_base, xor_gen_sse, xor_gen_sse, xor_gen_sse
+mbin_dispatch_init5 pq_gen, pq_gen_base, pq_gen_sse, pq_gen_sse, pq_gen_sse
+mbin_dispatch_init5 xor_check, xor_check_base, xor_check_sse, xor_check_sse, xor_check_sse
+mbin_dispatch_init5 pq_check, pq_check_base, pq_check_sse, pq_check_sse, pq_check_sse
diff --git a/src/isa-l/raid/xor_check_sse.asm b/src/isa-l/raid/xor_check_sse.asm
new file mode 100644
index 00000000..65ae2f77
--- /dev/null
+++ b/src/isa-l/raid/xor_check_sse.asm
@@ -0,0 +1,285 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor check of N source vectors using SSE
+;;; int xor_check_sse(int vects, int len, void **array)
+
+;;; Checks that the xor sum of all N (vects) vectors in the array of
+;;; pointers (**array) is zero, i.e. that the last (parity) vector is
+;;; consistent with the sources. Returns 0 on pass, non-zero on failure.
+;;; Vectors must be aligned to 16 bytes. Length can be any value.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 rax
+ %define tmp2.b al
+ %define tmp3 arg4
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define return rax
+ %define tmp2 rax
+ %define tmp2.b al
+ %define PS 8
+ %define tmp r11
+ %define tmp3 r10
+ %define stack_size 2*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0 arg(0)
+ %define arg1 ecx
+ %define tmp2 eax
+ %define tmp2.b al
+ %define tmp3 edx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2 edi ; must save/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ mov arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp ;restore esp (ebp holds the frame pointer)
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos tmp3
+
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+%endif
+
+;;; Use non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+align 16
+global xor_check_sse:function
+func(xor_check_sse)
+ FUNC_SAVE
+%ifidn PS,8 ;64-bit code
+ sub vec, 1 ; Keep as offset to last source
+%else ;32-bit code
+ mov tmp, arg(0) ; Update vec length arg to last source
+ sub tmp, 1
+ mov arg(0), tmp
+%endif
+
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;Check alignment of length
+ jnz len_not_aligned
+
+
+len_aligned_128bytes:
+ sub len, 128
+ mov pos, 0
+ mov tmp, vec ;Preset to last vector
+
+loop128:
+ mov tmp2, [arg2+tmp*PS] ;Fetch last pointer in array
+ sub tmp, 1 ;Next vect
+ XLDR xmm0, [tmp2+pos] ;Start with end of array in last vector
+ XLDR xmm1, [tmp2+pos+16] ;Keep xor parity in xmm0-7
+ XLDR xmm2, [tmp2+pos+(2*16)]
+ XLDR xmm3, [tmp2+pos+(3*16)]
+ XLDR xmm4, [tmp2+pos+(4*16)]
+ XLDR xmm5, [tmp2+pos+(5*16)]
+ XLDR xmm6, [tmp2+pos+(6*16)]
+ XLDR xmm7, [tmp2+pos+(7*16)]
+
+next_vect:
+ mov ptr, [arg2+tmp*PS]
+ sub tmp, 1
+ xorpd xmm0, [ptr+pos] ;Get next vector (source)
+ xorpd xmm1, [ptr+pos+16]
+ xorpd xmm2, [ptr+pos+(2*16)]
+ xorpd xmm3, [ptr+pos+(3*16)]
+ xorpd xmm4, [ptr+pos+(4*16)]
+ xorpd xmm5, [ptr+pos+(5*16)]
+ xorpd xmm6, [ptr+pos+(6*16)]
+ xorpd xmm7, [ptr+pos+(7*16)]
+;;; prefetch [ptr+pos+(8*16)]
+ jge next_vect ;Loop for each vect
+
+ ;; End of vects, check that all parity regs = 0
+ mov tmp, vec ;Back to last vector
+ por xmm0, xmm1
+ por xmm0, xmm2
+ por xmm0, xmm3
+ por xmm0, xmm4
+ por xmm0, xmm5
+ por xmm0, xmm6
+ por xmm0, xmm7
+ ptest xmm0, xmm0
+ jnz return_fail
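+ ;; The por chain above folds all partial parities into xmm0; ptest
+ ;; (SSE4.1) sets ZF only if the full 128-bit result is zero, so any stray
+ ;; bit fails the check.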
+
+ add pos, 128
+ cmp pos, len
+ jle loop128
+
+return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+
+
+;;; Do one byte at a time for no alignment case
+
+xor_check_byte:
+ mov tmp, vec ;Preset to last vector
+
+loop_1byte:
+ mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
+ mov tmp2.b, [ptr+len-1] ;Get array n
+ sub tmp, 1
+nextvect_1byte:
+ mov ptr, [arg2+tmp*PS]
+ xor tmp2.b, [ptr+len-1]
+ sub tmp, 1
+ jge nextvect_1byte
+
+ mov tmp, vec ;Back to last vector
+ cmp tmp2.b, 0
+ jne return_fail
+ sub len, 1
+ test len, (8-1)
+ jnz loop_1byte
+
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;If not 0 and 128-byte aligned
+ jz len_aligned_128bytes ; then do aligned case. len = y * 128
+
+ ;; else we are 8-byte aligned so fall through to recheck
+
+
+ ;; Unaligned length cases
+len_not_aligned:
+ test len, (PS-1)
+ jne xor_check_byte
+ mov tmp3, len
+ and tmp3, (128-1) ;Do the unaligned bytes 4-8 at a time
+ mov tmp, vec ;Preset to last vector
+
+ ;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes
+loopN_bytes:
+ mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
+ mov tmp2, [ptr+len-PS] ;Get array n
+ sub tmp, 1
+nextvect_Nbytes:
+ mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
+ xor tmp2, [ptr+len-PS]
+ sub tmp, 1
+ jge nextvect_Nbytes ;Loop for each source
+
+ mov tmp, vec ;Back to last vector
+ cmp tmp2, 0
+ jne return_fail
+ sub len, PS
+ sub tmp3, PS
+ jg loopN_bytes
+
+ cmp len, 128 ;Now len is aligned to 128B
+ jge len_aligned_128bytes ;We can do the rest aligned
+
+ cmp len, 0
+ je return_pass
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+;;; func core, ver, snum
+slversion xor_check_sse, 00, 03, 0031
+
diff --git a/src/isa-l/raid/xor_check_test.c b/src/isa-l/raid/xor_check_test.c
new file mode 100644
index 00000000..dfb571a6
--- /dev/null
+++ b/src/isa-l/raid/xor_check_test.c
@@ -0,0 +1,280 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN 1024
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(int argc, char *argv[])
+{
+ int i, j, k, ret, fail = 0;
+ void *buffs[TEST_SOURCES + 1];
+ char c;
+ int serr, lerr;
+ char *tmp_buf[TEST_SOURCES + 1];
+
+ printf("Test xor_check_test %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ // Allocate the arrays
+ for (i = 0; i < TEST_SOURCES + 1; i++) {
+ void *buf;
+ if (posix_memalign(&buf, 16, TEST_LEN)) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Test of all zeros
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ xor_gen_base(TEST_SOURCES + 1, TEST_LEN, buffs);
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret != 0) {
+ fail++;
+ printf("\nfail zero test %d\n", ret);
+ }
+
+ ((char *)(buffs[0]))[TEST_LEN - 2] = 0x7; // corrupt buffer
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nfail corrupt buffer test %d\n", ret);
+ }
+ ((char *)(buffs[0]))[TEST_LEN - 2] = 0; // un-corrupt buffer
+
+ // Test corrupted buffer any location on all sources
+ for (j = 0; j < TEST_SOURCES + 1; j++) {
+ for (i = TEST_LEN - 1; i >= 0; i--) {
+ ((char *)buffs[j])[i] = 0x5; // corrupt buffer
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i);
+ return 1;
+ }
+ ((char *)buffs[j])[i] = 0; // un-corrupt buffer
+ }
+ putchar('.');
+ }
+
+ // Test rand1
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ xor_gen_base(TEST_SOURCES + 1, TEST_LEN, buffs);
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret != 0) {
+ fail++;
+ printf("fail first rand test %d\n", ret);
+ }
+
+ c = ((char *)(buffs[0]))[TEST_LEN - 2];
+ ((char *)(buffs[0]))[TEST_LEN - 2] = c ^ 0x1;
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nFail corrupt buffer test, passed when should have failed\n");
+ }
+ ((char *)(buffs[0]))[TEST_LEN - 2] = c; // un-corrupt buffer
+
+ // Test corrupted buffer any location on all sources w/ random data
+ for (j = 0; j < TEST_SOURCES + 1; j++) {
+ for (i = TEST_LEN - 1; i >= 0; i--) {
+ // Check it still passes
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret != 0) { // should pass
+ fail++;
+ printf
+ ("\nFail rand test with un-corrupted buffer j=%d, i=%d\n",
+ j, i);
+ return 1;
+ }
+ c = ((char *)buffs[j])[i];
+ ((char *)buffs[j])[i] = c ^ 1; // corrupt buffer
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret == 0) { // Check it now fails
+ fail++;
+ printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i);
+ return 1;
+ }
+ ((char *)buffs[j])[i] = c; // un-corrupt buffer
+ }
+ putchar('.');
+ }
+
+ // Test various number of sources, full length
+ for (j = 3; j <= TEST_SOURCES + 1; j++) {
+ // New random data
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ // Generate xor parity for this number of sources
+ xor_gen_base(j, TEST_LEN, buffs);
+
+ // Set errors up in each source and len position
+ for (i = 0; i < j; i++) {
+ for (k = 0; k < TEST_LEN; k++) {
+ // See if it still passes
+ ret = xor_check(j, TEST_LEN, buffs);
+ if (ret != 0) { // Should pass
+ printf("\nfail rand test %d sources\n", j);
+ fail++;
+ return 1;
+ }
+
+ c = ((char *)buffs[i])[k];
+ ((char *)buffs[i])[k] = c ^ 1; // corrupt buffer
+
+ ret = xor_check(j, TEST_LEN, buffs);
+ if (ret == 0) { // Should fail
+ printf
+ ("\nfail rand test corrupted buffer %d sources\n",
+ j);
+ fail++;
+ return 1;
+ }
+ ((char *)buffs[i])[k] = c; // un-corrupt buffer
+ }
+ }
+ putchar('.');
+ }
+
+ fflush(0);
+
+ // Test various number of sources and len
+ k = 1;
+ while (k <= TEST_LEN) {
+ for (j = 3; j <= TEST_SOURCES + 1; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], k);
+
+ // Generate xor parity for this number of sources
+ xor_gen_base(j, k, buffs);
+
+ // Inject errors at various source and len positions
+ for (lerr = 0; lerr < k; lerr += 10) {
+ for (serr = 0; serr < j; serr++) {
+
+ // See if it still passes
+ ret = xor_check(j, k, buffs);
+ if (ret != 0) { // Should pass
+ printf("\nfail rand test %d sources\n", j);
+ fail++;
+ return 1;
+ }
+
+ c = ((char *)buffs[serr])[lerr];
+ ((char *)buffs[serr])[lerr] = c ^ 1; // corrupt buffer
+
+ ret = xor_check(j, k, buffs);
+ if (ret == 0) { // Should fail
+ printf("\nfail rand test corrupted buffer "
+ "%d sources, len=%d, ret=%d\n", j, k,
+ ret);
+ fail++;
+ return 1;
+ }
+ ((char *)buffs[serr])[lerr] = c; // un-corrupt buffer
+ }
+ }
+ }
+ putchar('.');
+ fflush(0);
+ k += 1;
+ }
+
+ // Test at the end of buffer
+ for (i = 0; i < TEST_LEN; i += 32) {
+ for (j = 0; j < TEST_SOURCES + 1; j++) {
+ rand_buffer(buffs[j], TEST_LEN - i);
+ tmp_buf[j] = (char *)buffs[j] + i;
+ }
+
+ xor_gen_base(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+
+ // Test good data
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+ if (ret != 0) {
+ printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i);
+ fail++;
+ return 1;
+ }
+ // Test bad data
+ for (serr = 0; serr < TEST_SOURCES + 1; serr++) {
+ for (lerr = 0; lerr < (TEST_LEN - i); lerr++) {
+ c = tmp_buf[serr][lerr];
+ tmp_buf[serr][lerr] = c ^ 1;
+
+ ret =
+ xor_check(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+ if (ret == 0) {
+ printf("fail end test corrupted buffer - "
+ "offset: %d, len: %d, ret: %d\n", i,
+ TEST_LEN - i, ret);
+ fail++;
+ return 1;
+ }
+
+ tmp_buf[serr][lerr] = c;
+ }
+ }
+
+ putchar('.');
+ fflush(0);
+ }
+
+ if (fail == 0)
+ printf("Pass\n");
+
+ return fail;
+
+}
diff --git a/src/isa-l/raid/xor_example.c b/src/isa-l/raid/xor_example.c
new file mode 100644
index 00000000..d328c314
--- /dev/null
+++ b/src/isa-l/raid/xor_example.c
@@ -0,0 +1,70 @@
+/**********************************************************************
+ Copyright(c) 2011-2013 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN 16*1024
+
+int main(int argc, char *argv[])
+{
+ int i, j, should_pass, should_fail;
+ void *buffs[TEST_SOURCES + 1];
+
+ printf("XOR example\n");
+ for (i = 0; i < TEST_SOURCES + 1; i++) {
+ void *buf;
+ if (posix_memalign(&buf, 16, TEST_LEN)) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ printf("Make random data\n");
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ for (j = 0; j < TEST_LEN; j++)
+ ((char *)buffs[i])[j] = rand();
+
+ printf("Generate xor parity\n");
+ xor_gen_sse(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+ printf("Check parity: ");
+ should_pass = xor_check_sse(TEST_SOURCES + 1, TEST_LEN, buffs);
+ printf("%s\n", should_pass == 0 ? "Pass" : "Fail");
+
+ printf("Find corruption: ");
+ ((char *)buffs[TEST_SOURCES / 2])[TEST_LEN / 2] ^= 1; // flip one bit
+ should_fail = xor_check_sse(TEST_SOURCES + 1, TEST_LEN, buffs); //recheck
+ printf("%s\n", should_fail != 0 ? "Pass" : "Fail");
+
+ return 0;
+}
diff --git a/src/isa-l/raid/xor_gen_avx.asm b/src/isa-l/raid/xor_gen_avx.asm
new file mode 100644
index 00000000..536ab3e2
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_avx.asm
@@ -0,0 +1,228 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor of N source vectors using AVX
+;;; int xor_gen_avx(int vects, int len, void **array)
+
+;;; Generates xor parity vector from N (vects-1) sources in array of pointers
+;;; (**array). Last pointer is the dest.
+;;; Vectors must be aligned to 32 bytes. Length can be any value.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define func(x) x:
+ %define return rax
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define func(x) proc_frame x
+ %define return rax
+ %define stack_size 2*32 + 8 ;must be an odd multiple of 8
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqu [rsp + 0*32], ymm6
+ vmovdqu [rsp + 1*32], ymm7
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ vmovdqu ymm6, [rsp + 0*32]
+ vmovdqu ymm7, [rsp + 1*32]
+ add rsp, stack_size
+ %endmacro
+
+%endif ;output formats
+
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define tmp2 rax
+%define tmp2.b al
+%define pos tmp3
+%define PS 8
+
+;;; Use non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovdqa
+ %define XSTR vmovntdq
+%endif
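+;; Both branches above keep loads temporal (vmovdqa); likely intentional,
+;; since movntdqa acts non-temporally only on write-combining memory, so for
+;; ordinary write-back buffers the streaming store is the part that matters.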
+
+
+default rel
+[bits 64]
+
+section .text
+
+align 16
+global xor_gen_avx:function
+func(xor_gen_avx)
+
+ FUNC_SAVE
+ sub vec, 2 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;Check alignment of length
+ jnz len_not_aligned
+
+
+len_aligned_128bytes:
+ sub len, 128
+ mov pos, 0
+
+loop128:
+ mov tmp, vec ;Back to last vector
+ mov tmp2, [arg2+vec*PS] ;Fetch last pointer in array
+ sub tmp, 1 ;Next vect
+ XLDR ymm0, [tmp2+pos] ;Start with end of array in last vector
+ XLDR ymm1, [tmp2+pos+32] ;Keep xor parity in ymm0-3
+ XLDR ymm2, [tmp2+pos+(2*32)]
+ XLDR ymm3, [tmp2+pos+(3*32)]
+
+next_vect:
+ mov ptr, [arg2+tmp*PS]
+ sub tmp, 1
+ XLDR ymm4, [ptr+pos] ;Get next vector (source)
+ XLDR ymm5, [ptr+pos+32]
+ XLDR ymm6, [ptr+pos+(2*32)]
+ XLDR ymm7, [ptr+pos+(3*32)]
+ vxorpd ymm0, ymm0, ymm4 ;Add to xor parity
+ vxorpd ymm1, ymm1, ymm5
+ vxorpd ymm2, ymm2, ymm6
+ vxorpd ymm3, ymm3, ymm7
+ jge next_vect ;Loop for each source
+
+ mov ptr, [arg2+PS+vec*PS] ;Address of parity vector
+ XSTR [ptr+pos], ymm0 ;Write parity xor vector
+ XSTR [ptr+pos+(1*32)], ymm1
+ XSTR [ptr+pos+(2*32)], ymm2
+ XSTR [ptr+pos+(3*32)], ymm3
+ add pos, 128
+ cmp pos, len
+ jle loop128
+
+return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+
+;;; Do one byte at a time for no alignment case
+loop_1byte:
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+vec*PS] ;Fetch last pointer in array
+ mov tmp2.b, [ptr+len-1] ;Get array n
+ sub tmp, 1
+nextvect_1byte:
+ mov ptr, [arg2+tmp*PS]
+ xor tmp2.b, [ptr+len-1]
+ sub tmp, 1
+ jge nextvect_1byte
+
+ mov tmp, vec
+ add tmp, 1 ;Add back to point to last vec
+ mov ptr, [arg2+tmp*PS]
+ mov [ptr+len-1], tmp2.b ;Write parity
+ sub len, 1
+ test len, (PS-1)
+ jnz loop_1byte
+
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;If not 0 and 128-byte aligned
+ jz len_aligned_128bytes ; then do aligned case. len = y * 128
+
+ ;; else we are 8-byte aligned so fall through to recheck
+
+
+ ;; Unaligned length cases
+len_not_aligned:
+ test len, (PS-1)
+ jne loop_1byte
+ mov tmp3, len
+ and tmp3, (128-1) ;Do the unaligned bytes 8 at a time
+
+ ;; Run backwards 8 bytes at a time for (tmp3) bytes
+loop8_bytes:
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+vec*PS] ;Fetch last pointer in array
+ mov tmp2, [ptr+len-PS] ;Get array n
+ sub tmp, 1
+nextvect_8bytes:
+ mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
+ xor tmp2, [ptr+len-PS]
+ sub tmp, 1
+ jge nextvect_8bytes ;Loop for each source
+
+ mov tmp, vec
+ add tmp, 1 ;Add back to point to last vec
+ mov ptr, [arg2+tmp*PS]
+ mov [ptr+len-PS], tmp2 ;Write parity
+ sub len, PS
+ sub tmp3, PS
+ jg loop8_bytes
+
+ cmp len, 128 ;Now len is aligned to 128B
+ jge len_aligned_128bytes ;We can do the rest aligned
+
+ cmp len, 0
+ je return_pass
+
+return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+section .data
+
+;;; func core, ver, snum
+slversion xor_gen_avx, 02, 05, 0037
+
diff --git a/src/isa-l/raid/xor_gen_avx512.asm b/src/isa-l/raid/xor_gen_avx512.asm
new file mode 100644
index 00000000..6892f85c
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_avx512.asm
@@ -0,0 +1,217 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor of N source vectors using AVX512
+;;; int xor_gen_avx512(int vects, int len, void **array)
+
+;;; Generates xor parity vector from N (vects-1) sources in array of pointers
+;;; (**array). Last pointer is the dest.
+;;; Vectors must be aligned to 32 bytes. Length can be any value.
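+;;;
+;;; Minimal caller sketch (illustrative only; N and LEN are placeholders, and
+;;; callers normally go through the xor_gen() dispatcher rather than this
+;;; symbol directly). The last pointer in **array is the parity destination:
+;;;
+;;;	void *buf[N + 1];				/* N sources + parity */
+;;;	for (int i = 0; i <= N; i++)
+;;;		posix_memalign(&buf[i], 64, LEN);
+;;;	/* ... fill buf[0..N-1] with data ... */
+;;;	if (xor_gen_avx512(N + 1, LEN, buf) != 0)	/* 0 == success */
+;;;		/* error: fewer than 2 sources */;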
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define func(x) x:
+ %define return rax
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define func(x) proc_frame x
+ %define return rax
+ %define stack_size 2*16 + 8 ;must be an odd multiple of 8
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm7
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ vmovdqu xmm6, [rsp + 0*16]
+	vmovdqu	xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+%endif ;output formats
+
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define tmp2 rax
+%define tmp2.b al
+%define pos tmp3
+%define PS 8
+
+%define NO_NT_LDST		;force the unaligned, cached vmovdqu8 path below
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+
+default rel
+[bits 64]
+
+section .text
+
+align 16
+global xor_gen_avx512:function
+func(xor_gen_avx512)
+ FUNC_SAVE
+ sub vec, 2 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;Check alignment of length
+ jnz len_not_aligned
+
+len_aligned_128bytes:
+ sub len, 128
+ mov pos, 0
+
+loop128:
+ mov tmp, vec ;Back to last vector
+ mov tmp2, [arg2+vec*PS] ;Fetch last pointer in array
+ sub tmp, 1 ;Next vect
+ XLDR zmm0, [tmp2+pos] ;Start with end of array in last vector
+	XLDR zmm1, [tmp2+pos+64]	;Keep xor parity in zmm0-1
+
+next_vect:
+ mov ptr, [arg2+tmp*PS]
+ sub tmp, 1
+ XLDR zmm4, [ptr+pos] ;Get next vector (source)
+ XLDR zmm5, [ptr+pos+64]
+ vpxorq zmm0, zmm0, zmm4 ;Add to xor parity
+ vpxorq zmm1, zmm1, zmm5
+ jge next_vect ;Loop for each source
+
+ mov ptr, [arg2+PS+vec*PS] ;Address of parity vector
+ XSTR [ptr+pos], zmm0 ;Write parity xor vector
+ XSTR [ptr+pos+64], zmm1
+ add pos, 128
+ cmp pos, len
+ jle loop128
+
+return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+
+;;; Do one byte at a time for no alignment case
+loop_1byte:
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+vec*PS] ;Fetch last pointer in array
+ mov tmp2.b, [ptr+len-1] ;Get array n
+ sub tmp, 1
+nextvect_1byte:
+ mov ptr, [arg2+tmp*PS]
+ xor tmp2.b, [ptr+len-1]
+ sub tmp, 1
+ jge nextvect_1byte
+
+ mov tmp, vec
+ add tmp, 1 ;Add back to point to last vec
+ mov ptr, [arg2+tmp*PS]
+ mov [ptr+len-1], tmp2.b ;Write parity
+ sub len, 1
+ test len, (PS-1)
+ jnz loop_1byte
+
+ cmp len, 0
+ je return_pass
+	test len, (128-1)	;If not 0 and 128-byte aligned
+ jz len_aligned_128bytes ; then do aligned case. len = y * 128
+
+ ;; else we are 8-byte aligned so fall through to recheck
+
+
+ ;; Unaligned length cases
+len_not_aligned:
+ test len, (PS-1)
+ jne loop_1byte
+ mov tmp3, len
+ and tmp3, (128-1) ;Do the unaligned bytes 8 at a time
+
+ ;; Run backwards 8 bytes at a time for (tmp3) bytes
+loop8_bytes:
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+vec*PS] ;Fetch last pointer in array
+ mov tmp2, [ptr+len-PS] ;Get array n
+ sub tmp, 1
+nextvect_8bytes:
+ mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
+ xor tmp2, [ptr+len-PS]
+ sub tmp, 1
+ jge nextvect_8bytes ;Loop for each source
+
+ mov tmp, vec
+ add tmp, 1 ;Add back to point to last vec
+ mov ptr, [arg2+tmp*PS]
+ mov [ptr+len-PS], tmp2 ;Write parity
+ sub len, PS
+ sub tmp3, PS
+ jg loop8_bytes
+
+ cmp len, 128 ;Now len is aligned to 128B
+ jge len_aligned_128bytes ;We can do the rest aligned
+
+ cmp len, 0
+ je return_pass
+
+return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+%endif ; ifdef HAVE_AS_KNOWS_AVX512
diff --git a/src/isa-l/raid/xor_gen_perf.c b/src/isa-l/raid/xor_gen_perf.c
new file mode 100644
index 00000000..25b33cb6
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_perf.c
@@ -0,0 +1,98 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include<sys/time.h>
+#include "raid.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test: loop many times over the same small buffers
+# define TEST_SOURCES 10
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 2000000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define TEST_SOURCES 10
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN GT_L3_CACHE / TEST_SOURCES
+# define TEST_LOOPS 1000
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))
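+// TEST_MEM counts the bytes one xor_gen() call touches (all sources plus the
+// parity destination); perf_print() below uses it to report throughput.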
+
+int main(int argc, char *argv[])
+{
+ int i, ret, fail = 0;
+ void **buffs;
+ void *buff;
+ struct perf start, stop;
+
+ printf("Test xor_gen_perf\n");
+
+ ret = posix_memalign((void **)&buff, 8, sizeof(int *) * (TEST_SOURCES + 6));
+ if (ret) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs = buff;
+
+ // Allocate the arrays
+ for (i = 0; i < TEST_SOURCES + 1; i++) {
+ void *buf;
+ ret = posix_memalign(&buf, 64, TEST_LEN);
+ if (ret) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Setup data
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ // Warm up
+ xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+ perf_stop(&stop);
+ printf("xor_gen" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ return fail;
+}
diff --git a/src/isa-l/raid/xor_gen_sse.asm b/src/isa-l/raid/xor_gen_sse.asm
new file mode 100644
index 00000000..2fd6faeb
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_sse.asm
@@ -0,0 +1,284 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor of N source vectors using SSE
+;;; int xor_gen_sse(int vects, int len, void **array)
+
+;;; Generates xor parity vector from N (vects-1) sources in array of pointers
+;;; (**array). Last pointer is the dest.
+;;; Vectors must be aligned to 16 bytes. Length can be any value.
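+;;;
+;;; Byte-wise reference for the computed result (a sketch; the routine itself
+;;; works in 128-byte vector strides):
+;;;
+;;;	unsigned char **a = (unsigned char **)array, p;
+;;;	for (j = 0; j < len; j++) {
+;;;		p = 0;
+;;;		for (i = 0; i < vects - 1; i++)
+;;;			p ^= a[i][j];		/* xor all sources */
+;;;		a[vects - 1][j] = p;		/* last pointer = parity dest */
+;;;	}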
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 rax
+ %define tmp2.b al
+ %define tmp3 arg4
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define return rax
+ %define tmp2 rax
+ %define tmp2.b al
+ %define PS 8
+ %define tmp r11
+ %define tmp3 r10
+ %define stack_size 2*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0 arg(0)
+ %define arg1 ecx
+ %define tmp2 eax
+ %define tmp2.b al
+ %define tmp3 edx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+	%define arg2  edi	; must save/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ mov arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp ;if has frame pointer
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos tmp3
+
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+%endif
+
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+align 16
+global xor_gen_sse:function
+func(xor_gen_sse)
+ FUNC_SAVE
+%ifidn PS,8 ;64-bit code
+ sub vec, 2 ; Keep as offset to last source
+%else ;32-bit code
+ mov tmp, arg(0) ; Update vec length arg to last source
+ sub tmp, 2
+ mov arg(0), tmp
+%endif
+
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;Check alignment of length
+ jnz len_not_aligned
+
+
+len_aligned_128bytes:
+ sub len, 128
+ mov pos, 0
+ mov tmp, vec ;Preset to last vector
+
+loop128:
+ mov tmp2, [arg2+tmp*PS] ;Fetch last pointer in array
+ sub tmp, 1 ;Next vect
+ XLDR xmm0, [tmp2+pos] ;Start with end of array in last vector
+ XLDR xmm1, [tmp2+pos+16] ;Keep xor parity in xmm0-7
+ XLDR xmm2, [tmp2+pos+(2*16)]
+ XLDR xmm3, [tmp2+pos+(3*16)]
+ XLDR xmm4, [tmp2+pos+(4*16)]
+ XLDR xmm5, [tmp2+pos+(5*16)]
+ XLDR xmm6, [tmp2+pos+(6*16)]
+ XLDR xmm7, [tmp2+pos+(7*16)]
+
+next_vect:
+ mov ptr, [arg2+tmp*PS]
+ sub tmp, 1
+ xorpd xmm0, [ptr+pos] ;Get next vector (source)
+ xorpd xmm1, [ptr+pos+16]
+ xorpd xmm2, [ptr+pos+(2*16)]
+ xorpd xmm3, [ptr+pos+(3*16)]
+ xorpd xmm4, [ptr+pos+(4*16)]
+ xorpd xmm5, [ptr+pos+(5*16)]
+ xorpd xmm6, [ptr+pos+(6*16)]
+ xorpd xmm7, [ptr+pos+(7*16)]
+;;; prefetch [ptr+pos+(8*16)]
+ jge next_vect ;Loop for each vect
+
+
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+PS+tmp*PS] ;Address of parity vector
+ XSTR [ptr+pos], xmm0 ;Write parity xor vector
+ XSTR [ptr+pos+(1*16)], xmm1
+ XSTR [ptr+pos+(2*16)], xmm2
+ XSTR [ptr+pos+(3*16)], xmm3
+ XSTR [ptr+pos+(4*16)], xmm4
+ XSTR [ptr+pos+(5*16)], xmm5
+ XSTR [ptr+pos+(6*16)], xmm6
+ XSTR [ptr+pos+(7*16)], xmm7
+ add pos, 128
+ cmp pos, len
+ jle loop128
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+
+
+;;; Do one byte at a time for no alignment case
+
+xor_gen_byte:
+ mov tmp, vec ;Preset to last vector
+
+loop_1byte:
+ mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
+ mov tmp2.b, [ptr+len-1] ;Get array n
+ sub tmp, 1
+nextvect_1byte:
+ mov ptr, [arg2+tmp*PS]
+ xor tmp2.b, [ptr+len-1]
+ sub tmp, 1
+ jge nextvect_1byte
+
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+PS+tmp*PS] ;Get last vec
+ mov [ptr+len-1], tmp2.b ;Write parity
+ sub len, 1
+ test len, (8-1)
+ jnz loop_1byte
+
+ cmp len, 0
+ je return_pass
+	test len, (128-1)	;If not 0 and 128-byte aligned
+ jz len_aligned_128bytes ; then do aligned case. len = y * 128
+
+ ;; else we are 8-byte aligned so fall through to recheck
+
+
+ ;; Unaligned length cases
+len_not_aligned:
+ test len, (PS-1)
+ jne xor_gen_byte
+ mov tmp3, len
+ and tmp3, (128-1) ;Do the unaligned bytes 4-8 at a time
+ mov tmp, vec ;Preset to last vector
+
+ ;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes
+loopN_bytes:
+ mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
+ mov tmp2, [ptr+len-PS] ;Get array n
+ sub tmp, 1
+nextvect_Nbytes:
+ mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
+ xor tmp2, [ptr+len-PS]
+ sub tmp, 1
+ jge nextvect_Nbytes ;Loop for each source
+
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+PS+tmp*PS] ;Get last vec
+ mov [ptr+len-PS], tmp2 ;Write parity
+ sub len, PS
+ sub tmp3, PS
+ jg loopN_bytes
+
+ cmp len, 128 ;Now len is aligned to 128B
+ jge len_aligned_128bytes ;We can do the rest aligned
+
+ cmp len, 0
+ je return_pass
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+;;; func core, ver, snum
+slversion xor_gen_sse, 00, 0c, 0030
+
diff --git a/src/isa-l/raid/xor_gen_test.c b/src/isa-l/raid/xor_gen_test.c
new file mode 100644
index 00000000..f158f94c
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_test.c
@@ -0,0 +1,165 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN 1024
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(int argc, char *argv[])
+{
+ int i, j, k, ret, fail = 0;
+ void *buffs[TEST_SOURCES + 1];
+ char *tmp_buf[TEST_SOURCES + 1];
+
+ printf("Test xor_gen_test ");
+
+ srand(TEST_SEED);
+
+ // Allocate the arrays
+ for (i = 0; i < TEST_SOURCES + 1; i++) {
+ void *buf;
+ ret = posix_memalign(&buf, 32, TEST_LEN);
+ if (ret) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Test of all zeros
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+ for (i = 0; i < TEST_LEN; i++) {
+ if (((char *)buffs[TEST_SOURCES])[i] != 0)
+ fail++;
+ }
+
+ if (fail > 0) {
+ printf("fail zero test");
+ return 1;
+ } else
+ putchar('.');
+
+ // Test rand1
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+ fail |= xor_check_base(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+ if (fail > 0) {
+ printf("fail rand test %d\n", fail);
+ return 1;
+ } else
+ putchar('.');
+
+ // Test various number of sources
+ for (j = 3; j <= TEST_SOURCES + 1; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ xor_gen(j, TEST_LEN, buffs);
+ fail |= xor_check_base(j, TEST_LEN, buffs);
+
+ if (fail > 0) {
+ printf("fail rand test %d sources\n", j);
+ return 1;
+ } else
+ putchar('.');
+ }
+
+ fflush(0);
+
+ // Test various number of sources and len
+ k = 0;
+ while (k <= TEST_LEN) {
+ for (j = 3; j <= TEST_SOURCES + 1; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], k);
+
+ xor_gen(j, k, buffs);
+ fail |= xor_check_base(j, k, buffs);
+
+ if (fail > 0) {
+ printf("fail rand test %d sources, len=%d, ret=%d\n", j, k,
+ fail);
+ return 1;
+ }
+ }
+ putchar('.');
+ k += 1;
+ }
+
+ // Test at the end of buffer
+ for (i = 0; i < TEST_LEN; i += 32) {
+ for (j = 0; j < TEST_SOURCES + 1; j++) {
+ rand_buffer((unsigned char *)buffs[j] + i, TEST_LEN - i);
+ tmp_buf[j] = (char *)buffs[j] + i;
+ }
+
+ xor_gen(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+ fail |= xor_check_base(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+
+ if (fail > 0) {
+ printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i);
+ return 1;
+ }
+
+ putchar('.');
+ fflush(0);
+ }
+
+ if (!fail)
+ printf(" done: Pass\n");
+
+ return fail;
+}