path: root/src/isa-l/raid
Diffstat (limited to 'src/isa-l/raid')
-rw-r--r--  src/isa-l/raid/Makefile.am               |  64
-rw-r--r--  src/isa-l/raid/pq_check_sse.asm          | 277
-rw-r--r--  src/isa-l/raid/pq_check_sse_i32.asm      | 282
-rw-r--r--  src/isa-l/raid/pq_check_test.c           | 304
-rw-r--r--  src/isa-l/raid/pq_gen_avx.asm            | 254
-rw-r--r--  src/isa-l/raid/pq_gen_avx2.asm           | 256
-rw-r--r--  src/isa-l/raid/pq_gen_avx512.asm         | 235
-rw-r--r--  src/isa-l/raid/pq_gen_perf.c             |  97
-rw-r--r--  src/isa-l/raid/pq_gen_sse.asm            | 258
-rw-r--r--  src/isa-l/raid/pq_gen_sse_i32.asm        | 264
-rw-r--r--  src/isa-l/raid/pq_gen_test.c             | 194
-rw-r--r--  src/isa-l/raid/raid_base.c               | 147
-rw-r--r--  src/isa-l/raid/raid_base_aliases.c       |  50
-rw-r--r--  src/isa-l/raid/raid_multibinary.asm      | 149
-rw-r--r--  src/isa-l/raid/raid_multibinary_i32.asm  |  58
-rw-r--r--  src/isa-l/raid/xor_check_sse.asm         | 285
-rw-r--r--  src/isa-l/raid/xor_check_test.c          | 280
-rw-r--r--  src/isa-l/raid/xor_example.c             |  70
-rw-r--r--  src/isa-l/raid/xor_gen_avx.asm           | 228
-rw-r--r--  src/isa-l/raid/xor_gen_avx512.asm        | 217
-rw-r--r--  src/isa-l/raid/xor_gen_perf.c            |  98
-rw-r--r--  src/isa-l/raid/xor_gen_sse.asm           | 284
-rw-r--r--  src/isa-l/raid/xor_gen_test.c            | 165
23 files changed, 4516 insertions, 0 deletions
diff --git a/src/isa-l/raid/Makefile.am b/src/isa-l/raid/Makefile.am
new file mode 100644
index 00000000..95490e2c
--- /dev/null
+++ b/src/isa-l/raid/Makefile.am
@@ -0,0 +1,64 @@
+########################################################################
+# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc += raid/raid_base.c
+
+lsrc_base_aliases += raid/raid_base_aliases.c
+
+lsrc_x86_64 += \
+ raid/xor_gen_sse.asm \
+ raid/pq_gen_sse.asm \
+ raid/xor_check_sse.asm \
+ raid/pq_check_sse.asm \
+ raid/pq_gen_avx.asm \
+ raid/xor_gen_avx.asm \
+ raid/pq_gen_avx2.asm \
+ raid/xor_gen_avx512.asm \
+ raid/pq_gen_avx512.asm \
+ raid/raid_multibinary.asm
+
+lsrc_x86_32 += \
+ raid/xor_gen_sse.asm \
+ raid/pq_gen_sse_i32.asm \
+ raid/xor_check_sse.asm \
+ raid/pq_check_sse_i32.asm \
+ raid/raid_multibinary_i32.asm
+
+
+extern_hdrs += include/raid.h
+
+other_src += include/test.h include/types.h
+
+check_tests += raid/xor_gen_test raid/pq_gen_test raid/xor_check_test raid/pq_check_test
+
+perf_tests += raid/xor_gen_perf raid/pq_gen_perf
+
+examples += raid/xor_example
+
+lsrc32 += xor_gen_sse.asm pq_gen_sse_i32.asm xor_check_sse.asm pq_check_sse_i32.asm raid_base.c
diff --git a/src/isa-l/raid/pq_check_sse.asm b/src/isa-l/raid/pq_check_sse.asm
new file mode 100644
index 00000000..96a8177a
--- /dev/null
+++ b/src/isa-l/raid/pq_check_sse.asm
@@ -0,0 +1,277 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq check of N source vectors using SSE4.1
+;;; int pq_check_sse(int vects, int len, void **array)
+
+;;; Checks N (vects-2) source vectors against their P+Q parity in an array of
+;;; pointers (**array). Last two pointers are the P and Q parity vectors to
+;;; verify. Vectors must be aligned to 16 bytes. Length must be a multiple of 16 bytes.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 7*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm15, 6*16
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+	movdqa	xmm15, [rsp + 6*16]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos return
+
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+%define xp3 xmm8
+%define xq3 xmm9
+%define xtmp3 xmm10
+%define xs3 xmm11
+
+%define xpoly xmm15
+
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+	%define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_check_sse:function
+func(pq_check_sse)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+ movdqa xpoly, [poly]
+ cmp len, 48
+ jl loop16
+
+len_aligned_32bytes:
+ sub len, 48 ;Do end of vec first and run backward
+
+loop48:
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src
+ XLDR xp2, [ptr+pos+16] ;Initialize xp2 with P2 src + 16B ahead
+	XLDR	xp3, [ptr+pos+32]	;Initialize xp3 with P3 src + 32B ahead
+ pxor xq1, xq1 ;q1 = 0
+ pxor xq2, xq2 ;q2 = 0
+ pxor xq3, xq3 ;q3 = 0
+
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+ XLDR xs3, [ptr+pos+32] ;Preload last vector (source)
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ pxor xp1, xs1 ; p1 ^= s1
+ pxor xp2, xs2 ; p2 ^= s2
+	pxor	xp3, xs3	; p3 ^= s3
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xq2, xs2 ; q2 ^= s2
+ pxor xq3, xs3 ; q3 ^= s3
+ pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0
+ pxor xtmp2, xtmp2 ; xtmp2 = 0
+ pxor xtmp3, xtmp3 ; xtmp3 = 0
+ pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp3, xq3 ; xtmp3 = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp1 = poly or 0x00
+ pand xtmp2, xpoly ; xtmp2 = poly or 0x00
+ pand xtmp3, xpoly ; xtmp3 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ XLDR xs3, [ptr+pos+32] ; Get next vector (source data3)
+ paddb xq1, xq1 ; q1 = q1<<1
+ paddb xq2, xq2 ; q2 = q2<<1
+ paddb xq3, xq3 ; q3 = q3<<1
+ pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ pxor xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
+
+ pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ pxor xq1, xs1 ;q1 ^= 1 * s1[0]
+ pxor xp2, xs2 ;p2 ^= s2[0]
+ pxor xq2, xs2 ;q2 ^= 1 * s2[0]
+ pxor xp3, xs3 ;p3 ^= s3[0]
+ pxor xq3, xs3 ;q3 ^= 1 * s3[0]
+
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ XLDR xtmp1, [tmp+pos] ;re-init xq1 with Q1 src
+ XLDR xtmp2, [tmp+pos+16] ;re-init xq2 with Q2 src + 16B ahead
+	XLDR	xtmp3, [tmp+pos+32]	;re-init xq3 with Q3 src + 32B ahead
+
+ pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved
+ pxor xq2, xtmp2
+ pxor xq3, xtmp3
+
+ por xp1, xq1 ;Confirm that all P&Q parity are 0
+ por xp1, xp2
+ por xp1, xq2
+ por xp1, xp3
+ por xp1, xq3
+ ptest xp1, xp1
+ jnz return_fail
+ add pos, 48
+ cmp pos, len
+ jle loop48
+
+
+ ;; ------------------------------
+ ;; Do last 16 or 32 Bytes remaining
+ add len, 48
+ cmp pos, len
+ je return_pass
+
+loop16:
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src
+ pxor xq1, xq1 ;q = 0
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ pxor xq1, xs1 ; q ^= s
+ pxor xtmp1, xtmp1 ; xtmp = 0
+ pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp = poly or 0x00
+ pxor xp1, xs1 ; p ^= s
+ paddb xq1, xq1 ; q = q<<1
+ pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0
+
+ pxor xp1, xs1 ;p ^= s[0] - last source is already loaded
+ pxor xq1, xs1 ;q ^= 1 * s[0]
+
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ XLDR xtmp1, [tmp+pos] ;re-init tmp with Q1 src
+ pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved
+
+ por xp1, xq1 ;Confirm that all P&Q parity are = 0
+ ptest xp1, xp1
+ jnz return_fail
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_check_sse, 00, 06, 0033
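
The kernel above is normally reached through the pq_check() dispatcher declared in raid.h; the tests later in this patch call it the same way. A minimal caller sketch, with buffer count and fill values chosen only for illustration:

    /* Generate P+Q over four data buffers with pq_gen(), then verify with
     * pq_check(). 16-byte alignment and length satisfy the SSE kernels. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "raid.h"

    #define SOURCES 4
    #define LEN     1024                        /* multiple of 16 */

    int main(void)
    {
            void *vects[SOURCES + 2];           /* data[0..3], then P, then Q */
            int i;

            for (i = 0; i < SOURCES + 2; i++)
                    if (posix_memalign(&vects[i], 16, LEN))
                            return 1;
            for (i = 0; i < SOURCES; i++)
                    memset(vects[i], 0x5a + i, LEN);  /* arbitrary sample data */

            pq_gen(SOURCES + 2, LEN, vects);    /* fills vects[4] (P), vects[5] (Q) */
            printf("pq_check: %d (0 means parity ok)\n",
                   pq_check(SOURCES + 2, LEN, vects));
            return 0;
    }
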
diff --git a/src/isa-l/raid/pq_check_sse_i32.asm b/src/isa-l/raid/pq_check_sse_i32.asm
new file mode 100644
index 00000000..6c5915f9
--- /dev/null
+++ b/src/isa-l/raid/pq_check_sse_i32.asm
@@ -0,0 +1,282 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq check of N source vectors using SSE4.1
+;;; int pq_check_sse(int vects, int len, void **array)
+
+;;; Checks N (vects-2) source vectors against their P+Q parity in an array of
+;;; pointers (**array). Last two pointers are the P and Q parity vectors to
+;;; verify. Vectors must be aligned to 16 bytes. Length must be a multiple of 16 bytes.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define return rax
+ %define PS 8
+ %define tmp r11
+ %define stack_size 2*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0 edx
+ %define arg1 ecx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2 edi ; must sav/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg0, arg(0)
+ mov arg1, arg(1)
+ mov arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+	mov	esp, ebp	;restore stack pointer from frame pointer
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos return
+
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+ %define xpoly xmm15
+%elifidn PS,4 ; 32-bit code
+ %define xpoly [poly]
+%endif
+
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+align 16
+global pq_check_sse:function
+func(pq_check_sse)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+%ifidn PS,8
+ movdqa xpoly, [poly] ;For 64-bit, load poly into high xmm reg
+%endif
+ cmp len, 32
+ jl loop16
+
+len_aligned_32bytes:
+ sub len, 32 ;Do end of vec first and run backward
+
+loop32:
+ mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src
+ XLDR xp2, [ptr+pos+16] ;Initialize xp2 with P2 src + 16B ahead
+ pxor xq1, xq1 ;q1 = 0
+ pxor xq2, xq2 ;q2 = 0
+
+ mov ptr, [arg2+vec*PS] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*PS] ; get pointer to next vect
+ pxor xp1, xs1 ; p1 ^= s1
+ pxor xp2, xs2 ; p2 ^= s2
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xq2, xs2 ; q2 ^= s2
+ pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0
+ pxor xtmp2, xtmp2 ; xtmp2 = 0
+ pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp1 = poly or 0x00
+ pand xtmp2, xpoly ; xtmp2 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ paddb xq1, xq1 ; q1 = q1<<1
+ paddb xq2, xq2 ; q2 = q2<<1
+ pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
+
+ pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ pxor xq1, xs1 ;q1 ^= 1 * s1[0]
+ pxor xp2, xs2 ;p2 ^= s2[0]
+ pxor xq2, xs2 ;q2 ^= 1 * s2[0]
+
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ XLDR xtmp1, [tmp+pos] ;re-init xq1 with Q1 src
+ XLDR xtmp2, [tmp+pos+16] ;re-init xq2 with Q2 src + 16B ahead
+
+ pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved
+ pxor xq2, xtmp2
+
+ por xp1, xq1 ;Confirm that all P&Q parity are 0
+ por xp1, xp2
+ por xp1, xq2
+ ptest xp1, xp1
+ jnz return_fail
+ add pos, 32
+ cmp pos, len
+ jle loop32
+
+
+ ;; ------------------------------
+ ;; Do last 16 Bytes remaining
+ add len, 32
+ cmp pos, len
+ je return_pass
+
+loop16:
+ mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src
+ pxor xq1, xq1 ;q = 0
+ mov ptr, [arg2+vec*PS] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*PS] ; get pointer to next vect
+ pxor xq1, xs1 ; q ^= s
+ pxor xtmp1, xtmp1 ; xtmp = 0
+ pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp = poly or 0x00
+ pxor xp1, xs1 ; p ^= s
+ paddb xq1, xq1 ; q = q<<1
+ pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0
+
+ pxor xp1, xs1 ;p ^= s[0] - last source is already loaded
+ pxor xq1, xs1 ;q ^= 1 * s[0]
+
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ XLDR xtmp1, [tmp+pos] ;re-init tmp with Q1 src
+ pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved
+
+ por xp1, xq1 ;Confirm that all P&Q parity are = 0
+ ptest xp1, xp1
+ jnz return_fail
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_check_sse, 00, 06, 0033
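
Both check kernels update Q with the same four-instruction idiom (pcmpgtb, pand, paddb, pxor): a branch-free multiply-by-2 in GF(2^8) across 16 bytes at once, using the 0x1d constant from the poly table. One byte of that step in scalar C, for reference:

    #include <stdint.h>

    /* pcmpgtb builds a 0xff/0x00 mask from each byte's sign bit, pand keeps
     * 0x1d only where bit 7 was set, paddb doubles, pxor reduces. */
    static uint8_t gf_mul2(uint8_t q)
    {
            uint8_t mask = (q & 0x80) ? 0xff : 0x00;
            return (uint8_t)(q << 1) ^ (mask & 0x1d);
    }
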
diff --git a/src/isa-l/raid/pq_check_test.c b/src/isa-l/raid/pq_check_test.c
new file mode 100644
index 00000000..8b6d0a1f
--- /dev/null
+++ b/src/isa-l/raid/pq_check_test.c
@@ -0,0 +1,304 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN 1024
+#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+int ref_multi_pq(int vects, int len, void **array)
+{
+ int i, j;
+ unsigned char p, q, s;
+ unsigned char **src = (unsigned char **)array;
+
+ for (i = 0; i < len; i++) {
+ q = p = src[vects - 3][i];
+
+ for (j = vects - 4; j >= 0; j--) {
+ p ^= s = src[j][i];
+			q = s ^ ((q << 1) ^ ((q & 0x80) ? 0x1d : 0));	// multiply q by 2 in GF(2^8), poly 0x11d
+ }
+
+ src[vects - 2][i] = p; // second to last pointer is p
+ src[vects - 1][i] = q; // last pointer is q
+ }
+ return 0;
+}
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(int argc, char *argv[])
+{
+ int i, j, k, ret, fail = 0;
+ void *buffs[TEST_SOURCES + 2];
+ char c;
+ char *tmp_buf[TEST_SOURCES + 2];
+ int serr, lerr;
+
+ printf("Test pq_check_test %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ // Allocate the arrays
+ for (i = 0; i < TEST_SOURCES + 2; i++) {
+ void *buf;
+ if (posix_memalign(&buf, 16, TEST_LEN)) {
+			printf("alloc error: Fail\n");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Test of all zeros
+ for (i = 0; i < TEST_SOURCES + 2; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ ref_multi_pq(TEST_SOURCES + 2, TEST_LEN, buffs);
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret != 0) {
+ fail++;
+ printf("\nfail zero test %d\n", ret);
+ }
+
+ ((char *)(buffs[0]))[TEST_LEN - 2] = 0x7; // corrupt buffer
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nfail corrupt buffer test %d\n", ret);
+ }
+ ((char *)(buffs[0]))[TEST_LEN - 2] = 0; // un-corrupt buffer
+
+ // Test corrupted buffer any location on all sources
+ for (j = 0; j < TEST_SOURCES + 2; j++) {
+ for (i = TEST_LEN - 1; i >= 0; i--) {
+ ((char *)buffs[j])[i] = 0x5; // corrupt buffer
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nfail corrupt zero buffer test j=%d, i=%d\n", j, i);
+ return 1;
+ }
+ ((char *)buffs[j])[i] = 0; // un-corrupt buffer
+ }
+ putchar('.');
+ }
+
+ // Test rand1
+ for (i = 0; i < TEST_SOURCES + 2; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ ref_multi_pq(TEST_SOURCES + 2, TEST_LEN, buffs);
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret != 0) {
+ fail++;
+ printf("fail first rand test %d\n", ret);
+ }
+
+ c = ((char *)(buffs[0]))[TEST_LEN - 2];
+ ((char *)(buffs[0]))[TEST_LEN - 2] = c ^ 0x1;
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nFail corrupt buffer test, passed when should have failed\n");
+ }
+ ((char *)(buffs[0]))[TEST_LEN - 2] = c; // un-corrupt buffer
+
+ // Test corrupted buffer any location on all sources w/ random data
+ for (j = 0; j < TEST_SOURCES + 2; j++) {
+ for (i = TEST_LEN - 1; i >= 0; i--) {
+ // Check it still passes
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret != 0) { // should pass
+ fail++;
+ printf
+ ("\nFail rand test with un-corrupted buffer j=%d, i=%d\n",
+ j, i);
+ return 1;
+ }
+ c = ((char *)buffs[j])[i];
+ ((char *)buffs[j])[i] = c ^ 1; // corrupt buffer
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret == 0) { // Check it now fails
+ fail++;
+ printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i);
+ return 1;
+ }
+ ((char *)buffs[j])[i] = c; // un-corrupt buffer
+ }
+ putchar('.');
+ }
+
+ // Test various number of sources, full length
+ for (j = 4; j <= TEST_SOURCES + 2; j++) {
+ // New random data
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ // Generate p,q parity for this number of sources
+ ref_multi_pq(j, TEST_LEN, buffs);
+
+ // Set errors up in each source and len position
+ for (i = 0; i < j; i++) {
+ for (k = 0; k < TEST_LEN; k++) {
+ // See if it still passes
+ ret = pq_check(j, TEST_LEN, buffs);
+ if (ret != 0) { // Should pass
+ printf("\nfail rand fixed len test %d sources\n", j);
+ fail++;
+ return 1;
+ }
+
+ c = ((char *)buffs[i])[k];
+ ((char *)buffs[i])[k] = c ^ 1; // corrupt buffer
+
+ ret = pq_check(j, TEST_LEN, buffs);
+ if (ret == 0) { // Should fail
+ printf
+ ("\nfail rand fixed len test corrupted buffer %d sources\n",
+ j);
+ fail++;
+ return 1;
+ }
+ ((char *)buffs[i])[k] = c; // un-corrupt buffer
+ }
+ }
+ putchar('.');
+ }
+
+ fflush(0);
+
+ // Test various number of sources and len
+ k = 16;
+ while (k <= TEST_LEN) {
+ char *tmp;
+ for (j = 4; j <= TEST_SOURCES + 2; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], k);
+
+ // Generate p,q parity for this number of sources
+ ref_multi_pq(j, k, buffs);
+
+ // Inject errors at various source and len positions
+ for (lerr = 0; lerr < k; lerr++) {
+ for (serr = 0; serr < j; serr++) {
+ // See if it still passes
+ ret = pq_check(j, k, buffs);
+ if (ret != 0) { // Should pass
+ printf
+ ("\nfail rand var src, len test %d sources, len=%d\n",
+ j, k);
+ fail++;
+ return 1;
+ }
+
+ tmp = (char *)buffs[serr];
+ c = tmp[lerr];
+ ((char *)buffs[serr])[lerr] = c ^ 1; // corrupt buffer
+
+ ret = pq_check(j, k, buffs);
+ if (ret == 0) { // Should fail
+ printf
+ ("\nfail rand var src, len test corrupted buffer "
+ "%d sources, len=%d, ret=%d\n", j, k,
+ ret);
+ fail++;
+ return 1;
+ }
+ ((char *)buffs[serr])[lerr] = c; // un-corrupt buffer
+ }
+ }
+ putchar('.');
+ fflush(0);
+ }
+ k += 16;
+ }
+
+ // Test at the end of buffer
+ for (i = 0; i < TEST_LEN; i += 16) {
+ for (j = 0; j < TEST_SOURCES + 2; j++) {
+ rand_buffer(buffs[j], TEST_LEN - i);
+ tmp_buf[j] = (char *)buffs[j] + i;
+ }
+
+ pq_gen_base(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf);
+
+ // Test good data
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf);
+ if (ret != 0) {
+ printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i);
+ fail++;
+ return 1;
+ }
+ // Test bad data
+ for (serr = 0; serr < TEST_SOURCES + 2; serr++) {
+ for (lerr = 0; lerr < (TEST_LEN - i); lerr++) {
+ c = tmp_buf[serr][lerr];
+ tmp_buf[serr][lerr] = c ^ 1;
+
+ ret =
+ pq_check(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf);
+ if (ret == 0) {
+ printf("fail end test corrupted buffer - "
+ "offset: %d, len: %d, ret: %d\n", i,
+ TEST_LEN - i, ret);
+ fail++;
+ return 1;
+ }
+
+ tmp_buf[serr][lerr] = c;
+ }
+ }
+
+ putchar('.');
+ fflush(0);
+ }
+
+ if (fail == 0)
+ printf("Pass\n");
+
+ return fail;
+
+}
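
ref_multi_pq() above computes Q with Horner's rule: it starts from the highest-indexed source and folds in one multiply-by-2 per step, so source j ends up with coefficient 2^j in GF(2^8). The equivalent closed form, with gf8_mul and ref_q_byte as hypothetical helpers for illustration only:

    /* Q[i] = XOR over j of (2^j) * src[j][i] in GF(2^8) with poly 0x11d. */
    static unsigned char gf8_mul(unsigned char a, unsigned char b)
    {
            unsigned char r = 0;
            while (b) {
                    if (b & 1)
                            r ^= a;         /* accumulate a * current bit of b */
                    a = (unsigned char)(a << 1) ^ ((a & 0x80) ? 0x1d : 0);
                    b >>= 1;
            }
            return r;
    }

    static unsigned char ref_q_byte(unsigned char **src, int nsrc, int i)
    {
            unsigned char q = 0, coef = 1;  /* coef = 2^j, starting at 2^0 */
            int j;
            for (j = 0; j < nsrc; j++) {
                    q ^= gf8_mul(coef, src[j][i]);
                    coef = (unsigned char)(coef << 1) ^ ((coef & 0x80) ? 0x1d : 0);
            }
            return q;
    }
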
diff --git a/src/isa-l/raid/pq_gen_avx.asm b/src/isa-l/raid/pq_gen_avx.asm
new file mode 100644
index 00000000..43c31a52
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_avx.asm
@@ -0,0 +1,254 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX
+;;; int pq_gen_avx(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 8*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm14, 6*16
+ save_xmm128 xmm15, 7*16
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm14, [rsp + 6*16]
+ movdqa xmm15, [rsp + 7*16]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos rax
+
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+%define xp3 xmm8
+%define xq3 xmm9
+%define xtmp3 xmm10
+%define xs3 xmm11
+
+%define xzero xmm14
+%define xpoly xmm15
+
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_gen_avx:function
+func(pq_gen_avx)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+ vmovdqa xpoly, [poly]
+ vpxor xzero, xzero, xzero
+ cmp len, 48
+ jl loop16
+
+len_aligned_32bytes:
+ sub len, 48 ;Len points to last block
+
+loop48:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+ XLDR xs3, [ptr+pos+32] ;Preload last vector (source)
+ vpxor xp1, xp1, xp1 ;p1 = 0
+ vpxor xp2, xp2, xp2 ;p2 = 0
+ vpxor xp3, xp3, xp3 ;p3 = 0
+ vpxor xq1, xq1, xq1 ;q1 = 0
+ vpxor xq2, xq2, xq2 ;q2 = 0
+ vpxor xq3, xq3, xq3 ;q3 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxor xq1, xq1, xs1 ; q1 ^= s1
+ vpxor xq2, xq2, xs2 ; q2 ^= s2
+ vpxor xq3, xq3, xs3 ; q3 ^= s3
+ vpxor xp1, xp1, xs1 ; p1 ^= s1
+ vpxor xp2, xp2, xs2 ; p2 ^= s2
+	vpxor	xp3, xp3, xs3	; p3 ^= s3
+ vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+ vpblendvb xtmp2, xzero, xpoly, xq2 ; xtmp2 = poly or 0x00
+ vpblendvb xtmp3, xzero, xpoly, xq3 ; xtmp3 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ XLDR xs3, [ptr+pos+32] ; Get next vector (source data3)
+ vpaddb xq1, xq1, xq1 ; q1 = q1<<1
+ vpaddb xq2, xq2, xq2 ; q2 = q2<<1
+ vpaddb xq3, xq3, xq3 ; q3 = q3<<1
+ vpxor xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ vpxor xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ vpxor xq3, xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxor xp1, xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ vpxor xq1, xq1, xs1 ;q1 ^= 1 * s1[0]
+ vpxor xp2, xp2, xs2 ;p2 ^= s2[0]
+ vpxor xq2, xq2, xs2 ;q2 ^= 1 * s2[0]
+ vpxor xp3, xp3, xs3 ;p3 ^= s3[0]
+ vpxor xq3, xq3, xs3 ;q3 ^= 1 * s3[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+16], xp2 ;Write parity P2 vector
+ XSTR [ptr+pos+32], xp3 ;Write parity P3 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector
+ XSTR [tmp+pos+32], xq3 ;Write parity Q3 vector
+ add pos, 48
+ cmp pos, len
+ jle loop48
+
+ ;; ------------------------------
+ ;; Do last 16 or 32 Bytes remaining
+ add len, 48
+ cmp pos, len
+ je return_pass
+
+loop16:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ vpxor xp1, xp1, xp1 ;p = 0
+ vpxor xq1, xq1, xq1 ;q = 0
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxor xq1, xq1, xs1 ; q1 ^= s1
+ vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+ vpxor xp1, xp1, xs1 ; p ^= s
+ vpaddb xq1, xq1, xq1 ; q = q<<1
+ vpxor xq1, xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxor xp1, xp1, xs1 ;p ^= s[0] - last source is already loaded
+ vpxor xq1, xq1, xs1 ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1 ;Write parity P vector
+ XSTR [tmp+pos], xq1 ;Write parity Q vector
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_gen_avx, 02, 0a, 0039
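
The AVX kernel collapses the SSE compare-and-mask pair into a single vpblendvb, which selects the poly byte wherever the top bit of q is set. One 16-byte Q update in C intrinsics (a sketch of the idiom, not the shipped code path):

    #include <immintrin.h>

    /* q_new = 2 * (q ^ s) in GF(2^8), 16 bytes per call. */
    static __m128i q_step(__m128i q, __m128i s)
    {
            const __m128i poly = _mm_set1_epi8(0x1d);
            const __m128i zero = _mm_setzero_si128();
            q = _mm_xor_si128(q, s);                      /* vpxor: q ^= s */
            __m128i red = _mm_blendv_epi8(zero, poly, q); /* vpblendvb: poly where bit7 set */
            q = _mm_add_epi8(q, q);                       /* vpaddb: per-byte q << 1 */
            return _mm_xor_si128(q, red);                 /* reduce with 0x1d */
    }

The last source skips the doubling (it carries coefficient 1), which is why the kernel finishes with a plain vpxor after the loop.
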
diff --git a/src/isa-l/raid/pq_gen_avx2.asm b/src/isa-l/raid/pq_gen_avx2.asm
new file mode 100644
index 00000000..96797a62
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_avx2.asm
@@ -0,0 +1,256 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX2
+;;; int pq_gen_avx2(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 32 bytes. Length must be 32 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 8*32 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+	;; Until a save_ymm256 macro is defined
+ vmovdqu [rsp + 0*32], ymm6
+ vmovdqu [rsp + 1*32], ymm7
+ vmovdqu [rsp + 2*32], ymm8
+ vmovdqu [rsp + 3*32], ymm9
+ vmovdqu [rsp + 4*32], ymm10
+ vmovdqu [rsp + 5*32], ymm11
+ vmovdqu [rsp + 6*32], ymm14
+ vmovdqu [rsp + 7*32], ymm15
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqu ymm6, [rsp + 0*32]
+ vmovdqu ymm7, [rsp + 1*32]
+ vmovdqu ymm8, [rsp + 2*32]
+ vmovdqu ymm9, [rsp + 3*32]
+ vmovdqu ymm10, [rsp + 4*32]
+ vmovdqu ymm11, [rsp + 5*32]
+ vmovdqu ymm14, [rsp + 6*32]
+ vmovdqu ymm15, [rsp + 7*32]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos rax
+
+%define xp1 ymm0
+%define xq1 ymm1
+%define xtmp1 ymm2
+%define xs1 ymm3
+
+%define xp2 ymm4
+%define xq2 ymm5
+%define xtmp2 ymm6
+%define xs2 ymm7
+
+%define xp3 ymm8
+%define xq3 ymm9
+%define xtmp3 ymm10
+%define xs3 ymm11
+
+%define xzero ymm14
+%define xpoly ymm15
+
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_gen_avx2:function
+func(pq_gen_avx2)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (32-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+ vmovdqa xpoly, [poly]
+ vpxor xzero, xzero, xzero
+ cmp len, 96
+ jl loop32
+
+len_aligned_32bytes:
+ sub len, 3*32 ;Len points to last block
+
+loop96:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+32] ;Preload last vector (source)
+ XLDR xs3, [ptr+pos+64] ;Preload last vector (source)
+ vpxor xp1, xp1, xp1 ;p1 = 0
+ vpxor xp2, xp2, xp2 ;p2 = 0
+ vpxor xp3, xp3, xp3 ;p3 = 0
+ vpxor xq1, xq1, xq1 ;q1 = 0
+ vpxor xq2, xq2, xq2 ;q2 = 0
+ vpxor xq3, xq3, xq3 ;q3 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxor xq1, xq1, xs1 ; q1 ^= s1
+ vpxor xq2, xq2, xs2 ; q2 ^= s2
+ vpxor xq3, xq3, xs3 ; q3 ^= s3
+ vpxor xp1, xp1, xs1 ; p1 ^= s1
+ vpxor xp2, xp2, xs2 ; p2 ^= s2
+	vpxor	xp3, xp3, xs3	; p3 ^= s3
+ vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+ vpblendvb xtmp2, xzero, xpoly, xq2 ; xtmp2 = poly or 0x00
+ vpblendvb xtmp3, xzero, xpoly, xq3 ; xtmp3 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+32] ; Get next vector (source data2)
+ XLDR xs3, [ptr+pos+64] ; Get next vector (source data3)
+ vpaddb xq1, xq1, xq1 ; q1 = q1<<1
+ vpaddb xq2, xq2, xq2 ; q2 = q2<<1
+ vpaddb xq3, xq3, xq3 ; q3 = q3<<1
+ vpxor xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ vpxor xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ vpxor xq3, xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxor xp1, xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ vpxor xq1, xq1, xs1 ;q1 ^= 1 * s1[0]
+ vpxor xp2, xp2, xs2 ;p2 ^= s2[0]
+ vpxor xq2, xq2, xs2 ;q2 ^= 1 * s2[0]
+ vpxor xp3, xp3, xs3 ;p3 ^= s3[0]
+ vpxor xq3, xq3, xs3 ;q3 ^= 1 * s3[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+32], xp2 ;Write parity P2 vector
+ XSTR [ptr+pos+64], xp3 ;Write parity P3 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+32], xq2 ;Write parity Q2 vector
+ XSTR [tmp+pos+64], xq3 ;Write parity Q3 vector
+ add pos, 3*32
+ cmp pos, len
+ jle loop96
+
+ ;; ------------------------------
+	;; Do last 32 or 64 Bytes remaining
+ add len, 3*32
+ cmp pos, len
+ je return_pass
+
+loop32:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ vpxor xp1, xp1, xp1 ;p = 0
+ vpxor xq1, xq1, xq1 ;q = 0
+
+next_vect32:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxor xq1, xq1, xs1 ; q1 ^= s1
+ vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+ vpxor xp1, xp1, xs1 ; p ^= s
+ vpaddb xq1, xq1, xq1 ; q = q<<1
+ vpxor xq1, xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect32 ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxor xp1, xp1, xs1 ;p ^= s[0] - last source is already loaded
+ vpxor xq1, xq1, xs1 ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1 ;Write parity P vector
+ XSTR [tmp+pos], xq1 ;Write parity Q vector
+ add pos, 32
+ cmp pos, len
+ jl loop32
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 32
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_gen_avx2, 04, 03, 0041
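
Relative to the AVX version this kernel only widens each lane from 16 to 32 bytes, so the alignment and length requirements double. A caller-side guard for those constraints might look like this (hypothetical helper, illustrative only):

    #include <stdint.h>

    /* Reject arguments the 32-byte-lane kernel cannot handle. */
    static int pq_args_ok_32(int len, void **vects, int count)
    {
            int i;
            if (len < 0 || (len & 31))              /* length: multiple of 32 */
                    return 0;
            for (i = 0; i < count; i++)
                    if ((uintptr_t)vects[i] & 31)   /* buffers: 32-byte aligned */
                            return 0;
            return 1;
    }
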
diff --git a/src/isa-l/raid/pq_gen_avx512.asm b/src/isa-l/raid/pq_gen_avx512.asm
new file mode 100644
index 00000000..ac7b29f9
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_avx512.asm
@@ -0,0 +1,235 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX512
+;;; int pq_gen_avx512(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 64 bytes if NO_NT_LDST is not defined.
+;;; Length must be 32 byte multiple.
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 4*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm7
+ vmovdqu [rsp + 2*16], xmm8
+ vmovdqu [rsp + 3*16], xmm9
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqu xmm6, [rsp + 0*16]
+ vmovdqu xmm7, [rsp + 1*16]
+ vmovdqu xmm8, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 3*16]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos rax
+
+%define xp1 zmm0
+%define xq1 zmm1
+%define xtmp1 zmm2
+%define xs1 zmm3
+
+%define xp2 zmm4
+%define xq2 zmm5
+%define xtmp2 zmm6
+%define xs2 zmm7
+
+%define xzero zmm8
+%define xpoly zmm9
+
+%define xp1y ymm0
+%define xq1y ymm1
+%define xtmp1y ymm2
+%define xs1y ymm3
+%define xzeroy ymm8
+%define xpolyy ymm9
+
+%define NO_NT_LDST
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqu8 ;u8
+ %define XSTR vmovdqu8
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_gen_avx512:function
+func(pq_gen_avx512)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (32-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+ mov tmp, 0x1d
+ vpbroadcastb xpoly, tmp
+ vpxorq xzero, xzero, xzero
+ cmp len, 128
+ jl loop32
+
+len_aligned_32bytes:
+ sub len, 2*64 ;Len points to last block
+
+loop128:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+64] ;Preload last vector (source)
+ vpxorq xp1, xp1, xp1 ;p1 = 0
+ vpxorq xp2, xp2, xp2 ;p2 = 0
+ vpxorq xq1, xq1, xq1 ;q1 = 0
+ vpxorq xq2, xq2, xq2 ;q2 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxorq xq1, xq1, xs1 ; q1 ^= s1
+ vpxorq xq2, xq2, xs2 ; q2 ^= s2
+ vpxorq xp1, xp1, xs1 ; p1 ^= s1
+ vpxorq xp2, xp2, xs2 ; p2 ^= s2
+	vpcmpb	k1, xq1, xzero, 1	; k1 = mask of q1 bytes with bit7 set (signed < 0)
+	vpcmpb	k2, xq2, xzero, 1	; k2 = mask of q2 bytes with bit7 set
+	vpblendmb xtmp1 {k1}, xzero, xpoly ; xtmp1 = poly or 0x00
+	vpblendmb xtmp2 {k2}, xzero, xpoly ; xtmp2 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+64] ; Get next vector (source data2)
+ vpaddb xq1, xq1, xq1 ; q1 = q1<<1
+ vpaddb xq2, xq2, xq2 ; q2 = q2<<1
+ vpxorq xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ vpxorq xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxorq xp1, xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ vpxorq xq1, xq1, xs1 ;q1 ^= 1 * s1[0]
+ vpxorq xp2, xp2, xs2 ;p2 ^= s2[0]
+ vpxorq xq2, xq2, xs2 ;q2 ^= 1 * s2[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+64], xp2 ;Write parity P2 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+64], xq2 ;Write parity Q2 vector
+ add pos, 2*64
+ cmp pos, len
+ jle loop128
+
+ ;; ------------------------------
+	;; Do last 32, 64 or 96 Bytes remaining
+ add len, 2*64
+ cmp pos, len
+ je return_pass
+
+loop32:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1y, [ptr+pos] ;Preload last vector (source)
+ vpxorq xp1y, xp1y, xp1y ;p = 0
+ vpxorq xq1y, xq1y, xq1y ;q = 0
+
+next_vect32:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxorq xq1y, xq1y, xs1y ; q1 ^= s1
+ vpblendvb xtmp1y, xzeroy, xpolyy, xq1y ; xtmp1 = poly or 0x00
+ vpxorq xp1y, xp1y, xs1y ; p ^= s
+ vpaddb xq1y, xq1y, xq1y ; q = q<<1
+ vpxorq xq1y, xq1y, xtmp1y ; q = q<<1 ^ poly_masked
+ XLDR xs1y, [ptr+pos] ; Get next vector (source data)
+ jg next_vect32 ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxorq xp1y, xp1y, xs1y ;p ^= s[0] - last source is already loaded
+ vpxorq xq1y, xq1y, xs1y ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1y ;Write parity P vector
+ XSTR [tmp+pos], xq1y ;Write parity Q vector
+ add pos, 32
+ cmp pos, len
+ jl loop32
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+%endif ; ifdef HAVE_AS_KNOWS_AVX512
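
The AVX-512 path replaces the vector blend with mask registers: vpcmpb writes a 64-bit predicate and vpblendmb expands it back into poly/zero bytes. The same 64-byte Q step in intrinsics (a sketch; requires AVX512BW):

    #include <immintrin.h>

    static __m512i q_step512(__m512i q, __m512i s)
    {
            const __m512i poly = _mm512_set1_epi8(0x1d);
            const __m512i zero = _mm512_setzero_si512();
            q = _mm512_xor_si512(q, s);                          /* vpxorq */
            __mmask64 k = _mm512_cmplt_epi8_mask(q, zero);       /* vpcmpb ..., 1 */
            __m512i red = _mm512_mask_blend_epi8(k, zero, poly); /* vpblendmb */
            q = _mm512_add_epi8(q, q);                           /* vpaddb */
            return _mm512_xor_si512(q, red);
    }
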
diff --git a/src/isa-l/raid/pq_gen_perf.c b/src/isa-l/raid/pq_gen_perf.c
new file mode 100644
index 00000000..194f2604
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_perf.c
@@ -0,0 +1,97 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include<sys/time.h>
+#include "raid.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_SOURCES 10
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 800000
+# define TEST_TYPE_STR "_warm"
+#else
+# ifndef TEST_CUSTOM
+// Uncached test. Pull from large mem base.
+# define TEST_SOURCES 10
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
+# define TEST_LOOPS 1000
+# define TEST_TYPE_STR "_cold"
+# else
+# define TEST_TYPE_STR "_cus"
+# ifndef TEST_LOOPS
+# define TEST_LOOPS 1000
+# endif
+# endif
+#endif
+
+#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN))
+
+int main(int argc, char *argv[])
+{
+ int i;
+ void *buffs[TEST_SOURCES + 2];
+ struct perf start, stop;
+
+ printf("Test pq_gen_perf %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN);
+
+ // Allocate the arrays
+ for (i = 0; i < TEST_SOURCES + 2; i++) {
+ int ret;
+ void *buf;
+ ret = posix_memalign(&buf, 64, TEST_LEN);
+ if (ret) {
+			printf("alloc error: Fail\n");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Setup data
+ for (i = 0; i < TEST_SOURCES + 2; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ // Warm up
+ pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs);
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs);
+ perf_stop(&stop);
+ printf("pq_gen" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ return 0;
+}
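
perf_start/perf_stop/perf_print come from include/test.h, which this diff lists under other_src but does not show. The reported figure is total bytes processed over elapsed time; assuming a gettimeofday-based timer, the arithmetic is roughly:

    #include <sys/time.h>

    /* Approximate shape of the perf_print() math (sketch, not the real helper). */
    static double mb_per_sec(struct timeval t0, struct timeval t1,
                             long long total_bytes)
    {
            double secs = (t1.tv_sec - t0.tv_sec) +
                          (t1.tv_usec - t0.tv_usec) / 1e6;
            return total_bytes / secs / 1e6;    /* MB/s */
    }
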
diff --git a/src/isa-l/raid/pq_gen_sse.asm b/src/isa-l/raid/pq_gen_sse.asm
new file mode 100644
index 00000000..1426f3f5
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_sse.asm
@@ -0,0 +1,258 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using SSE3
+;;; int pq_gen_sse(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned.
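+;;;
+;;; Per-byte reference of what the vector loop computes (a sketch, in
+;;; GF(2^8) with polynomial 0x11d, sources numbered 0..N-1):
+;;;   P = s[0] ^ s[1] ^ ... ^ s[N-1]
+;;;   Q = s[0] ^ 2*s[1] ^ 4*s[2] ^ ... ^ 2^(N-1)*s[N-1]
+;;; evaluated Horner-style from the last source down, doubling Q between
+;;; sources, which matches the standard RAID-6 Q parity.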
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 7*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm15, 6*16
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm15, [rsp + 6*16]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos rax
+
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+%define xp3 xmm8
+%define xq3 xmm9
+%define xtmp3 xmm10
+%define xs3 xmm11
+
+%define xpoly xmm15
+
+;;; Use non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_gen_sse:function
+func(pq_gen_sse)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+ movdqa xpoly, [poly]
+ cmp len, 48
+ jl loop16
+
+len_aligned_48bytes:
+ sub len, 48 ;Len points to last block
+
+loop48:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+ XLDR xs3, [ptr+pos+32] ;Preload last vector (source)
+ pxor xp1, xp1 ;p1 = 0
+ pxor xp2, xp2 ;p2 = 0
+ pxor xp3, xp3 ;p3 = 0
+ pxor xq1, xq1 ;q1 = 0
+ pxor xq2, xq2 ;q2 = 0
+ pxor xq3, xq3 ;q3 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xq2, xs2 ; q2 ^= s2
+ pxor xq3, xs3 ; q3 ^= s3
+ pxor xp1, xs1 ; p1 ^= s1
+ pxor xp2, xs2 ; p2 ^= s2
+ pxor xp3, xs3 ; p3 ^= s3
+ pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0
+ pxor xtmp2, xtmp2 ; xtmp2 = 0
+ pxor xtmp3, xtmp3 ; xtmp3 = 0
+ pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp3, xq3 ; xtmp3 = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp1 = poly or 0x00
+ pand xtmp2, xpoly ; xtmp2 = poly or 0x00
+ pand xtmp3, xpoly ; xtmp3 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ XLDR xs3, [ptr+pos+32] ; Get next vector (source data3)
+ paddb xq1, xq1 ; q1 = q1<<1
+ paddb xq2, xq2 ; q2 = q2<<1
+ paddb xq3, xq3 ; q3 = q3<<1
+ pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ pxor xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
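+ ;; GF(2^8) doubling recap for the loop above: pcmpgtb against zero builds
+ ;; a 0xff mask for bytes with bit7 set, pand keeps poly 0x1d only for
+ ;; those bytes, and paddb q,q shifts each byte left by one. Worked byte:
+ ;; q=0x80 -> mask 0xff, (0x80+0x80)&0xff = 0x00, 0x00^0x1d = 0x1d, which
+ ;; is 2*0x80 mod 0x11d.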
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ pxor xq1, xs1 ;q1 ^= 1 * s1[0]
+ pxor xp2, xs2 ;p2 ^= s2[0]
+ pxor xq2, xs2 ;q2 ^= 1 * s2[0]
+ pxor xp3, xs3 ;p3 ^= s3[0]
+ pxor xq3, xs3 ;q3 ^= 1 * s3[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+16], xp2 ;Write parity P2 vector
+ XSTR [ptr+pos+32], xp3 ;Write parity P3 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector
+ XSTR [tmp+pos+32], xq3 ;Write parity Q3 vector
+ add pos, 48
+ cmp pos, len
+ jle loop48
+
+ ;; ------------------------------
+ ;; Do the last 16 or 32 bytes remaining
+ add len, 48
+ cmp pos, len
+ je return_pass
+
+loop16:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ pxor xp1, xp1 ;p = 0
+ pxor xq1, xq1 ;q = 0
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xtmp1, xtmp1 ; xtmp = 0
+ pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp = poly or 0x00
+ pxor xp1, xs1 ; p ^= s
+ paddb xq1, xq1 ; q = q<<1
+ pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ pxor xp1, xs1 ;p ^= s[0] - last source is already loaded
+ pxor xq1, xs1 ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1 ;Write parity P vector
+ XSTR [tmp+pos], xq1 ;Write parity Q vector
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_gen_sse, 00, 09, 0032
diff --git a/src/isa-l/raid/pq_gen_sse_i32.asm b/src/isa-l/raid/pq_gen_sse_i32.asm
new file mode 100644
index 00000000..16093d52
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_sse_i32.asm
@@ -0,0 +1,264 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using SSE3
+;;; int pq_gen_sse(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define return rax
+ %define PS 8
+ %define tmp r10
+ %define stack_size 2*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0 edx
+ %define arg1 ecx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2 edi ; must save/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg0, arg(0)
+ mov arg1, arg(1)
+ mov arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp ;restore esp (ebp holds the frame pointer)
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos return
+
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+ %define xpoly xmm15
+%elifidn PS,4 ; 32-bit code
+ %define xpoly [poly]
+%endif
+
+;;; Use non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+align 16
+global pq_gen_sse:function
+func(pq_gen_sse)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+%ifidn PS,8
+ movdqa xpoly, [poly] ;For 64-bit, load poly into high xmm reg
+%endif
+ cmp len, 32
+ jl loop16
+
+len_aligned_32bytes:
+ sub len, 32 ;Len points to last 32B block
+
+loop32:
+ mov ptr, [arg2+vec*PS] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+ pxor xp1, xp1 ;p1 = 0
+ pxor xq1, xq1 ;q1 = 0
+ pxor xp2, xp2 ;p2 = 0
+ pxor xq2, xq2 ;q2 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*PS] ; get pointer to next vect
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xq2, xs2 ; q2 ^= s2
+ pxor xp1, xs1 ; p1 ^= s1
+ pxor xp2, xs2 ; p2 ^= s2
+ pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0
+ pxor xtmp2, xtmp2 ; xtmp2 = 0
+ pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp1 = poly or 0x00
+ pand xtmp2, xpoly ; xtmp2 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ paddb xq1, xq1 ; q1 = q1<<1
+ paddb xq2, xq2 ; q2 = q2<<1
+ pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0
+
+ mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ pxor xq1, xs1 ;q1 ^= 1 * s1[0]
+ pxor xp2, xs2 ;p2 ^= s2[0]
+ pxor xq2, xs2 ;q2 ^= 1 * s2[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+16], xp2 ;Write parity P2 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector
+ add pos, 32
+ cmp pos, len
+ jle loop32
+
+ ;; ------------------------------
+ ;; Do the last 16 bytes remaining
+ add len, 32
+ cmp pos, len
+ je return_pass
+
+loop16:
+ mov ptr, [arg2+vec*PS] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ pxor xp1, xp1 ;p = 0
+ pxor xq1, xq1 ;q = 0
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*PS] ; get pointer to next vect
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xtmp1, xtmp1 ; xtmp = 0
+ pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp = poly or 0x00
+ pxor xp1, xs1 ; p ^= s
+ paddb xq1, xq1 ; q = q<<1
+ pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0
+
+ mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ pxor xp1, xs1 ;p ^= s[0] - last source is already loaded
+ pxor xq1, xs1 ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1 ;Write parity P vector
+ XSTR [tmp+pos], xq1 ;Write parity Q vector
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_gen_sse, 00, 08, 0032
diff --git a/src/isa-l/raid/pq_gen_test.c b/src/isa-l/raid/pq_gen_test.c
new file mode 100644
index 00000000..d0844964
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_test.c
@@ -0,0 +1,194 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include<limits.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN 1024
+#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int dump(unsigned char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", buf[i++]);
+ if (i % 16 == 0)
+ printf("\n");
+ }
+ printf("\n");
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int i, j, k, ret, fail = 0;
+ void *buffs[TEST_SOURCES + 2]; // Pointers to src and dest
+ char *tmp_buf[TEST_SOURCES + 2];
+
+ printf("Test pq_gen_test ");
+
+ srand(TEST_SEED);
+
+ // Allocate the arrays
+ for (i = 0; i < TEST_SOURCES + 2; i++) {
+ void *buf;
+ ret = posix_memalign(&buf, 32, TEST_LEN);
+ if (ret) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Test of all zeros
+ for (i = 0; i < TEST_SOURCES + 2; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs);
+
+ for (i = 0; i < TEST_LEN; i++) {
+ if (((char *)buffs[TEST_SOURCES])[i] != 0)
+ fail++;
+ }
+
+ for (i = 0; i < TEST_LEN; i++) {
+ if (((char *)buffs[TEST_SOURCES + 1])[i] != 0)
+ fail++;
+ }
+
+ if (fail > 0) {
+ printf("fail zero test %d\n", fail);
+ return 1;
+ } else
+ putchar('.');
+
+ // Test rand1
+ for (i = 0; i < TEST_SOURCES + 2; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ ret = pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs);
+ fail |= pq_check_base(TEST_SOURCES + 2, TEST_LEN, buffs);
+
+ if (fail > 0) {
+ int t;
+ printf(" Fail rand test1 fail=%d, ret=%d\n", fail, ret);
+ for (t = 0; t < TEST_SOURCES + 2; t++)
+ dump(buffs[t], 15);
+
+ printf(" reference function p,q\n");
+ pq_gen_base(TEST_SOURCES + 2, TEST_LEN, buffs);
+ for (t = TEST_SOURCES; t < TEST_SOURCES + 2; t++)
+ dump(buffs[t], 15);
+
+ return 1;
+ } else
+ putchar('.');
+
+ // Test various number of sources
+ for (j = 4; j <= TEST_SOURCES + 2; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ pq_gen(j, TEST_LEN, buffs);
+ fail |= pq_check_base(j, TEST_LEN, buffs);
+
+ if (fail > 0) {
+ printf("fail rand test %d sources\n", j);
+ return 1;
+ } else
+ putchar('.');
+ }
+
+ fflush(0);
+
+ // Test various number of sources and len
+ k = 0;
+ while (k <= TEST_LEN) {
+ for (j = 4; j <= TEST_SOURCES + 2; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], k);
+
+ ret = pq_gen(j, k, buffs);
+ fail |= pq_check_base(j, k, buffs);
+
+ if (fail > 0) {
+ printf("fail rand test %d sources, len=%d, fail="
+ "%d, ret=%d\n", j, k, fail, ret);
+ return 1;
+ }
+ }
+ putchar('.');
+ k += 32;
+ }
+
+ // Test at the end of buffer
+ k = 0;
+ while (k <= TEST_LEN) {
+ for (j = 0; j < (TEST_SOURCES + 2); j++) {
+ rand_buffer(buffs[j], TEST_LEN - k);
+ tmp_buf[j] = (char *)buffs[j] + k;
+ }
+
+ ret = pq_gen(TEST_SOURCES + 2, TEST_LEN - k, (void *)tmp_buf);
+ fail |= pq_check_base(TEST_SOURCES + 2, TEST_LEN - k, (void *)tmp_buf);
+
+ if (fail > 0) {
+ printf("fail end test - offset: %d, len: %d, fail: %d, "
+ "ret: %d\n", k, TEST_LEN - k, fail, ret);
+ return 1;
+ }
+
+ putchar('.');
+ fflush(0);
+ k += 32;
+ }
+
+ if (!fail)
+ printf(" done: Pass\n");
+
+ return fail;
+}
diff --git a/src/isa-l/raid/raid_base.c b/src/isa-l/raid/raid_base.c
new file mode 100644
index 00000000..25c19331
--- /dev/null
+++ b/src/isa-l/raid/raid_base.c
@@ -0,0 +1,147 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <limits.h>
+#include <stdint.h>
+
+#if __WORDSIZE == 64 || _WIN64 || __x86_64__
+# define notbit0 0xfefefefefefefefeULL
+# define bit7 0x8080808080808080ULL
+# define gf8poly 0x1d1d1d1d1d1d1d1dULL
+#else
+# define notbit0 0xfefefefeUL
+# define bit7 0x80808080UL
+# define gf8poly 0x1d1d1d1dUL
+#endif
+
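+/*
+ * Word-parallel GF(2^8) doubling used in pq_gen_base (a sketch of the SWAR
+ * trick): (q << 1) & notbit0 shifts every byte left by one without leaking
+ * into its neighbor, and ((q & bit7) << 1) - ((q & bit7) >> 7) widens each
+ * byte's top bit into a 0xff mask (e.g. 0x80 -> 0x100 - 0x01 = 0xff), so
+ * gf8poly (0x1d per byte) is applied only to the bytes that overflowed.
+ */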
+int pq_gen_base(int vects, int len, void **array)
+{
+ int i, j;
+ unsigned long p, q, s;
+ unsigned long **src = (unsigned long **)array;
+ int blocks = len / sizeof(long);
+
+ for (i = 0; i < blocks; i++) {
+ q = p = src[vects - 3][i];
+
+ for (j = vects - 4; j >= 0; j--) {
+ p ^= s = src[j][i];
+ q = s ^ (((q << 1) & notbit0) ^ // shift each byte
+ ((((q & bit7) << 1) - ((q & bit7) >> 7)) // mask out bytes
+ & gf8poly)); // apply poly
+ }
+
+ src[vects - 2][i] = p; // second to last pointer is p
+ src[vects - 1][i] = q; // last pointer is q
+ }
+ return 0;
+}
+
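+/* Byte-wise P/Q verify: returns 0 when both parities match; at the first
+   mismatching byte i it returns i|1 (P wrong) or i|2 (Q wrong), so any
+   non-zero result signals failure. */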
+int pq_check_base(int vects, int len, void **array)
+{
+ int i, j;
+ unsigned char p, q, s;
+ unsigned char **src = (unsigned char **)array;
+
+ for (i = 0; i < len; i++) {
+ q = p = src[vects - 3][i];
+
+ for (j = vects - 4; j >= 0; j--) {
+ s = src[j][i];
+ p ^= s;
+
+ // mult by GF{2}
+ q = s ^ ((q << 1) ^ ((q & 0x80) ? 0x1d : 0));
+ }
+
+ if (src[vects - 2][i] != p) // second to last pointer is p
+ return i | 1;
+ if (src[vects - 1][i] != q) // last pointer is q
+ return i | 2;
+ }
+ return 0;
+}
+
+int xor_gen_base(int vects, int len, void **array)
+{
+ int i, j;
+ unsigned char parity;
+ unsigned char **src = (unsigned char **)array;
+
+ for (i = 0; i < len; i++) {
+ parity = src[0][i];
+ for (j = 1; j < vects - 1; j++)
+ parity ^= src[j][i];
+
+ src[vects - 1][i] = parity; // last pointer is dest
+
+ }
+
+ return 0;
+}
+
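+/* Byte-wise xor verify: returns 0 when every column xors to zero, and a
+   non-zero value (the buffer length) on the first mismatch. */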
+int xor_check_base(int vects, int len, void **array)
+{
+ int i, j, fail = 0;
+
+ unsigned char parity;
+ unsigned char **src = (unsigned char **)array;
+
+ for (i = 0; i < len; i++) {
+ parity = 0;
+ for (j = 0; j < vects; j++)
+ parity ^= src[j][i];
+
+ if (parity != 0) {
+ fail = 1;
+ break;
+ }
+ }
+ if (fail && len > 0)
+ return len;
+ return fail;
+}
+
+struct slver {
+ unsigned short snum;
+ unsigned char ver;
+ unsigned char core;
+};
+
+struct slver pq_gen_base_slver_0001012a;
+struct slver pq_gen_base_slver = { 0x012a, 0x01, 0x00 };
+
+struct slver xor_gen_base_slver_0001012b;
+struct slver xor_gen_base_slver = { 0x012b, 0x01, 0x00 };
+
+struct slver pq_check_base_slver_0001012c;
+struct slver pq_check_base_slver = { 0x012c, 0x01, 0x00 };
+
+struct slver xor_check_base_slver_0001012d;
+struct slver xor_check_base_slver = { 0x012d, 0x01, 0x00 };
diff --git a/src/isa-l/raid/raid_base_aliases.c b/src/isa-l/raid/raid_base_aliases.c
new file mode 100644
index 00000000..f81792a0
--- /dev/null
+++ b/src/isa-l/raid/raid_base_aliases.c
@@ -0,0 +1,50 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "raid.h"
+
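+/* Fallback aliases for builds that do not link the assembly multibinary
+   dispatchers (e.g. non-x86 targets): each public entry point resolves
+   directly to its base C implementation. */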
+int pq_gen(int vects, int len, void **array)
+{
+ return pq_gen_base(vects, len, array);
+}
+
+int pq_check(int vects, int len, void **array)
+{
+ return pq_check_base(vects, len, array);
+}
+
+int xor_gen(int vects, int len, void **array)
+{
+ return xor_gen_base(vects, len, array);
+}
+
+int xor_check(int vects, int len, void **array)
+{
+ return xor_check_base(vects, len, array);
+}
diff --git a/src/isa-l/raid/raid_multibinary.asm b/src/isa-l/raid/raid_multibinary.asm
new file mode 100644
index 00000000..72ef5d40
--- /dev/null
+++ b/src/isa-l/raid/raid_multibinary.asm
@@ -0,0 +1,149 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+default rel
+[bits 64]
+
+extern pq_gen_base
+extern pq_gen_sse
+extern pq_gen_avx
+extern pq_gen_avx2
+
+extern xor_gen_base
+extern xor_gen_sse
+extern xor_gen_avx
+
+extern pq_check_base
+extern pq_check_sse
+
+extern xor_check_base
+extern xor_check_sse
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern xor_gen_avx512
+ extern pq_gen_avx512
+%endif
+
+mbin_interface xor_gen
+mbin_interface pq_gen
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 xor_gen, xor_gen_base, xor_gen_sse, xor_gen_avx, xor_gen_avx, xor_gen_avx512
+ mbin_dispatch_init6 pq_gen, pq_gen_base, pq_gen_sse, pq_gen_avx, pq_gen_avx2, pq_gen_avx512
+%else
+ mbin_dispatch_init5 xor_gen, xor_gen_base, xor_gen_sse, xor_gen_avx, xor_gen_avx
+ mbin_dispatch_init5 pq_gen, pq_gen_base, pq_gen_sse, pq_gen_avx, pq_gen_avx2
+%endif
+
+section .data
+
+xor_check_dispatched:
+ dq xor_check_mbinit
+pq_check_dispatched:
+ dq pq_check_mbinit
+
+section .text
+
+;;;;
+; pq_check multibinary function
+;;;;
+global pq_check:function
+pq_check_mbinit:
+ call pq_check_dispatch_init
+pq_check:
+ jmp qword [pq_check_dispatched]
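+;; Lazy binding: pq_check_dispatched starts out pointing at pq_check_mbinit,
+;; so the first call runs pq_check_dispatch_init (which stores the best
+;; implementation for this CPU) and then falls through into pq_check to take
+;; the updated indirect jump; every later call is just the jmp above.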
+
+pq_check_dispatch_init:
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ lea rsi, [pq_check_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ lea rbx, [pq_check_sse WRT_OPT]
+ cmovne rsi, rbx
+
+ mov [pq_check_dispatched], rsi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+ ret
+
+
+;;;;
+; xor_check multibinary function
+;;;;
+global xor_check:function
+xor_check_mbinit:
+ call xor_check_dispatch_init
+xor_check:
+ jmp qword [xor_check_dispatched]
+
+xor_check_dispatch_init:
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ lea rsi, [xor_check_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ lea rbx, [xor_check_sse WRT_OPT]
+ cmovne rsi, rbx
+
+ mov [xor_check_dispatched], rsi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+ ret
+
+;;; func core, ver, snum
+slversion xor_gen, 00, 03, 0126
+slversion xor_check, 00, 03, 0127
+slversion pq_gen, 00, 03, 0128
+slversion pq_check, 00, 03, 0129
diff --git a/src/isa-l/raid/raid_multibinary_i32.asm b/src/isa-l/raid/raid_multibinary_i32.asm
new file mode 100644
index 00000000..6da4c9dc
--- /dev/null
+++ b/src/isa-l/raid/raid_multibinary_i32.asm
@@ -0,0 +1,58 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+[bits 32]
+
+extern xor_gen_base
+extern xor_gen_sse
+extern pq_gen_base
+extern pq_gen_sse
+extern xor_check_base
+extern xor_check_sse
+extern pq_check_base
+extern pq_check_sse
+
+mbin_interface xor_gen
+mbin_interface pq_gen
+mbin_interface xor_check
+mbin_interface pq_check
+
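+;; 32-bit builds carry no AVX/AVX2 variants, so the SSE routine fills every
+;; upper dispatch slot below and the base C version remains the fallback.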
+mbin_dispatch_init5 xor_gen, xor_gen_base, xor_gen_sse, xor_gen_sse, xor_gen_sse
+mbin_dispatch_init5 pq_gen, pq_gen_base, pq_gen_sse, pq_gen_sse, pq_gen_sse
+mbin_dispatch_init5 xor_check, xor_check_base, xor_check_sse, xor_check_sse, xor_check_sse
+mbin_dispatch_init5 pq_check, pq_check_base, pq_check_sse, pq_check_sse, pq_check_sse
diff --git a/src/isa-l/raid/xor_check_sse.asm b/src/isa-l/raid/xor_check_sse.asm
new file mode 100644
index 00000000..65ae2f77
--- /dev/null
+++ b/src/isa-l/raid/xor_check_sse.asm
@@ -0,0 +1,285 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor check of N source vectors using SSE
+;;; int xor_check_sse(int vects, int len, void **array)
+
+;;; Checks that the xor sum of all N (vects) vectors in the array of
+;;; pointers (**array) is zero, i.e. that the last (parity) vector is
+;;; consistent with the sources. Returns 0 on pass, non-zero on failure.
+;;; Vectors must be aligned to 16 bytes. Length can be any value.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 rax
+ %define tmp2.b al
+ %define tmp3 arg4
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define return rax
+ %define tmp2 rax
+ %define tmp2.b al
+ %define PS 8
+ %define tmp r11
+ %define tmp3 r10
+ %define stack_size 2*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0 arg(0)
+ %define arg1 ecx
+ %define tmp2 eax
+ %define tmp2.b al
+ %define tmp3 edx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2 edi ; must save/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ mov arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp ;restore esp (ebp holds the frame pointer)
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos tmp3
+
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+%endif
+
+;;; Use non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+align 16
+global xor_check_sse:function
+func(xor_check_sse)
+ FUNC_SAVE
+%ifidn PS,8 ;64-bit code
+ sub vec, 1 ; Keep as offset to last source
+%else ;32-bit code
+ mov tmp, arg(0) ; Update vec length arg to last source
+ sub tmp, 1
+ mov arg(0), tmp
+%endif
+
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;Check alignment of length
+ jnz len_not_aligned
+
+
+len_aligned_128bytes:
+ sub len, 128
+ mov pos, 0
+ mov tmp, vec ;Preset to last vector
+
+loop128:
+ mov tmp2, [arg2+tmp*PS] ;Fetch last pointer in array
+ sub tmp, 1 ;Next vect
+ XLDR xmm0, [tmp2+pos] ;Start with end of array in last vector
+ XLDR xmm1, [tmp2+pos+16] ;Keep xor parity in xmm0-7
+ XLDR xmm2, [tmp2+pos+(2*16)]
+ XLDR xmm3, [tmp2+pos+(3*16)]
+ XLDR xmm4, [tmp2+pos+(4*16)]
+ XLDR xmm5, [tmp2+pos+(5*16)]
+ XLDR xmm6, [tmp2+pos+(6*16)]
+ XLDR xmm7, [tmp2+pos+(7*16)]
+
+next_vect:
+ mov ptr, [arg2+tmp*PS]
+ sub tmp, 1
+ xorpd xmm0, [ptr+pos] ;Get next vector (source)
+ xorpd xmm1, [ptr+pos+16]
+ xorpd xmm2, [ptr+pos+(2*16)]
+ xorpd xmm3, [ptr+pos+(3*16)]
+ xorpd xmm4, [ptr+pos+(4*16)]
+ xorpd xmm5, [ptr+pos+(5*16)]
+ xorpd xmm6, [ptr+pos+(6*16)]
+ xorpd xmm7, [ptr+pos+(7*16)]
+;;; prefetch [ptr+pos+(8*16)]
+ jge next_vect ;Loop for each vect
+
+ ;; End of vects, check that all parity regs = 0
+ mov tmp, vec ;Back to last vector
+ por xmm0, xmm1
+ por xmm0, xmm2
+ por xmm0, xmm3
+ por xmm0, xmm4
+ por xmm0, xmm5
+ por xmm0, xmm6
+ por xmm0, xmm7
+ ptest xmm0, xmm0
+ jnz return_fail
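+ ;; The por chain above folds all partial parities into xmm0; ptest
+ ;; (SSE4.1) sets ZF only if the full 128-bit result is zero, so any stray
+ ;; bit fails the check.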
+
+ add pos, 128
+ cmp pos, len
+ jle loop128
+
+return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+
+
+;;; Do one byte at a time for no alignment case
+
+xor_check_byte:
+ mov tmp, vec ;Preset to last vector
+
+loop_1byte:
+ mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
+ mov tmp2.b, [ptr+len-1] ;Get array n
+ sub tmp, 1
+nextvect_1byte:
+ mov ptr, [arg2+tmp*PS]
+ xor tmp2.b, [ptr+len-1]
+ sub tmp, 1
+ jge nextvect_1byte
+
+ mov tmp, vec ;Back to last vector
+ cmp tmp2.b, 0
+ jne return_fail
+ sub len, 1
+ test len, (8-1)
+ jnz loop_1byte
+
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;If not 0 and 128-byte aligned
+ jz len_aligned_128bytes ; then do aligned case. len = y * 128
+
+ ;; else we are 8-byte aligned so fall through to recheck
+
+
+ ;; Unaligned length cases
+len_not_aligned:
+ test len, (PS-1)
+ jne xor_check_byte
+ mov tmp3, len
+ and tmp3, (128-1) ;Do the unaligned bytes 4-8 at a time
+ mov tmp, vec ;Preset to last vector
+
+ ;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes
+loopN_bytes:
+ mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
+ mov tmp2, [ptr+len-PS] ;Get array n
+ sub tmp, 1
+nextvect_Nbytes:
+ mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
+ xor tmp2, [ptr+len-PS]
+ sub tmp, 1
+ jge nextvect_Nbytes ;Loop for each source
+
+ mov tmp, vec ;Back to last vector
+ cmp tmp2, 0
+ jne return_fail
+ sub len, PS
+ sub tmp3, PS
+ jg loopN_bytes
+
+ cmp len, 128 ;Now len is aligned to 128B
+ jge len_aligned_128bytes ;We can do the rest aligned
+
+ cmp len, 0
+ je return_pass
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+;;; func core, ver, snum
+slversion xor_check_sse, 00, 03, 0031
+
diff --git a/src/isa-l/raid/xor_check_test.c b/src/isa-l/raid/xor_check_test.c
new file mode 100644
index 00000000..dfb571a6
--- /dev/null
+++ b/src/isa-l/raid/xor_check_test.c
@@ -0,0 +1,280 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN 1024
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(int argc, char *argv[])
+{
+ int i, j, k, ret, fail = 0;
+ void *buffs[TEST_SOURCES + 1];
+ char c;
+ int serr, lerr;
+ char *tmp_buf[TEST_SOURCES + 1];
+
+ printf("Test xor_check_test %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ // Allocate the arrays
+ for (i = 0; i < TEST_SOURCES + 1; i++) {
+ void *buf;
+ if (posix_memalign(&buf, 16, TEST_LEN)) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Test of all zeros
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ xor_gen_base(TEST_SOURCES + 1, TEST_LEN, buffs);
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret != 0) {
+ fail++;
+ printf("\nfail zero test %d\n", ret);
+ }
+
+ ((char *)(buffs[0]))[TEST_LEN - 2] = 0x7; // corrupt buffer
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nfail corrupt buffer test %d\n", ret);
+ }
+ ((char *)(buffs[0]))[TEST_LEN - 2] = 0; // un-corrupt buffer
+
+ // Test corrupted buffer any location on all sources
+ for (j = 0; j < TEST_SOURCES + 1; j++) {
+ for (i = TEST_LEN - 1; i >= 0; i--) {
+ ((char *)buffs[j])[i] = 0x5; // corrupt buffer
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i);
+ return 1;
+ }
+ ((char *)buffs[j])[i] = 0; // un-corrupt buffer
+ }
+ putchar('.');
+ }
+
+ // Test rand1
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ xor_gen_base(TEST_SOURCES + 1, TEST_LEN, buffs);
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret != 0) {
+ fail++;
+ printf("fail first rand test %d\n", ret);
+ }
+
+ c = ((char *)(buffs[0]))[TEST_LEN - 2];
+ ((char *)(buffs[0]))[TEST_LEN - 2] = c ^ 0x1;
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nFail corrupt buffer test, passed when should have failed\n");
+ }
+ ((char *)(buffs[0]))[TEST_LEN - 2] = c; // un-corrupt buffer
+
+ // Test corrupted buffer any location on all sources w/ random data
+ for (j = 0; j < TEST_SOURCES + 1; j++) {
+ for (i = TEST_LEN - 1; i >= 0; i--) {
+ // Check it still passes
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret != 0) { // should pass
+ fail++;
+ printf
+ ("\nFail rand test with un-corrupted buffer j=%d, i=%d\n",
+ j, i);
+ return 1;
+ }
+ c = ((char *)buffs[j])[i];
+ ((char *)buffs[j])[i] = c ^ 1; // corrupt buffer
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret == 0) { // Check it now fails
+ fail++;
+ printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i);
+ return 1;
+ }
+ ((char *)buffs[j])[i] = c; // un-corrupt buffer
+ }
+ putchar('.');
+ }
+
+ // Test various number of sources, full length
+ for (j = 3; j <= TEST_SOURCES + 1; j++) {
+ // New random data
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ // Generate xor parity for this number of sources
+ xor_gen_base(j, TEST_LEN, buffs);
+
+ // Set errors up in each source and len position
+ for (i = 0; i < j; i++) {
+ for (k = 0; k < TEST_LEN; k++) {
+ // See if it still passes
+ ret = xor_check(j, TEST_LEN, buffs);
+ if (ret != 0) { // Should pass
+ printf("\nfail rand test %d sources\n", j);
+ fail++;
+ return 1;
+ }
+
+ c = ((char *)buffs[i])[k];
+ ((char *)buffs[i])[k] = c ^ 1; // corrupt buffer
+
+ ret = xor_check(j, TEST_LEN, buffs);
+ if (ret == 0) { // Should fail
+ printf
+ ("\nfail rand test corrupted buffer %d sources\n",
+ j);
+ fail++;
+ return 1;
+ }
+ ((char *)buffs[i])[k] = c; // un-corrupt buffer
+ }
+ }
+ putchar('.');
+ }
+
+ fflush(0);
+
+ // Test various number of sources and len
+ k = 1;
+ while (k <= TEST_LEN) {
+ for (j = 3; j <= TEST_SOURCES + 1; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], k);
+
+ // Generate xor parity for this number of sources
+ xor_gen_base(j, k, buffs);
+
+ // Inject errors at various source and len positions
+ for (lerr = 0; lerr < k; lerr += 10) {
+ for (serr = 0; serr < j; serr++) {
+
+ // See if it still passes
+ ret = xor_check(j, k, buffs);
+ if (ret != 0) { // Should pass
+ printf("\nfail rand test %d sources\n", j);
+ fail++;
+ return 1;
+ }
+
+ c = ((char *)buffs[serr])[lerr];
+ ((char *)buffs[serr])[lerr] = c ^ 1; // corrupt buffer
+
+ ret = xor_check(j, k, buffs);
+ if (ret == 0) { // Should fail
+ printf("\nfail rand test corrupted buffer "
+ "%d sources, len=%d, ret=%d\n", j, k,
+ ret);
+ fail++;
+ return 1;
+ }
+ ((char *)buffs[serr])[lerr] = c; // un-corrupt buffer
+ }
+ }
+ }
+ putchar('.');
+ fflush(0);
+ k += 1;
+ }
+
+ // Test at the end of buffer
+ for (i = 0; i < TEST_LEN; i += 32) {
+ for (j = 0; j < TEST_SOURCES + 1; j++) {
+ rand_buffer(buffs[j], TEST_LEN - i);
+ tmp_buf[j] = (char *)buffs[j] + i;
+ }
+
+ xor_gen_base(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+
+ // Test good data
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+ if (ret != 0) {
+ printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i);
+ fail++;
+ return 1;
+ }
+ // Test bad data
+ for (serr = 0; serr < TEST_SOURCES + 1; serr++) {
+ for (lerr = 0; lerr < (TEST_LEN - i); lerr++) {
+ c = tmp_buf[serr][lerr];
+ tmp_buf[serr][lerr] = c ^ 1;
+
+ ret =
+ xor_check(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+ if (ret == 0) {
+ printf("fail end test corrupted buffer - "
+ "offset: %d, len: %d, ret: %d\n", i,
+ TEST_LEN - i, ret);
+ fail++;
+ return 1;
+ }
+
+ tmp_buf[serr][lerr] = c;
+ }
+ }
+
+ putchar('.');
+ fflush(0);
+ }
+
+ if (fail == 0)
+ printf("Pass\n");
+
+ return fail;
+
+}
diff --git a/src/isa-l/raid/xor_example.c b/src/isa-l/raid/xor_example.c
new file mode 100644
index 00000000..d328c314
--- /dev/null
+++ b/src/isa-l/raid/xor_example.c
@@ -0,0 +1,70 @@
+/**********************************************************************
+ Copyright(c) 2011-2013 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN 16*1024
+
+int main(int argc, char *argv[])
+{
+ int i, j, should_pass, should_fail;
+ void *buffs[TEST_SOURCES + 1];
+
+ printf("XOR example\n");
+ for (i = 0; i < TEST_SOURCES + 1; i++) {
+ void *buf;
+ if (posix_memalign(&buf, 16, TEST_LEN)) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ printf("Make random data\n");
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ for (j = 0; j < TEST_LEN; j++)
+ ((char *)buffs[i])[j] = rand();
+
+ printf("Generate xor parity\n");
+ xor_gen_sse(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+ printf("Check parity: ");
+ should_pass = xor_check_sse(TEST_SOURCES + 1, TEST_LEN, buffs);
+ printf("%s\n", should_pass == 0 ? "Pass" : "Fail");
+
+ printf("Find corruption: ");
+ ((char *)buffs[TEST_SOURCES / 2])[TEST_LEN / 2] ^= 1; // flip one bit
+ should_fail = xor_check_sse(TEST_SOURCES + 1, TEST_LEN, buffs); //recheck
+ printf("%s\n", should_fail != 0 ? "Pass" : "Fail");
+
+ return 0;
+}
diff --git a/src/isa-l/raid/xor_gen_avx.asm b/src/isa-l/raid/xor_gen_avx.asm
new file mode 100644
index 00000000..536ab3e2
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_avx.asm
@@ -0,0 +1,228 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor of N source vectors using AVX
+;;; int xor_gen_avx(int vects, int len, void **array)
+
+;;; Generates xor parity vector from N (vects-1) sources in array of pointers
+;;; (**array). Last pointer is the dest.
+;;; Vectors must be aligned to 32 bytes. Length can be any value.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define func(x) x:
+ %define return rax
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define func(x) proc_frame x
+ %define return rax
+ %define stack_size 2*32 + 8 ;must be an odd multiple of 8
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqu [rsp + 0*32], ymm6
+ vmovdqu [rsp + 1*32], ymm7
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ vmovdqu ymm6, [rsp + 0*32]
+ vmovdqu ymm7, [rsp + 1*32]
+ add rsp, stack_size
+ %endmacro
+
+%endif ;output formats
+
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define tmp2 rax
+%define tmp2.b al
+%define pos tmp3
+%define PS 8
+
+;;; Use non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovdqa
+ %define XSTR vmovntdq
+%endif
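+;; Both branches above keep loads temporal (vmovdqa); likely intentional,
+;; since movntdqa acts non-temporally only on write-combining memory, so for
+;; ordinary write-back buffers the streaming store is the part that matters.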
+
+
+default rel
+[bits 64]
+
+section .text
+
+align 16
+global xor_gen_avx:function
+func(xor_gen_avx)
+
+ FUNC_SAVE
+ sub vec, 2 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;Check alignment of length
+ jnz len_not_aligned
+
+
+len_aligned_128bytes:
+ sub len, 128
+ mov pos, 0
+
+loop128:
+ mov tmp, vec ;Back to last vector
+ mov tmp2, [arg2+vec*PS] ;Fetch last pointer in array
+ sub tmp, 1 ;Next vect
+ XLDR ymm0, [tmp2+pos] ;Start with end of array in last vector
+ XLDR ymm1, [tmp2+pos+32] ;Keep xor parity in ymm0-3
+ XLDR ymm2, [tmp2+pos+(2*32)]
+ XLDR ymm3, [tmp2+pos+(3*32)]
+
+next_vect:
+ mov ptr, [arg2+tmp*PS]
+ sub tmp, 1
+ XLDR ymm4, [ptr+pos] ;Get next vector (source)
+ XLDR ymm5, [ptr+pos+32]
+ XLDR ymm6, [ptr+pos+(2*32)]
+ XLDR ymm7, [ptr+pos+(3*32)]
+ vxorpd ymm0, ymm0, ymm4 ;Add to xor parity
+ vxorpd ymm1, ymm1, ymm5
+ vxorpd ymm2, ymm2, ymm6
+ vxorpd ymm3, ymm3, ymm7
+ jge next_vect ;Loop for each source
+
+ mov ptr, [arg2+PS+vec*PS] ;Address of parity vector
+ XSTR [ptr+pos], ymm0 ;Write parity xor vector
+ XSTR [ptr+pos+(1*32)], ymm1
+ XSTR [ptr+pos+(2*32)], ymm2
+ XSTR [ptr+pos+(3*32)], ymm3
+ add pos, 128
+ cmp pos, len
+ jle loop128
+
+return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+
+;;; Do one byte at a time for no alignment case
+loop_1byte:
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+vec*PS] ;Fetch last pointer in array
+ mov tmp2.b, [ptr+len-1] ;Get array n
+ sub tmp, 1
+nextvect_1byte:
+ mov ptr, [arg2+tmp*PS]
+ xor tmp2.b, [ptr+len-1]
+ sub tmp, 1
+ jge nextvect_1byte
+
+ mov tmp, vec
+ add tmp, 1 ;Add back to point to last vec
+ mov ptr, [arg2+tmp*PS]
+ mov [ptr+len-1], tmp2.b ;Write parity
+ sub len, 1
+ test len, (PS-1)
+ jnz loop_1byte
+
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;If not 0 and 128-byte aligned
+ jz len_aligned_128bytes ; then do aligned case. len = y * 128
+
+ ;; else we are 8-byte aligned so fall through to recheck
+
+
+ ;; Unaligned length cases
+len_not_aligned:
+ test len, (PS-1)
+ jne loop_1byte
+ mov tmp3, len
+ and tmp3, (128-1) ;Do the unaligned bytes 8 at a time
+
+ ;; Run backwards 8 bytes at a time for (tmp3) bytes
+loop8_bytes:
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+vec*PS] ;Fetch last pointer in array
+ mov tmp2, [ptr+len-PS] ;Get array n
+ sub tmp, 1
+nextvect_8bytes:
+ mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
+ xor tmp2, [ptr+len-PS]
+ sub tmp, 1
+ jge nextvect_8bytes ;Loop for each source
+
+ mov tmp, vec
+ add tmp, 1 ;Add back to point to last vec
+ mov ptr, [arg2+tmp*PS]
+ mov [ptr+len-PS], tmp2 ;Write parity
+ sub len, PS
+ sub tmp3, PS
+ jg loop8_bytes
+
+ cmp len, 128 ;Now len is aligned to 128B
+ jge len_aligned_128bytes ;We can do the rest aligned
+
+ cmp len, 0
+ je return_pass
+
+return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+section .data
+
+;;; func core, ver, snum
+slversion xor_gen_avx, 02, 05, 0037
+
diff --git a/src/isa-l/raid/xor_gen_avx512.asm b/src/isa-l/raid/xor_gen_avx512.asm
new file mode 100644
index 00000000..6892f85c
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_avx512.asm
@@ -0,0 +1,217 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor of N source vectors using AVX512
+;;; int xor_gen_avx512(int vects, int len, void **array)
+
+;;; Generates xor parity vector from N (vects-1) sources in array of pointers
+;;; (**array). Last pointer is the dest.
+;;; Vectors must be aligned to 32 bytes. Length can be any value.
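+;;;
+;;; Minimal caller sketch (illustrative only; N and LEN are placeholders, and
+;;; callers normally go through the xor_gen() dispatcher rather than this
+;;; symbol directly). The last pointer in **array is the parity destination:
+;;;
+;;;	void *buf[N + 1];				/* N sources + parity */
+;;;	for (int i = 0; i <= N; i++)
+;;;		posix_memalign(&buf[i], 64, LEN);
+;;;	/* ... fill buf[0..N-1] with data ... */
+;;;	if (xor_gen_avx512(N + 1, LEN, buf) != 0)	/* 0 == success */
+;;;		/* error: fewer than 2 sources */;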
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define func(x) x:
+ %define return rax
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define func(x) proc_frame x
+ %define return rax
+ %define stack_size 2*16 + 8 ;must be an odd multiple of 8
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm7
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ vmovdqu xmm6, [rsp + 0*16]
+	vmovdqu	xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+%endif ;output formats
+
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define tmp2 rax
+%define tmp2.b al
+%define pos tmp3
+%define PS 8
+
+%define NO_NT_LDST		;force the unaligned, cached vmovdqu8 path below
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+
+default rel
+[bits 64]
+
+section .text
+
+align 16
+global xor_gen_avx512:function
+func(xor_gen_avx512)
+ FUNC_SAVE
+ sub vec, 2 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;Check alignment of length
+ jnz len_not_aligned
+
+len_aligned_128bytes:
+ sub len, 128
+ mov pos, 0
+
+loop128:
+ mov tmp, vec ;Back to last vector
+ mov tmp2, [arg2+vec*PS] ;Fetch last pointer in array
+ sub tmp, 1 ;Next vect
+ XLDR zmm0, [tmp2+pos] ;Start with end of array in last vector
+	XLDR zmm1, [tmp2+pos+64]	;Keep xor parity in zmm0-1
+
+next_vect:
+ mov ptr, [arg2+tmp*PS]
+ sub tmp, 1
+ XLDR zmm4, [ptr+pos] ;Get next vector (source)
+ XLDR zmm5, [ptr+pos+64]
+ vpxorq zmm0, zmm0, zmm4 ;Add to xor parity
+ vpxorq zmm1, zmm1, zmm5
+ jge next_vect ;Loop for each source
+
+ mov ptr, [arg2+PS+vec*PS] ;Address of parity vector
+ XSTR [ptr+pos], zmm0 ;Write parity xor vector
+ XSTR [ptr+pos+64], zmm1
+ add pos, 128
+ cmp pos, len
+ jle loop128
+
+return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+
+;;; Do one byte at a time for no alignment case
+loop_1byte:
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+vec*PS] ;Fetch last pointer in array
+ mov tmp2.b, [ptr+len-1] ;Get array n
+ sub tmp, 1
+nextvect_1byte:
+ mov ptr, [arg2+tmp*PS]
+ xor tmp2.b, [ptr+len-1]
+ sub tmp, 1
+ jge nextvect_1byte
+
+ mov tmp, vec
+ add tmp, 1 ;Add back to point to last vec
+ mov ptr, [arg2+tmp*PS]
+ mov [ptr+len-1], tmp2.b ;Write parity
+ sub len, 1
+ test len, (PS-1)
+ jnz loop_1byte
+
+ cmp len, 0
+ je return_pass
+	test len, (128-1)	;If not 0 and 128-byte aligned
+ jz len_aligned_128bytes ; then do aligned case. len = y * 128
+
+ ;; else we are 8-byte aligned so fall through to recheck
+
+
+ ;; Unaligned length cases
+len_not_aligned:
+ test len, (PS-1)
+ jne loop_1byte
+ mov tmp3, len
+ and tmp3, (128-1) ;Do the unaligned bytes 8 at a time
+
+ ;; Run backwards 8 bytes at a time for (tmp3) bytes
+loop8_bytes:
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+vec*PS] ;Fetch last pointer in array
+ mov tmp2, [ptr+len-PS] ;Get array n
+ sub tmp, 1
+nextvect_8bytes:
+ mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
+ xor tmp2, [ptr+len-PS]
+ sub tmp, 1
+ jge nextvect_8bytes ;Loop for each source
+
+ mov tmp, vec
+ add tmp, 1 ;Add back to point to last vec
+ mov ptr, [arg2+tmp*PS]
+ mov [ptr+len-PS], tmp2 ;Write parity
+ sub len, PS
+ sub tmp3, PS
+ jg loop8_bytes
+
+ cmp len, 128 ;Now len is aligned to 128B
+ jge len_aligned_128bytes ;We can do the rest aligned
+
+ cmp len, 0
+ je return_pass
+
+return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+%endif ; ifdef HAVE_AS_KNOWS_AVX512
diff --git a/src/isa-l/raid/xor_gen_perf.c b/src/isa-l/raid/xor_gen_perf.c
new file mode 100644
index 00000000..25b33cb6
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_perf.c
@@ -0,0 +1,98 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include<sys/time.h>
+#include "raid.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test: loop many times over the same small buffers
+# define TEST_SOURCES 10
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 2000000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define TEST_SOURCES 10
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN GT_L3_CACHE / TEST_SOURCES
+# define TEST_LOOPS 1000
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))
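+// TEST_MEM counts the bytes one xor_gen() call touches (all sources plus the
+// parity destination); perf_print() below uses it to report throughput.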
+
+int main(int argc, char *argv[])
+{
+ int i, ret, fail = 0;
+ void **buffs;
+ void *buff;
+ struct perf start, stop;
+
+ printf("Test xor_gen_perf\n");
+
+ ret = posix_memalign((void **)&buff, 8, sizeof(int *) * (TEST_SOURCES + 6));
+ if (ret) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs = buff;
+
+ // Allocate the arrays
+ for (i = 0; i < TEST_SOURCES + 1; i++) {
+ void *buf;
+ ret = posix_memalign(&buf, 64, TEST_LEN);
+ if (ret) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Setup data
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ // Warm up
+ xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+ perf_stop(&stop);
+ printf("xor_gen" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ return fail;
+}
diff --git a/src/isa-l/raid/xor_gen_sse.asm b/src/isa-l/raid/xor_gen_sse.asm
new file mode 100644
index 00000000..2fd6faeb
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_sse.asm
@@ -0,0 +1,284 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor of N source vectors using SSE
+;;; int xor_gen_sse(int vects, int len, void **array)
+
+;;; Generates xor parity vector from N (vects-1) sources in array of pointers
+;;; (**array). Last pointer is the dest.
+;;; Vectors must be aligned to 16 bytes. Length can be any value.
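+;;;
+;;; Byte-wise reference for the computed result (a sketch; the routine itself
+;;; works in 128-byte vector strides):
+;;;
+;;;	unsigned char **a = (unsigned char **)array, p;
+;;;	for (j = 0; j < len; j++) {
+;;;		p = 0;
+;;;		for (i = 0; i < vects - 1; i++)
+;;;			p ^= a[i][j];		/* xor all sources */
+;;;		a[vects - 1][j] = p;		/* last pointer = parity dest */
+;;;	}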
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 rax
+ %define tmp2.b al
+ %define tmp3 arg4
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define return rax
+ %define tmp2 rax
+ %define tmp2.b al
+ %define PS 8
+ %define tmp r11
+ %define tmp3 r10
+ %define stack_size 2*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0 arg(0)
+ %define arg1 ecx
+ %define tmp2 eax
+ %define tmp2.b al
+ %define tmp3 edx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+	%define arg2  edi	; must save/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ mov arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp ;if has frame pointer
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos tmp3
+
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+%endif
+
+;;; Use Non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+align 16
+global xor_gen_sse:function
+func(xor_gen_sse)
+ FUNC_SAVE
+%ifidn PS,8 ;64-bit code
+ sub vec, 2 ; Keep as offset to last source
+%else ;32-bit code
+ mov tmp, arg(0) ; Update vec length arg to last source
+ sub tmp, 2
+ mov arg(0), tmp
+%endif
+
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;Check alignment of length
+ jnz len_not_aligned
+
+
+len_aligned_128bytes:
+ sub len, 128
+ mov pos, 0
+ mov tmp, vec ;Preset to last vector
+
+loop128:
+ mov tmp2, [arg2+tmp*PS] ;Fetch last pointer in array
+ sub tmp, 1 ;Next vect
+ XLDR xmm0, [tmp2+pos] ;Start with end of array in last vector
+ XLDR xmm1, [tmp2+pos+16] ;Keep xor parity in xmm0-7
+ XLDR xmm2, [tmp2+pos+(2*16)]
+ XLDR xmm3, [tmp2+pos+(3*16)]
+ XLDR xmm4, [tmp2+pos+(4*16)]
+ XLDR xmm5, [tmp2+pos+(5*16)]
+ XLDR xmm6, [tmp2+pos+(6*16)]
+ XLDR xmm7, [tmp2+pos+(7*16)]
+
+next_vect:
+ mov ptr, [arg2+tmp*PS]
+ sub tmp, 1
+ xorpd xmm0, [ptr+pos] ;Get next vector (source)
+ xorpd xmm1, [ptr+pos+16]
+ xorpd xmm2, [ptr+pos+(2*16)]
+ xorpd xmm3, [ptr+pos+(3*16)]
+ xorpd xmm4, [ptr+pos+(4*16)]
+ xorpd xmm5, [ptr+pos+(5*16)]
+ xorpd xmm6, [ptr+pos+(6*16)]
+ xorpd xmm7, [ptr+pos+(7*16)]
+;;; prefetch [ptr+pos+(8*16)]
+ jge next_vect ;Loop for each vect
+
+
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+PS+tmp*PS] ;Address of parity vector
+ XSTR [ptr+pos], xmm0 ;Write parity xor vector
+ XSTR [ptr+pos+(1*16)], xmm1
+ XSTR [ptr+pos+(2*16)], xmm2
+ XSTR [ptr+pos+(3*16)], xmm3
+ XSTR [ptr+pos+(4*16)], xmm4
+ XSTR [ptr+pos+(5*16)], xmm5
+ XSTR [ptr+pos+(6*16)], xmm6
+ XSTR [ptr+pos+(7*16)], xmm7
+ add pos, 128
+ cmp pos, len
+ jle loop128
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+
+
+;;; Do one byte at a time for no alignment case
+
+xor_gen_byte:
+ mov tmp, vec ;Preset to last vector
+
+loop_1byte:
+ mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
+ mov tmp2.b, [ptr+len-1] ;Get array n
+ sub tmp, 1
+nextvect_1byte:
+ mov ptr, [arg2+tmp*PS]
+ xor tmp2.b, [ptr+len-1]
+ sub tmp, 1
+ jge nextvect_1byte
+
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+PS+tmp*PS] ;Get last vec
+ mov [ptr+len-1], tmp2.b ;Write parity
+ sub len, 1
+ test len, (8-1)
+ jnz loop_1byte
+
+ cmp len, 0
+ je return_pass
+	test len, (128-1)	;If not 0 and 128-byte aligned
+ jz len_aligned_128bytes ; then do aligned case. len = y * 128
+
+ ;; else we are 8-byte aligned so fall through to recheck
+
+
+ ;; Unaligned length cases
+len_not_aligned:
+ test len, (PS-1)
+ jne xor_gen_byte
+ mov tmp3, len
+ and tmp3, (128-1) ;Do the unaligned bytes 4-8 at a time
+ mov tmp, vec ;Preset to last vector
+
+ ;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes
+loopN_bytes:
+ mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
+ mov tmp2, [ptr+len-PS] ;Get array n
+ sub tmp, 1
+nextvect_Nbytes:
+ mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
+ xor tmp2, [ptr+len-PS]
+ sub tmp, 1
+ jge nextvect_Nbytes ;Loop for each source
+
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+PS+tmp*PS] ;Get last vec
+ mov [ptr+len-PS], tmp2 ;Write parity
+ sub len, PS
+ sub tmp3, PS
+ jg loopN_bytes
+
+ cmp len, 128 ;Now len is aligned to 128B
+ jge len_aligned_128bytes ;We can do the rest aligned
+
+ cmp len, 0
+ je return_pass
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+;;; func core, ver, snum
+slversion xor_gen_sse, 00, 0c, 0030
+
diff --git a/src/isa-l/raid/xor_gen_test.c b/src/isa-l/raid/xor_gen_test.c
new file mode 100644
index 00000000..f158f94c
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_test.c
@@ -0,0 +1,165 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN 1024
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(int argc, char *argv[])
+{
+ int i, j, k, ret, fail = 0;
+ void *buffs[TEST_SOURCES + 1];
+ char *tmp_buf[TEST_SOURCES + 1];
+
+ printf("Test xor_gen_test ");
+
+ srand(TEST_SEED);
+
+ // Allocate the arrays
+ for (i = 0; i < TEST_SOURCES + 1; i++) {
+ void *buf;
+ ret = posix_memalign(&buf, 32, TEST_LEN);
+ if (ret) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Test of all zeros
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+ for (i = 0; i < TEST_LEN; i++) {
+ if (((char *)buffs[TEST_SOURCES])[i] != 0)
+ fail++;
+ }
+
+ if (fail > 0) {
+ printf("fail zero test");
+ return 1;
+ } else
+ putchar('.');
+
+ // Test rand1
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+ fail |= xor_check_base(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+ if (fail > 0) {
+ printf("fail rand test %d\n", fail);
+ return 1;
+ } else
+ putchar('.');
+
+ // Test various number of sources
+ for (j = 3; j <= TEST_SOURCES + 1; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ xor_gen(j, TEST_LEN, buffs);
+ fail |= xor_check_base(j, TEST_LEN, buffs);
+
+ if (fail > 0) {
+ printf("fail rand test %d sources\n", j);
+ return 1;
+ } else
+ putchar('.');
+ }
+
+ fflush(0);
+
+ // Test various number of sources and len
+ k = 0;
+ while (k <= TEST_LEN) {
+ for (j = 3; j <= TEST_SOURCES + 1; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], k);
+
+ xor_gen(j, k, buffs);
+ fail |= xor_check_base(j, k, buffs);
+
+ if (fail > 0) {
+ printf("fail rand test %d sources, len=%d, ret=%d\n", j, k,
+ fail);
+ return 1;
+ }
+ }
+ putchar('.');
+ k += 1;
+ }
+
+ // Test at the end of buffer
+ for (i = 0; i < TEST_LEN; i += 32) {
+ for (j = 0; j < TEST_SOURCES + 1; j++) {
+ rand_buffer((unsigned char *)buffs[j] + i, TEST_LEN - i);
+ tmp_buf[j] = (char *)buffs[j] + i;
+ }
+
+ xor_gen(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+ fail |= xor_check_base(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+
+ if (fail > 0) {
+ printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i);
+ return 1;
+ }
+
+ putchar('.');
+ fflush(0);
+ }
+
+ if (!fail)
+ printf(" done: Pass\n");
+
+ return fail;
+}