summary refs log tree commit diff stats
path: root/src/isa-l/raid
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
commit e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree 64f88b554b444a49f656b6c656111a145cbbaa28 /src/isa-l/raid
parent Initial commit. (diff)
downloadceph-upstream/18.2.2.tar.xz
ceph-upstream/18.2.2.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/isa-l/raid')
-rw-r--r--src/isa-l/raid/Makefile.am67
-rw-r--r--src/isa-l/raid/aarch64/Makefile.am36
-rw-r--r--src/isa-l/raid/aarch64/pq_check_neon.S341
-rw-r--r--src/isa-l/raid/aarch64/pq_gen_neon.S282
-rw-r--r--src/isa-l/raid/aarch64/raid_aarch64_dispatcher.c61
-rw-r--r--src/isa-l/raid/aarch64/raid_multibinary_arm.S36
-rw-r--r--src/isa-l/raid/aarch64/xor_check_neon.S271
-rw-r--r--src/isa-l/raid/aarch64/xor_gen_neon.S264
-rw-r--r--src/isa-l/raid/pq_check_sse.asm277
-rw-r--r--src/isa-l/raid/pq_check_sse_i32.asm282
-rw-r--r--src/isa-l/raid/pq_check_test.c304
-rw-r--r--src/isa-l/raid/pq_gen_avx.asm254
-rw-r--r--src/isa-l/raid/pq_gen_avx2.asm256
-rw-r--r--src/isa-l/raid/pq_gen_avx512.asm235
-rw-r--r--src/isa-l/raid/pq_gen_perf.c88
-rw-r--r--src/isa-l/raid/pq_gen_sse.asm258
-rw-r--r--src/isa-l/raid/pq_gen_sse_i32.asm264
-rw-r--r--src/isa-l/raid/pq_gen_test.c194
-rw-r--r--src/isa-l/raid/raid_base.c147
-rw-r--r--src/isa-l/raid/raid_base_aliases.c50
-rw-r--r--src/isa-l/raid/raid_multibinary.asm143
-rw-r--r--src/isa-l/raid/raid_multibinary_i32.asm52
-rw-r--r--src/isa-l/raid/xor_check_sse.asm285
-rw-r--r--src/isa-l/raid/xor_check_test.c280
-rw-r--r--src/isa-l/raid/xor_example.c70
-rw-r--r--src/isa-l/raid/xor_gen_avx.asm228
-rw-r--r--src/isa-l/raid/xor_gen_avx512.asm217
-rw-r--r--src/isa-l/raid/xor_gen_perf.c90
-rw-r--r--src/isa-l/raid/xor_gen_sse.asm284
-rw-r--r--src/isa-l/raid/xor_gen_test.c165
30 files changed, 5781 insertions, 0 deletions
diff --git a/src/isa-l/raid/Makefile.am b/src/isa-l/raid/Makefile.am
new file mode 100644
index 000000000..5f98668d5
--- /dev/null
+++ b/src/isa-l/raid/Makefile.am
@@ -0,0 +1,67 @@
+########################################################################
+# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+include raid/aarch64/Makefile.am
+# Source lists: lsrc is unsuffixed; the lsrc_<arch> lists are keyed by architecture name.
+lsrc += raid/raid_base.c
+
+lsrc_base_aliases += raid/raid_base_aliases.c
+lsrc_ppc64le += raid/raid_base_aliases.c
+# x86 lists: the 32-bit list uses the *_i32.asm pq/multibinary files and has no AVX sources.
+lsrc_x86_64 += \
+ raid/xor_gen_sse.asm \
+ raid/pq_gen_sse.asm \
+ raid/xor_check_sse.asm \
+ raid/pq_check_sse.asm \
+ raid/pq_gen_avx.asm \
+ raid/xor_gen_avx.asm \
+ raid/pq_gen_avx2.asm \
+ raid/xor_gen_avx512.asm \
+ raid/pq_gen_avx512.asm \
+ raid/raid_multibinary.asm
+
+lsrc_x86_32 += \
+ raid/xor_gen_sse.asm \
+ raid/pq_gen_sse_i32.asm \
+ raid/xor_check_sse.asm \
+ raid/pq_check_sse_i32.asm \
+ raid/raid_multibinary_i32.asm
+
+# Header, other sources, and the test/perf/example programs for this directory.
+extern_hdrs += include/raid.h
+
+other_src += include/test.h include/types.h
+
+check_tests += raid/xor_gen_test raid/pq_gen_test raid/xor_check_test raid/pq_check_test
+
+perf_tests += raid/xor_gen_perf raid/pq_gen_perf
+
+examples += raid/xor_example
+# NOTE(review): lsrc32 paths omit the raid/ prefix used above -- presumably read by a separate build; confirm.
+lsrc32 += xor_gen_sse.asm pq_gen_sse_i32.asm xor_check_sse.asm pq_check_sse_i32.asm raid_base.c
diff --git a/src/isa-l/raid/aarch64/Makefile.am b/src/isa-l/raid/aarch64/Makefile.am
new file mode 100644
index 000000000..d08c8d67a
--- /dev/null
+++ b/src/isa-l/raid/aarch64/Makefile.am
@@ -0,0 +1,36 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+# aarch64 list: the four NEON kernels, the multibinary interface stubs, and the C dispatcher.
+lsrc_aarch64 += \
+ raid/aarch64/xor_gen_neon.S \
+ raid/aarch64/pq_gen_neon.S \
+ raid/aarch64/xor_check_neon.S \
+ raid/aarch64/pq_check_neon.S \
+ raid/aarch64/raid_multibinary_arm.S \
+ raid/aarch64/raid_aarch64_dispatcher.c
diff --git a/src/isa-l/raid/aarch64/pq_check_neon.S b/src/isa-l/raid/aarch64/pq_check_neon.S
new file mode 100644
index 000000000..55ad79829
--- /dev/null
+++ b/src/isa-l/raid/aarch64/pq_check_neon.S
@@ -0,0 +1,341 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global pq_check_neon
+.type pq_check_neon, %function
+
+/* int pq_check_neon(int vects, int len, void **src): returns 0 if recomputed P and Q equal the stored ones, 1 otherwise */
+
+/* arguments */
+w_vects .req w0 /* MUST be >= 3; the prologue subtracts 3 from it */
+x_vects .req x0
+w_len .req w1 /* MUST be a multiple of 16 bytes */
+x_len .req x1
+x_src .req x2 /* pointer array; the prologue re-points it at &src[vects-3] */
+
+/* returns */
+w_ret .req w0 /* 0 = match, 1 = mismatch */
+
+/* local variables */
+x_dst_p .req x3 /* stored P buffer, loaded from src[vects-2] */
+x_dst_q .req x4 /* stored Q buffer, loaded from src[vects-1] */
+x_dst_q_end .req x5 /* dst_q + len, biased by the chunk size while a loop runs */
+w_col .req w6 /* byte offset of the current chunk in the other sources */
+x_col .req x6
+x_src_ptr .req x7 /* walks the pointer array downwards from &src[vects-4] */
+x_src_ptr_end .req x9 /* &src[-1]: stop value for x_src_ptr */
+x_src_last .req x10 /* read cursor inside the src[vects-3] buffer */
+x_srcn .req x11 /* buffer address fetched through x_src_ptr */
+w_min .req w12 /* minimum byte of the compare mask; 0 means a mismatch */
+/* vectors */
+/* v0 ~ v7 : temporary p */
+/* v8 ~ v15: temporary q */
+/* v16 ~ v23: next 128 bytes of input, later the stored p/q to compare against */
+v_mask0 .req v24 /* v_mask0-3: per-lane reduction masks */
+v_mask1 .req v25
+v_mask2 .req v26
+v_mask3 .req v27
+v_gf8poly .req v28 /* 0x1D in every byte lane */
+v_0x80 .req v29 /* 0x80 in every byte lane */
+
+/*
+ * src_ptr_end -->
+ * -------+----------+
+ * . | src[0] |
+ * . +----------+ +------------------+
+ * src_ptr --> | src[1] | - srcn -> | buffer |
+ * . +----------+ +------------------+
+ * . | ...... |
+ * . +----------+
+ * . | src[v-4] |
+ * -------+----------+ src_last +------------------+
+ * src --> | src[v-3] | ---------> | buffer |
+ * +----------+ +------------------+
+ * | src[v-2] | - dst_p -> | buffer |
+ * +----------+ +------------------+
+ * | src[v-1] | - dst_q -> | buffer | dst_q_end
+ * +----------+ +------------------+
+ */
+
+pq_check_neon:
+ sub x_src_ptr_end, x_src, #8 /* &src[-1]: stop value for the downward pointer walk */
+
+ sub w_vects, w_vects, #3 /* 32-bit write also clears the upper half of x_vects */
+ add x_src, x_src, x_vects, lsl #3 /* x_src = &src[vects-3] */
+
+ ldr x_src_last, [x_src] /* buffer of src[vects-3] */
+ ldp x_dst_p, x_dst_q, [x_src, #8] /* stored P = src[vects-2], stored Q = src[vects-1] */
+
+ add x_dst_q_end, x_dst_q, w_len, uxtw /* len is a 32-bit int: AAPCS64 leaves x1[63:32] unspecified, so zero-extend */
+
+ mov w_min, #-1 /* non-zero = no mismatch seen yet */
+ mov w_col, #0
+ movi v_gf8poly.16b, #0x1D
+ movi v_0x80.16b, #0x80
+
+.Lloop128_init:
+ /* less than 128 bytes? then only the 16-byte loop runs */
+ cmp w_len, #128
+ blo .Lloop16_init
+
+ /* save d8 ~ d15 to stack (v8 ~ v15 are used for q below) */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_dst_q_end, x_dst_q_end, #128 /* bias: loop while dst_q <= end - 128 */
+
+ /* batch process (vects-2)*128 bytes */
+ /* v0~v7: p; v8~v15: q; v16~v23: in */
+.Lloop128:
+ ldr q0, [x_src_last, #16*0]
+ ldr q1, [x_src_last, #16*1]
+ ldr q2, [x_src_last, #16*2]
+ ldr q3, [x_src_last, #16*3]
+ ldr q4, [x_src_last, #16*4]
+ ldr q5, [x_src_last, #16*5]
+ ldr q6, [x_src_last, #16*6]
+ ldr q7, [x_src_last, #16*7]
+ add x_src_last, x_src_last, #128
+
+ mov v8.16b, v0.16b /* p and q both start as the src[vects-3] data */
+ mov v9.16b, v1.16b
+ mov v10.16b, v2.16b
+ mov v11.16b, v3.16b
+ mov v12.16b, v4.16b
+ mov v13.16b, v5.16b
+ mov v14.16b, v6.16b
+ mov v15.16b, v7.16b
+
+ cbz w_vects, .Lloop128_vects_end /* vects == 3: no further sources to fold in */
+
+ sub x_src_ptr, x_src, #8 /* start at &src[vects-4] */
+.Lloop128_vects:
+ ldr x_srcn, [x_src_ptr], #-8 /* fetch buffer pointer, then step towards src[0] */
+ add x_srcn, x_srcn, x_col
+ cmp x_src_ptr, x_src_ptr_end /* flags are consumed by the bne at the loop bottom */
+
+ ldr q16, [x_srcn, #16*0]
+ ldr q17, [x_srcn, #16*1]
+ ldr q18, [x_srcn, #16*2]
+ ldr q19, [x_srcn, #16*3]
+ ldr q20, [x_srcn, #16*4]
+ ldr q21, [x_srcn, #16*5]
+ ldr q22, [x_srcn, #16*6]
+ ldr q23, [x_srcn, #16*7]
+
+ eor v0.16b, v0.16b, v16.16b /* p ^= in */
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v6.16b, v6.16b, v22.16b
+ eor v7.16b, v7.16b, v23.16b
+
+ cmhs v_mask0.16b, v8.16b, v_0x80.16b /* all-ones in lanes whose top bit is set */
+ cmhs v_mask1.16b, v9.16b, v_0x80.16b
+ cmhs v_mask2.16b, v10.16b, v_0x80.16b
+ cmhs v_mask3.16b, v11.16b, v_0x80.16b
+ and v_mask0.16b, v_mask0.16b, v_gf8poly.16b /* -> 0x1D in those lanes, 0 elsewhere */
+ and v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+ and v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+ and v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+ shl v8.16b, v8.16b, #1 /* q <<= 1 per byte */
+ shl v9.16b, v9.16b, #1
+ shl v10.16b, v10.16b, #1
+ shl v11.16b, v11.16b, #1
+ eor v8.16b, v8.16b, v_mask0.16b /* fold in 0x1D where the top bit was shifted out */
+ eor v9.16b, v9.16b, v_mask1.16b
+ eor v10.16b, v10.16b, v_mask2.16b
+ eor v11.16b, v11.16b, v_mask3.16b
+ eor v8.16b, v8.16b, v16.16b /* q ^= in */
+ eor v9.16b, v9.16b, v17.16b
+ eor v10.16b, v10.16b, v18.16b
+ eor v11.16b, v11.16b, v19.16b
+
+ cmhs v_mask0.16b, v12.16b, v_0x80.16b /* same q update for the upper 64 bytes */
+ cmhs v_mask1.16b, v13.16b, v_0x80.16b
+ cmhs v_mask2.16b, v14.16b, v_0x80.16b
+ cmhs v_mask3.16b, v15.16b, v_0x80.16b
+ and v_mask0.16b, v_mask0.16b, v_gf8poly.16b
+ and v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+ and v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+ and v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+ shl v12.16b, v12.16b, #1
+ shl v13.16b, v13.16b, #1
+ shl v14.16b, v14.16b, #1
+ shl v15.16b, v15.16b, #1
+ eor v12.16b, v12.16b, v_mask0.16b
+ eor v13.16b, v13.16b, v_mask1.16b
+ eor v14.16b, v14.16b, v_mask2.16b
+ eor v15.16b, v15.16b, v_mask3.16b
+ eor v12.16b, v12.16b, v20.16b
+ eor v13.16b, v13.16b, v21.16b
+ eor v14.16b, v14.16b, v22.16b
+ eor v15.16b, v15.16b, v23.16b
+
+ bne .Lloop128_vects /* uses the cmp above; the vector ops in between leave NZCV alone */
+
+.Lloop128_vects_end:
+ /* v16~v23: true p, q */
+ ldr q16, [x_dst_p, #16*0]
+ ldr q17, [x_dst_p, #16*1]
+ ldr q18, [x_dst_p, #16*2]
+ ldr q19, [x_dst_p, #16*3]
+ ldr q20, [x_dst_p, #16*4]
+ ldr q21, [x_dst_p, #16*5]
+ ldr q22, [x_dst_p, #16*6]
+ ldr q23, [x_dst_p, #16*7]
+
+ cmeq v0.16b, v0.16b, v16.16b /* 0xFF per equal byte, 0x00 per differing byte */
+ cmeq v1.16b, v1.16b, v17.16b
+ cmeq v2.16b, v2.16b, v18.16b
+ cmeq v3.16b, v3.16b, v19.16b
+ cmeq v4.16b, v4.16b, v20.16b
+ cmeq v5.16b, v5.16b, v21.16b
+ cmeq v6.16b, v6.16b, v22.16b
+ cmeq v7.16b, v7.16b, v23.16b
+
+ ldr q16, [x_dst_q, #16*0]
+ ldr q17, [x_dst_q, #16*1]
+ ldr q18, [x_dst_q, #16*2]
+ ldr q19, [x_dst_q, #16*3]
+ ldr q20, [x_dst_q, #16*4]
+ ldr q21, [x_dst_q, #16*5]
+ ldr q22, [x_dst_q, #16*6]
+ ldr q23, [x_dst_q, #16*7]
+
+ and v0.16b, v0.16b, v1.16b /* AND-reduce the eight p compare masks into v0 */
+ and v2.16b, v2.16b, v3.16b
+ and v4.16b, v4.16b, v5.16b
+ and v6.16b, v6.16b, v7.16b
+ and v0.16b, v0.16b, v2.16b
+ and v4.16b, v4.16b, v6.16b
+ and v0.16b, v0.16b, v4.16b
+
+ cmeq v8.16b, v8.16b, v16.16b
+ cmeq v9.16b, v9.16b, v17.16b
+ cmeq v10.16b, v10.16b, v18.16b
+ cmeq v11.16b, v11.16b, v19.16b
+ cmeq v12.16b, v12.16b, v20.16b
+ cmeq v13.16b, v13.16b, v21.16b
+ cmeq v14.16b, v14.16b, v22.16b
+ cmeq v15.16b, v15.16b, v23.16b
+
+ and v8.16b, v8.16b, v9.16b /* AND-reduce the eight q compare masks into v8 */
+ and v10.16b, v10.16b, v11.16b
+ and v12.16b, v12.16b, v13.16b
+ and v14.16b, v14.16b, v15.16b
+ and v8.16b, v8.16b, v10.16b
+ and v12.16b, v12.16b, v14.16b
+ and v8.16b, v8.16b, v12.16b
+
+ and v0.16b, v0.16b, v8.16b
+
+ uminv b0, v0.16b /* 0 if any byte lane failed a compare */
+ umov w_min, v0.b[0]
+ cbz w_min, .Lloop128_end /* mismatch: leave through the d8 ~ d15 restore */
+
+ add x_dst_p, x_dst_p, #128
+ add x_dst_q, x_dst_q, #128
+ cmp x_dst_q, x_dst_q_end
+ add w_col, w_col, #128 /* plain add: keeps the flags from cmp for bls */
+ bls .Lloop128
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ cbz w_min, .Lerror
+
+ add x_dst_q_end, x_dst_q_end, #128 /* undo the bias applied before the loop */
+
+.Lloop16_init:
+ tst w_len, #0x7F /* len a multiple of 128: nothing left for this loop */
+ beq .Lloop16_end
+ sub x_dst_q_end, x_dst_q_end, #16 /* bias: loop while dst_q <= end - 16 */
+
+ /* batch process (vects-2)*16 bytes */
+ /* v0: p; v1: q; v2: in; v3: mask */
+.Lloop16:
+ ldr q0, [x_src_last], #16
+ mov v1.16b, v0.16b /* p and q both start as the src[vects-3] data */
+
+ cbz w_vects, .Lloop16_vects_end /* vects == 3: no further sources to fold in */
+
+ sub x_src_ptr, x_src, #8 /* start at &src[vects-4] */
+.Lloop16_vects:
+ ldr x_srcn, [x_src_ptr], #-8 /* fetch buffer pointer, then step towards src[0] */
+ ldr q2, [x_srcn, x_col]
+ cmp x_src_ptr, x_src_ptr_end /* flags are consumed by the bne below */
+
+ eor v0.16b, v0.16b, v2.16b /* p ^= in */
+
+ cmhs v3.16b, v1.16b, v_0x80.16b /* all-ones in lanes whose top bit is set */
+ and v3.16b, v3.16b, v_gf8poly.16b /* -> 0x1D in those lanes */
+
+ shl v1.16b, v1.16b, #1
+ eor v1.16b, v1.16b, v2.16b /* q = (q << 1) ^ in ^ mask */
+ eor v1.16b, v1.16b, v3.16b
+
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ /* v4: true p; v5: true q */
+ ldr q4, [x_dst_p], #16
+ ldr q5, [x_dst_q], #16
+ cmp x_dst_q, x_dst_q_end /* flags are consumed by the bls at the loop bottom */
+
+ cmeq v0.16b, v0.16b, v4.16b
+ cmeq v1.16b, v1.16b, v5.16b
+ and v0.16b, v0.16b, v1.16b
+
+ uminv b0, v0.16b /* 0 if any byte of p or q differs */
+ umov w_min, v0.b[0]
+ cbz w_min, .Lerror
+
+ add w_col, w_col, #16 /* plain add: keeps the flags from cmp for bls */
+ bls .Lloop16
+
+.Lloop16_end:
+ mov w_ret, #0 /* every compared byte matched */
+ ret
+
+.Lerror:
+ mov w_ret, #1 /* a p or q byte differed */
+ ret
diff --git a/src/isa-l/raid/aarch64/pq_gen_neon.S b/src/isa-l/raid/aarch64/pq_gen_neon.S
new file mode 100644
index 000000000..f60ad1211
--- /dev/null
+++ b/src/isa-l/raid/aarch64/pq_gen_neon.S
@@ -0,0 +1,282 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global pq_gen_neon
+.type pq_gen_neon, %function
+
+/* int pq_gen_neon(int vects, int len, void **src): writes P to src[vects-2] and Q to src[vects-1]; returns 0 */
+
+/* arguments */
+w_vects .req w0 /* MUST be >= 3; the prologue subtracts 3 from it */
+x_vects .req x0
+w_len .req w1 /* MUST be a multiple of 16 bytes */
+x_len .req x1
+x_src .req x2 /* pointer array; the prologue re-points it at &src[vects-3] */
+
+/* returns */
+w_ret .req w0 /* set to 0 on the only return path */
+
+/* local variables */
+x_dst_p .req x3 /* P output buffer, loaded from src[vects-2] */
+x_dst_q .req x4 /* Q output buffer, loaded from src[vects-1] */
+x_dst_q_end .req x5 /* dst_q + len, biased by the chunk size while a loop runs */
+w_col .req w6 /* byte offset of the current chunk in the other sources */
+x_col .req x6
+x_src_ptr .req x7 /* walks the pointer array downwards from &src[vects-4] */
+x_src_ptr_end .req x9 /* &src[-1]: stop value for x_src_ptr */
+x_src_last .req x10 /* read cursor inside the src[vects-3] buffer */
+x_srcn .req x11 /* buffer address fetched through x_src_ptr */
+/* vectors */
+/* v0 ~ v7 : temporary p */
+/* v8 ~ v15: temporary q */
+/* v16 ~ v23: next 128 bytes of input */
+v_mask0 .req v24 /* v_mask0-3: per-lane reduction masks */
+v_mask1 .req v25
+v_mask2 .req v26
+v_mask3 .req v27
+v_gf8poly .req v28 /* 0x1D in every byte lane */
+v_0x80 .req v29 /* 0x80 in every byte lane */
+
+/*
+ * src_ptr_end -->
+ * -------+----------+
+ * . | src[0] |
+ * . +----------+ +------------------+
+ * src_ptr --> | src[1] | - srcn -> | buffer |
+ * . +----------+ +------------------+
+ * . | ...... |
+ * . +----------+
+ * . | src[v-4] |
+ * -------+----------+ src_last +------------------+
+ * src --> | src[v-3] | ---------> | buffer |
+ * +----------+ +------------------+
+ * | src[v-2] | - dst_p -> | buffer |
+ * +----------+ +------------------+
+ * | src[v-1] | - dst_q -> | buffer | dst_q_end
+ * +----------+ +------------------+
+ */
+
+pq_gen_neon:
+ sub x_src_ptr_end, x_src, #8 /* &src[-1]: stop value for the downward pointer walk */
+
+ sub w_vects, w_vects, #3 /* 32-bit write also clears the upper half of x_vects */
+ add x_src, x_src, x_vects, lsl #3 /* x_src = &src[vects-3] */
+
+ ldr x_src_last, [x_src] /* buffer of src[vects-3] */
+ ldp x_dst_p, x_dst_q, [x_src, #8] /* P output = src[vects-2], Q output = src[vects-1] */
+
+ add x_dst_q_end, x_dst_q, w_len, uxtw /* len is a 32-bit int: AAPCS64 leaves x1[63:32] unspecified, so zero-extend */
+
+ mov w_col, #0
+ movi v_gf8poly.16b, #0x1D
+ movi v_0x80.16b, #0x80
+
+.Lloop128_init:
+ /* less than 128 bytes? then only the 16-byte loop runs */
+ cmp w_len, #128
+ blo .Lloop16_init
+
+ /* save d8 ~ d15 to stack (v8 ~ v15 are used for q below) */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_dst_q_end, x_dst_q_end, #128 /* bias: loop while dst_q <= end - 128 */
+
+ /* batch process (vects-2)*128 bytes */
+ /* v0~v7: p; v8~v15: q; v16~v23: in */
+.Lloop128:
+ ldr q0, [x_src_last, #16*0]
+ ldr q1, [x_src_last, #16*1]
+ ldr q2, [x_src_last, #16*2]
+ ldr q3, [x_src_last, #16*3]
+ ldr q4, [x_src_last, #16*4]
+ ldr q5, [x_src_last, #16*5]
+ ldr q6, [x_src_last, #16*6]
+ ldr q7, [x_src_last, #16*7]
+ add x_src_last, x_src_last, #128
+
+ mov v8.16b, v0.16b /* p and q both start as the src[vects-3] data */
+ mov v9.16b, v1.16b
+ mov v10.16b, v2.16b
+ mov v11.16b, v3.16b
+ mov v12.16b, v4.16b
+ mov v13.16b, v5.16b
+ mov v14.16b, v6.16b
+ mov v15.16b, v7.16b
+
+ cbz w_vects, .Lloop128_vects_end /* vects == 3: no further sources to fold in */
+
+ sub x_src_ptr, x_src, #8 /* start at &src[vects-4] */
+.Lloop128_vects:
+ ldr x_srcn, [x_src_ptr], #-8 /* fetch buffer pointer, then step towards src[0] */
+ add x_srcn, x_srcn, x_col
+ cmp x_src_ptr, x_src_ptr_end /* flags are consumed by the bne at the loop bottom */
+
+ ldr q16, [x_srcn, #16*0]
+ ldr q17, [x_srcn, #16*1]
+ ldr q18, [x_srcn, #16*2]
+ ldr q19, [x_srcn, #16*3]
+ ldr q20, [x_srcn, #16*4]
+ ldr q21, [x_srcn, #16*5]
+ ldr q22, [x_srcn, #16*6]
+ ldr q23, [x_srcn, #16*7]
+
+ eor v0.16b, v0.16b, v16.16b /* p ^= in */
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v6.16b, v6.16b, v22.16b
+ eor v7.16b, v7.16b, v23.16b
+
+ cmhs v_mask0.16b, v8.16b, v_0x80.16b /* all-ones in lanes whose top bit is set */
+ cmhs v_mask1.16b, v9.16b, v_0x80.16b
+ cmhs v_mask2.16b, v10.16b, v_0x80.16b
+ cmhs v_mask3.16b, v11.16b, v_0x80.16b
+ and v_mask0.16b, v_mask0.16b, v_gf8poly.16b /* -> 0x1D in those lanes, 0 elsewhere */
+ and v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+ and v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+ and v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+ shl v8.16b, v8.16b, #1 /* q <<= 1 per byte */
+ shl v9.16b, v9.16b, #1
+ shl v10.16b, v10.16b, #1
+ shl v11.16b, v11.16b, #1
+ eor v8.16b, v8.16b, v_mask0.16b /* fold in 0x1D where the top bit was shifted out */
+ eor v9.16b, v9.16b, v_mask1.16b
+ eor v10.16b, v10.16b, v_mask2.16b
+ eor v11.16b, v11.16b, v_mask3.16b
+ eor v8.16b, v8.16b, v16.16b /* q ^= in */
+ eor v9.16b, v9.16b, v17.16b
+ eor v10.16b, v10.16b, v18.16b
+ eor v11.16b, v11.16b, v19.16b
+
+ cmhs v_mask0.16b, v12.16b, v_0x80.16b /* same q update for the upper 64 bytes */
+ cmhs v_mask1.16b, v13.16b, v_0x80.16b
+ cmhs v_mask2.16b, v14.16b, v_0x80.16b
+ cmhs v_mask3.16b, v15.16b, v_0x80.16b
+ and v_mask0.16b, v_mask0.16b, v_gf8poly.16b
+ and v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+ and v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+ and v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+ shl v12.16b, v12.16b, #1
+ shl v13.16b, v13.16b, #1
+ shl v14.16b, v14.16b, #1
+ shl v15.16b, v15.16b, #1
+ eor v12.16b, v12.16b, v_mask0.16b
+ eor v13.16b, v13.16b, v_mask1.16b
+ eor v14.16b, v14.16b, v_mask2.16b
+ eor v15.16b, v15.16b, v_mask3.16b
+ eor v12.16b, v12.16b, v20.16b
+ eor v13.16b, v13.16b, v21.16b
+ eor v14.16b, v14.16b, v22.16b
+ eor v15.16b, v15.16b, v23.16b
+
+ bne .Lloop128_vects /* uses the cmp above; the vector ops in between leave NZCV alone */
+
+.Lloop128_vects_end:
+ str q0, [x_dst_p, #16*0] /* write 128 bytes of P */
+ str q1, [x_dst_p, #16*1]
+ str q2, [x_dst_p, #16*2]
+ str q3, [x_dst_p, #16*3]
+ str q4, [x_dst_p, #16*4]
+ str q5, [x_dst_p, #16*5]
+ str q6, [x_dst_p, #16*6]
+ str q7, [x_dst_p, #16*7]
+
+ str q8, [x_dst_q, #16*0] /* write 128 bytes of Q */
+ str q9, [x_dst_q, #16*1]
+ str q10, [x_dst_q, #16*2]
+ str q11, [x_dst_q, #16*3]
+ str q12, [x_dst_q, #16*4]
+ str q13, [x_dst_q, #16*5]
+ str q14, [x_dst_q, #16*6]
+ str q15, [x_dst_q, #16*7]
+
+ add x_dst_p, x_dst_p, #128
+ add x_dst_q, x_dst_q, #128
+ cmp x_dst_q, x_dst_q_end
+ add w_col, w_col, #128 /* plain add: keeps the flags from cmp for bls */
+ bls .Lloop128
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ add x_dst_q_end, x_dst_q_end, #128 /* undo the bias applied before the loop */
+
+.Lloop16_init:
+ tst w_len, #0x7F /* len a multiple of 128: nothing left for this loop */
+ beq .Lloop16_end
+ sub x_dst_q_end, x_dst_q_end, #16 /* bias: loop while dst_q <= end - 16 */
+
+ /* batch process (vects-2)*16 bytes */
+ /* v0: p; v1: q; v2: in; v3: mask */
+.Lloop16:
+ ldr q0, [x_src_last], #16
+ mov v1.16b, v0.16b /* p and q both start as the src[vects-3] data */
+
+ cbz w_vects, .Lloop16_vects_end /* vects == 3: no further sources to fold in */
+
+ sub x_src_ptr, x_src, #8 /* start at &src[vects-4] */
+.Lloop16_vects:
+ ldr x_srcn, [x_src_ptr], #-8 /* fetch buffer pointer, then step towards src[0] */
+ ldr q2, [x_srcn, x_col]
+ cmp x_src_ptr, x_src_ptr_end /* flags are consumed by the bne below */
+
+ eor v0.16b, v0.16b, v2.16b /* p ^= in */
+
+ cmhs v3.16b, v1.16b, v_0x80.16b /* all-ones in lanes whose top bit is set */
+ and v3.16b, v3.16b, v_gf8poly.16b /* -> 0x1D in those lanes */
+
+ shl v1.16b, v1.16b, #1
+ eor v1.16b, v1.16b, v2.16b /* q = (q << 1) ^ in ^ mask */
+ eor v1.16b, v1.16b, v3.16b
+
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ str q0, [x_dst_p], #16 /* write 16 bytes of P */
+ str q1, [x_dst_q], #16 /* write 16 bytes of Q */
+ cmp x_dst_q, x_dst_q_end
+ add w_col, w_col, #16 /* plain add: keeps the flags from cmp for bls */
+ bls .Lloop16
+
+.Lloop16_end:
+ mov w_ret, #0 /* the only return path: always 0 */
+ ret
diff --git a/src/isa-l/raid/aarch64/raid_aarch64_dispatcher.c b/src/isa-l/raid/aarch64/raid_aarch64_dispatcher.c
new file mode 100644
index 000000000..c81bd8c98
--- /dev/null
+++ b/src/isa-l/raid/aarch64/raid_aarch64_dispatcher.c
@@ -0,0 +1,61 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(xor_gen)
+{
+	/* No Advanced SIMD bit in AT_HWCAP: hand back the basic xor_gen provider. */
+	if (!(getauxval(AT_HWCAP) & HWCAP_ASIMD))
+		return PROVIDER_BASIC(xor_gen);
+	return PROVIDER_INFO(xor_gen_neon);
+}
+
+DEFINE_INTERFACE_DISPATCHER(xor_check)
+{
+	/* No Advanced SIMD bit in AT_HWCAP: hand back the basic xor_check provider. */
+	if (!(getauxval(AT_HWCAP) & HWCAP_ASIMD))
+		return PROVIDER_BASIC(xor_check);
+	return PROVIDER_INFO(xor_check_neon);
+}
+
+DEFINE_INTERFACE_DISPATCHER(pq_gen)
+{
+	/* No Advanced SIMD bit in AT_HWCAP: hand back the basic pq_gen provider. */
+	if (!(getauxval(AT_HWCAP) & HWCAP_ASIMD))
+		return PROVIDER_BASIC(pq_gen);
+	return PROVIDER_INFO(pq_gen_neon);
+}
+
+DEFINE_INTERFACE_DISPATCHER(pq_check)
+{
+	/* No Advanced SIMD bit in AT_HWCAP: hand back the basic pq_check provider. */
+	if (!(getauxval(AT_HWCAP) & HWCAP_ASIMD))
+		return PROVIDER_BASIC(pq_check);
+	return PROVIDER_INFO(pq_check_neon);
+}
diff --git a/src/isa-l/raid/aarch64/raid_multibinary_arm.S b/src/isa-l/raid/aarch64/raid_multibinary_arm.S
new file mode 100644
index 000000000..0316239ec
--- /dev/null
+++ b/src/isa-l/raid/aarch64/raid_multibinary_arm.S
@@ -0,0 +1,36 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "aarch64_multibinary.h"
+
+/* One mbin_interface line per RAID entry point; names match the DEFINE_INTERFACE_DISPATCHER() uses in raid_aarch64_dispatcher.c. The macro is defined outside this file (presumably in aarch64_multibinary.h -- confirm). */
+mbin_interface xor_gen
+mbin_interface xor_check
+mbin_interface pq_gen
+mbin_interface pq_check
diff --git a/src/isa-l/raid/aarch64/xor_check_neon.S b/src/isa-l/raid/aarch64/xor_check_neon.S
new file mode 100644
index 000000000..95cb7d1d1
--- /dev/null
+++ b/src/isa-l/raid/aarch64/xor_check_neon.S
@@ -0,0 +1,271 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global xor_check_neon
+.type xor_check_neon, %function
+
+/*
+ * int xor_check_neon(int vects, int len, void **src)
+ *
+ * XORs the same byte position of all `vects` buffers in src[] together
+ * over `len` bytes. Returns 0 when every result byte is zero, 1 as soon
+ * as a non-zero result is seen.
+ */
+
+/* arguments */
+w_vects .req w0 /* must be >= 2 */
+x_vects .req x0
+w_len .req w1
+x_len .req x1
+x_src .req x2
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+w_in .req w1 /* shares w1 with w_len; only used in the byte loop, which ends on a pointer compare */
+x_src0 .req x3
+x_src0_end .req x4
+w_len256 .req w5 /* shares w5 with w_len16 */
+x_len256 .req x5
+w_len16 .req w5
+x_len16 .req x5
+w_col .req w6 /* byte offset into src[1..], advanced in step with x_src0 */
+x_col .req x6
+x_src_ptr .req x7 /* walks the src[] pointer array */
+x_srcn .req x9 /* current src[n] buffer */
+x_src_ptr_end .req x10 /* one past the last entry of src[] */
+w_xor .req w11
+/* v0 ~ v15: running XOR of the current 256 bytes */
+/* v16 ~ v31: 256 bytes loaded from the current src[n] */
+
+/*
+ * +----------+ +------------------+
+ * src --> | src[0] | - src0 -> | buffer | src0_end
+ * --------+----------+ +------------------+
+ * . | ...... |
+ * . +----------+ +------------------+
+ * src_ptr ~~> | src[n] | - srcn ~> | buffer |
+ * . +----------+ +------------------+
+ * . | ...... |
+ * . +----------+
+ * . | src[v-1] |
+ * --------+----------+
+ * src_ptr_end -->
+ */
+
+/*
+ * Entry: set up the src[] walk limits and the src[0] cursor.
+ *
+ * vects and len are C ints, so only w0/w1 carry defined values on entry;
+ * the upper 32 bits of x0/x1 are unspecified by the procedure call
+ * standard. Sign-extend the 32-bit values explicitly when forming
+ * addresses instead of consuming the raw 64-bit registers.
+ */
+xor_check_neon:
+ add x_src_ptr_end, x_src, w_vects, sxtw #3 /* &src[vects] */
+ ldr x_src0, [x_src] /* src[0] */
+ add x_src0_end, x_src0, w_len, sxtw /* src[0] + len */
+
+ /* number of buffers besides src[0]; zero makes the inner loops be skipped */
+ sub w_vects, w_vects, #1
+ mov w_col, #0
+ mov w_xor, #0
+
+/*
+ * 256-byte phase: XOR 256 bytes of every buffer per iteration and test
+ * the result for zero. Skipped when len holds no full 256-byte block.
+ */
+.Lloop256_init:
+ /* len256 = len - len%256; len %= 256 */
+ mov w_len256, w_len
+ and w_len, w_len, #0xFF
+ sub w_len256, w_len256, w_len
+
+ /* less than 256 bytes? */
+ cbz w_len256, .Lloop16_init
+
+ /* the loop below uses q8 ~ q15: save d8 ~ d15 to stack first */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ /* bias the end so "src0 <= src0_end" means another full 256 bytes remain */
+ sub x_src0_end, x_src0_end, #256
+
+ /* batch process vects*256 bytes */
+.Lloop256:
+ /* v0 ~ v15 = next 256 bytes of src[0] */
+ ldr q0, [x_src0, #16*0]
+ ldr q1, [x_src0, #16*1]
+ ldr q2, [x_src0, #16*2]
+ ldr q3, [x_src0, #16*3]
+ ldr q4, [x_src0, #16*4]
+ ldr q5, [x_src0, #16*5]
+ ldr q6, [x_src0, #16*6]
+ ldr q7, [x_src0, #16*7]
+ ldr q8, [x_src0, #16*8]
+ ldr q9, [x_src0, #16*9]
+ ldr q10, [x_src0, #16*10]
+ ldr q11, [x_src0, #16*11]
+ ldr q12, [x_src0, #16*12]
+ ldr q13, [x_src0, #16*13]
+ ldr q14, [x_src0, #16*14]
+ ldr q15, [x_src0, #16*15]
+ add x_src0, x_src0, #256
+
+ /* no other buffers: go straight to the zero test */
+ cbz w_vects, .Lloop256_vects_end
+
+ /* start the walk at src[1] */
+ add x_src_ptr, x_src, #8
+.Lloop256_vects:
+ ldr x_srcn, [x_src_ptr], #8
+ add x_srcn, x_srcn, x_col
+ /* flags set here are consumed by the bne at the bottom; nothing in between sets flags */
+ cmp x_src_ptr, x_src_ptr_end
+
+ ldr q16, [x_srcn, #16*0]
+ ldr q17, [x_srcn, #16*1]
+ ldr q18, [x_srcn, #16*2]
+ ldr q19, [x_srcn, #16*3]
+ ldr q20, [x_srcn, #16*4]
+ ldr q21, [x_srcn, #16*5]
+ ldr q22, [x_srcn, #16*6]
+ ldr q23, [x_srcn, #16*7]
+ ldr q24, [x_srcn, #16*8]
+ ldr q25, [x_srcn, #16*9]
+ ldr q26, [x_srcn, #16*10]
+ ldr q27, [x_srcn, #16*11]
+ ldr q28, [x_srcn, #16*12]
+ ldr q29, [x_srcn, #16*13]
+ ldr q30, [x_srcn, #16*14]
+ ldr q31, [x_srcn, #16*15]
+
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v6.16b, v6.16b, v22.16b
+ eor v7.16b, v7.16b, v23.16b
+ eor v8.16b, v8.16b, v24.16b
+ eor v9.16b, v9.16b, v25.16b
+ eor v10.16b, v10.16b, v26.16b
+ eor v11.16b, v11.16b, v27.16b
+ eor v12.16b, v12.16b, v28.16b
+ eor v13.16b, v13.16b, v29.16b
+ eor v14.16b, v14.16b, v30.16b
+ eor v15.16b, v15.16b, v31.16b
+
+ bne .Lloop256_vects
+
+.Lloop256_vects_end:
+ /* OR-reduce v0 ~ v15 into v0, then take the max byte: non-zero means some XOR byte was non-zero */
+ orr v0.16b, v0.16b, v1.16b
+ orr v2.16b, v2.16b, v3.16b
+ orr v4.16b, v4.16b, v5.16b
+ orr v6.16b, v6.16b, v7.16b
+ orr v8.16b, v8.16b, v9.16b
+ orr v10.16b, v10.16b, v11.16b
+ orr v12.16b, v12.16b, v13.16b
+ orr v14.16b, v14.16b, v15.16b
+ orr v0.16b, v0.16b, v2.16b
+ orr v4.16b, v4.16b, v6.16b
+ orr v8.16b, v8.16b, v10.16b
+ orr v12.16b, v12.16b, v14.16b
+ orr v0.16b, v0.16b, v4.16b
+ orr v8.16b, v8.16b, v12.16b
+ orr v0.16b, v0.16b, v8.16b
+ umaxv b0, v0.16b
+ umov w_xor, v0.b[0]
+ /* on mismatch leave through the restore path below, not straight to .Lerror */
+ cbnz w_xor, .Lloop256_end
+
+ cmp x_src0, x_src0_end
+ add w_col, w_col, #256
+ bls .Lloop256
+
+.Lloop256_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ cbnz w_xor, .Lerror
+
+ /* undo the bias applied before the loop */
+ add x_src0_end, x_src0_end, #256
+
+/*
+ * 16-byte phase: same check, 16 bytes per iteration, over what is left
+ * in w_len. Skipped when fewer than 16 bytes remain.
+ */
+.Lloop16_init:
+ /* len16 = len - len%16; len %= 16 */
+ mov w_len16, w_len
+ and w_len, w_len, #0xF
+ sub w_len16, w_len16, w_len
+
+ /* less than 16 bytes? */
+ cbz w_len16, .Lloop1_init
+
+ /* bias the end so "src0 <= src0_end" means another full 16 bytes remain */
+ sub x_src0_end, x_src0_end, #16
+
+ /* batch process vects*16 bytes */
+.Lloop16:
+ ldr q0, [x_src0], #16
+ cbz w_vects, .Lloop16_vects_end
+
+ /* start the walk at src[1] */
+ add x_src_ptr, x_src, #8
+.Lloop16_vects:
+ ldr x_srcn, [x_src_ptr], #8
+ /* flags consumed by the bne below; the ldr/eor in between do not set flags */
+ cmp x_src_ptr, x_src_ptr_end
+ ldr q1, [x_srcn, x_col]
+ eor v0.16b, v0.16b, v1.16b
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ /* max byte of the XOR result: non-zero means mismatch */
+ umaxv b0, v0.16b
+ umov w_xor, v0.b[0]
+ cbnz w_xor, .Lerror
+ cmp x_src0, x_src0_end
+ add w_col, w_col, #16
+ bls .Lloop16
+
+.Lloop16_end:
+ /* undo the bias applied before the loop */
+ add x_src0_end, x_src0_end, #16
+
+/*
+ * Byte tail and exits: check the remaining len (< 16) bytes one at a
+ * time, then return 0 (all XOR results zero) or 1 (.Lerror).
+ */
+.Lloop1_init:
+ cbnz w_len, .Lloop1
+ mov w_ret, #0
+ ret
+
+ /* batch process vects*1 bytes */
+.Lloop1:
+ ldrb w_xor, [x_src0], #1
+ cbz w_vects, .Lloop1_vects_end
+
+ /* start the walk at src[1] */
+ add x_src_ptr, x_src, #8
+.Lloop1_vects:
+ ldr x_srcn, [x_src_ptr], #8
+ /* flags consumed by the bne below */
+ cmp x_src_ptr, x_src_ptr_end
+ /* w_in aliases w_len, so w_len is clobbered from here on; the loop ends on the pointer compare instead */
+ ldrb w_in, [x_srcn, x_col]
+ eor w_xor, w_xor, w_in
+ bne .Lloop1_vects
+
+.Lloop1_vects_end:
+ cbnz w_xor, .Lerror
+ cmp x_src0, x_src0_end
+ add w_col, w_col, #1
+ bne .Lloop1
+
+.Lloop1_end:
+ mov w_ret, #0
+ ret
+
+ /* shared failure exit for all three phases */
+.Lerror:
+ mov w_ret, #1
+ ret
diff --git a/src/isa-l/raid/aarch64/xor_gen_neon.S b/src/isa-l/raid/aarch64/xor_gen_neon.S
new file mode 100644
index 000000000..00f65a2ef
--- /dev/null
+++ b/src/isa-l/raid/aarch64/xor_gen_neon.S
@@ -0,0 +1,264 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global xor_gen_neon
+.type xor_gen_neon, %function
+
+/*
+ * int xor_gen_neon(int vects, int len, void **src)
+ *
+ * Writes into the buffer src[vects-1] the byte-wise XOR of the buffers
+ * src[0] .. src[vects-2], over `len` bytes. Always returns 0.
+ */
+
+/* arguments */
+w_vects .req w0 /* must be >= 2 */
+x_vects .req x0
+w_len .req w1
+x_len .req x1
+x_src .req x2
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+w_in .req w1 /* shares w1 with w_len; only used in the byte loop, which ends on a pointer compare */
+x_src0 .req x3
+x_src0_end .req x4
+w_len256 .req w5 /* shares w5 with w_len16 and w_xor */
+x_len256 .req x5
+w_len16 .req w5
+x_len16 .req x5
+w_xor .req w5
+w_col .req w6 /* byte offset into src[1..], advanced in step with x_src0 */
+x_col .req x6
+x_src_ptr .req x7 /* walks the src[] pointer array */
+x_srcn .req x9 /* current src[n] buffer */
+x_dst .req x10 /* write cursor in the destination buffer */
+x_dst_ptr .req x11 /* &src[vects-1]; also the end of the source walk */
+/* v0 ~ v15: running XOR of the current 256 bytes */
+/* v16 ~ v31: 256 bytes loaded from the current src[n] */
+
+/*
+ * +----------+ +------------------+
+ * src --> | src[0] | - src0 -> | buffer | src0_end
+ * --------+----------+ +------------------+
+ * . | ...... |
+ * . +----------+ +------------------+
+ * src_ptr ~~> | src[n] | - srcn ~> | buffer |
+ * . +----------+ +------------------+
+ * . | ...... |
+ * . +----------+
+ * . | src[v-2] |
+ * --------+----------+ +------------------+
+ * dst_ptr --> | src[v-1] | -- dst --> | buffer |
+ * +----------+ +------------------+
+ */
+
+/*
+ * Entry: locate the destination (last entry of src[]) and set up the
+ * src[0] cursor.
+ *
+ * vects and len are C ints, so only w0/w1 carry defined values on entry;
+ * the upper 32 bits of x0/x1 are unspecified by the procedure call
+ * standard. Sign-extend the 32-bit values explicitly when forming
+ * addresses instead of consuming the raw 64-bit registers.
+ */
+xor_gen_neon:
+ add x_dst_ptr, x_src, w_vects, sxtw #3 /* &src[vects] */
+ ldr x_dst, [x_dst_ptr, #-8]! /* dst_ptr = &src[vects-1]; dst = src[vects-1] */
+ ldr x_src0, [x_src] /* src[0] */
+ add x_src0_end, x_src0, w_len, sxtw /* src[0] + len */
+
+ /* sources besides src[0]; zero makes the inner loops be skipped (plain copy) */
+ sub w_vects, w_vects, #2
+ mov w_col, #0
+
+/*
+ * 256-byte phase: XOR 256 bytes of every source per iteration and store
+ * the result to dst. Skipped when len holds no full 256-byte block.
+ */
+.Loop256_init:
+ /* len256 = len - len%256; len %= 256 */
+ mov w_len256, w_len
+ and w_len, w_len, #0xFF
+ sub w_len256, w_len256, w_len
+
+ /* less than 256 bytes? */
+ cbz w_len256, .Lloop16_init
+
+ /* the loop below uses q8 ~ q15: save d8 ~ d15 to stack first */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ /* bias the end so "src0 <= src0_end" means another full 256 bytes remain */
+ sub x_src0_end, x_src0_end, #256
+
+ /* batch process (vects-1)*256 bytes */
+.Lloop256:
+ /* v0 ~ v15 = next 256 bytes of src[0] */
+ ldr q0, [x_src0, #16*0]
+ ldr q1, [x_src0, #16*1]
+ ldr q2, [x_src0, #16*2]
+ ldr q3, [x_src0, #16*3]
+ ldr q4, [x_src0, #16*4]
+ ldr q5, [x_src0, #16*5]
+ ldr q6, [x_src0, #16*6]
+ ldr q7, [x_src0, #16*7]
+ ldr q8, [x_src0, #16*8]
+ ldr q9, [x_src0, #16*9]
+ ldr q10, [x_src0, #16*10]
+ ldr q11, [x_src0, #16*11]
+ ldr q12, [x_src0, #16*12]
+ ldr q13, [x_src0, #16*13]
+ ldr q14, [x_src0, #16*14]
+ ldr q15, [x_src0, #16*15]
+ add x_src0, x_src0, #256
+
+ /* single source: nothing to XOR in, just store */
+ cbz w_vects, .Lloop256_vects_end
+
+ /* start the walk at src[1] */
+ add x_src_ptr, x_src, #8
+.Lloop256_vects:
+ ldr x_srcn, [x_src_ptr], #8
+ add x_srcn, x_srcn, x_col
+ /* stop once the walk reaches the dst entry; flags are consumed by the bne at the bottom */
+ cmp x_src_ptr, x_dst_ptr
+
+ ldr q16, [x_srcn, #16*0]
+ ldr q17, [x_srcn, #16*1]
+ ldr q18, [x_srcn, #16*2]
+ ldr q19, [x_srcn, #16*3]
+ ldr q20, [x_srcn, #16*4]
+ ldr q21, [x_srcn, #16*5]
+ ldr q22, [x_srcn, #16*6]
+ ldr q23, [x_srcn, #16*7]
+ ldr q24, [x_srcn, #16*8]
+ ldr q25, [x_srcn, #16*9]
+ ldr q26, [x_srcn, #16*10]
+ ldr q27, [x_srcn, #16*11]
+ ldr q28, [x_srcn, #16*12]
+ ldr q29, [x_srcn, #16*13]
+ ldr q30, [x_srcn, #16*14]
+ ldr q31, [x_srcn, #16*15]
+
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v6.16b, v6.16b, v22.16b
+ eor v7.16b, v7.16b, v23.16b
+ eor v8.16b, v8.16b, v24.16b
+ eor v9.16b, v9.16b, v25.16b
+ eor v10.16b, v10.16b, v26.16b
+ eor v11.16b, v11.16b, v27.16b
+ eor v12.16b, v12.16b, v28.16b
+ eor v13.16b, v13.16b, v29.16b
+ eor v14.16b, v14.16b, v30.16b
+ eor v15.16b, v15.16b, v31.16b
+
+ bne .Lloop256_vects
+
+.Lloop256_vects_end:
+ /* store the 256 accumulated bytes to dst */
+ str q0, [x_dst, #16*0]
+ str q1, [x_dst, #16*1]
+ str q2, [x_dst, #16*2]
+ str q3, [x_dst, #16*3]
+ str q4, [x_dst, #16*4]
+ str q5, [x_dst, #16*5]
+ str q6, [x_dst, #16*6]
+ str q7, [x_dst, #16*7]
+ str q8, [x_dst, #16*8]
+ str q9, [x_dst, #16*9]
+ str q10, [x_dst, #16*10]
+ str q11, [x_dst, #16*11]
+ str q12, [x_dst, #16*12]
+ str q13, [x_dst, #16*13]
+ str q14, [x_dst, #16*14]
+ str q15, [x_dst, #16*15]
+
+ /* the adds below do not set flags, so bls still sees this compare */
+ cmp x_src0, x_src0_end
+ add x_dst, x_dst, #256
+ add w_col, w_col, #256
+ bls .Lloop256
+
+.Lloop256_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ /* undo the bias applied before the loop */
+ add x_src0_end, x_src0_end, #256
+
+/*
+ * 16-byte phase: same XOR-and-store, 16 bytes per iteration, over what
+ * is left in w_len. Skipped when fewer than 16 bytes remain.
+ */
+.Lloop16_init:
+ /* len16 = len - len%16; len %= 16 */
+ mov w_len16, w_len
+ and w_len, w_len, #0xF
+ sub w_len16, w_len16, w_len
+
+ /* less than 16 bytes? */
+ cbz w_len16, .Lloop1_init
+
+ /* bias the end so "src0 <= src0_end" means another full 16 bytes remain */
+ sub x_src0_end, x_src0_end, #16
+
+ /* batch process (vects-1)*16 bytes */
+.Lloop16:
+ ldr q0, [x_src0], #16
+ cbz w_vects, .Lloop16_vects_end
+
+ /* start the walk at src[1] */
+ add x_src_ptr, x_src, #8
+.Lloop16_vects:
+ ldr x_srcn, [x_src_ptr], #8
+ /* stop once the walk reaches the dst entry; flags are consumed by the bne below */
+ cmp x_src_ptr, x_dst_ptr
+ ldr q1, [x_srcn, x_col]
+ eor v0.16b, v0.16b, v1.16b
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ /* the str/add below do not set flags, so bls still sees this compare */
+ cmp x_src0, x_src0_end
+ str q0, [x_dst], #16
+ add w_col, w_col, #16
+ bls .Lloop16
+
+.Loop16_end:
+ /* undo the bias applied before the loop */
+ add x_src0_end, x_src0_end, #16
+
+/*
+ * Byte tail and exit: generate the remaining len (< 16) bytes one at a
+ * time, then return 0.
+ */
+.Lloop1_init:
+ cbnz w_len, .Lloop1
+ mov w_ret, #0
+ ret
+
+ /* batch process (vects-1)*1 bytes */
+.Lloop1:
+ /* w_xor aliases w_len256/w_len16, which are no longer needed here */
+ ldrb w_xor, [x_src0], #1
+ cbz w_vects, .Lloop1_vects_end
+
+ /* start the walk at src[1] */
+ add x_src_ptr, x_src, #8
+.Lloop1_vects:
+ ldr x_srcn, [x_src_ptr], #8
+ /* stop once the walk reaches the dst entry; flags are consumed by the bne below */
+ cmp x_src_ptr, x_dst_ptr
+ /* w_in aliases w_len, so w_len is clobbered from here on; the loop ends on the pointer compare instead */
+ ldrb w_in, [x_srcn, x_col]
+ eor w_xor, w_xor, w_in
+ bne .Lloop1_vects
+
+.Lloop1_vects_end:
+ cmp x_src0, x_src0_end
+ strb w_xor, [x_dst], #1
+ add w_col, w_col, #1
+ bne .Lloop1
+
+.Loop1_end:
+ mov w_ret, #0
+ ret
diff --git a/src/isa-l/raid/pq_check_sse.asm b/src/isa-l/raid/pq_check_sse.asm
new file mode 100644
index 000000000..f2bc8a6cd
--- /dev/null
+++ b/src/isa-l/raid/pq_check_sse.asm
@@ -0,0 +1,277 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq check of N source vectors using SSE3
+;;; int pq_check_sse(int vects, int len, void **array)
+
+;;; Checks the P+Q parity vectors against N (vects-2) sources in array of
+;;; pointers (**array). Last two pointers are the P and Q parity vectors
+;;; respectively. Returns 0 if both parities match, 1 otherwise.
+;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+;;; elf64 output: map the abstract arg/tmp/return names onto registers.
+;;; FUNC_SAVE and FUNC_RESTORE expand to nothing for this format.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x: endbranch
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; win64 output: map the abstract arg/tmp/return names onto registers and
+;;; define the prologue/epilogue. The routine uses xmm6-xmm11 and xmm15;
+;;; FUNC_SAVE spills them to stack slots 0..6 and FUNC_RESTORE must reload
+;;; each one from the SAME slot it was saved to.
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 7*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm15, 6*16
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm15, [rsp + 6*16] ; slot 6, matching save_xmm128 above (was 9*16: outside the frame)
+ add rsp, stack_size
+ %endmacro
+%endif
+
+;;; Scalar roles. pos shares the return register: it is the running byte
+;;; offset until return_pass/return_fail overwrite it with the result.
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos return
+
+;;; Lane 1 (bytes pos .. pos+15): P accumulator, Q accumulator, scratch, source
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+;;; Lane 2 (bytes pos+16 .. pos+31)
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+;;; Lane 3 (bytes pos+32 .. pos+47)
+%define xp3 xmm8
+%define xq3 xmm9
+%define xtmp3 xmm10
+%define xs3 xmm11
+
+;;; Holds the 16-byte constant loaded from [poly]
+%define xpoly xmm15
+
+;;; Use non-temporal load/store unless NO_NT_LDST is defined.
+;;; XLDR: 16-byte aligned load, XSTR: 16-byte aligned store.
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa ; non-temporal load (was movdqa, same as the NO_NT_LDST branch)
+ %define XSTR movntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+;;; pq_check_sse entry: validate arguments, load the polynomial constant
+;;; and pick the 48-byte loop or, for len < 48, the 16-byte loop.
+align 16
+mk_global pq_check_sse, function
+func(pq_check_sse)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass ;Nothing to check
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+ movdqa xpoly, [poly]
+ cmp len, 48
+ jl loop16
+
+;;; 48-byte loop: recompute P and Q for three 16-byte lanes at once and
+;;; compare them with the stored parity. Falls through to loop16 when
+;;; 16 or 32 bytes are left over.
+;;; NOTE: despite its name, the label below guards the 48-byte path.
+len_aligned_32bytes:
+ sub len, 48 ;Bias len so "pos <= len" means a full 48B block remains
+
+loop48:
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Dead load: tmp is overwritten with vec below before any use
+ XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src
+ XLDR xp2, [ptr+pos+16] ;Initialize xp2 with P2 src + 16B ahead
+ XLDR xp3, [ptr+pos+32] ;Initialize xp3 with P3 src + 32B ahead
+ pxor xq1, xq1 ;q1 = 0
+ pxor xq2, xq2 ;q2 = 0
+ pxor xq3, xq3 ;q3 = 0
+
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+ XLDR xs3, [ptr+pos+32] ;Preload last vector (source)
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector; sets the flags used by jg below
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ pxor xp1, xs1 ; p1 ^= s1
+ pxor xp2, xs2 ; p2 ^= s2
+ pxor xp3, xs3 ; p3 ^= s3
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xq2, xs2 ; q2 ^= s2
+ pxor xq3, xs3 ; q3 ^= s3
+ pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0
+ pxor xtmp2, xtmp2 ; xtmp2 = 0
+ pxor xtmp3, xtmp3 ; xtmp3 = 0
+ pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp3, xq3 ; xtmp3 = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp1 = poly or 0x00
+ pand xtmp2, xpoly ; xtmp2 = poly or 0x00
+ pand xtmp3, xpoly ; xtmp3 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ XLDR xs3, [ptr+pos+32] ; Get next vector (source data3)
+ paddb xq1, xq1 ; q1 = q1<<1
+ paddb xq2, xq2 ; q2 = q2<<1
+ paddb xq3, xq3 ; q3 = q3<<1
+ pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ pxor xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0 (flags still from "sub tmp, 1")
+
+ pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ pxor xq1, xs1 ;q1 ^= 1 * s1[0]
+ pxor xp2, xs2 ;p2 ^= s2[0]
+ pxor xq2, xs2 ;q2 ^= 1 * s2[0]
+ pxor xp3, xs3 ;p3 ^= s3[0]
+ pxor xq3, xs3 ;q3 ^= 1 * s3[0]
+
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ XLDR xtmp1, [tmp+pos] ;Load stored Q1
+ XLDR xtmp2, [tmp+pos+16] ;Load stored Q2, 16B ahead
+ XLDR xtmp3, [tmp+pos+32] ;Load stored Q3, 32B ahead
+
+ pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved
+ pxor xq2, xtmp2
+ pxor xq3, xtmp3
+
+ por xp1, xq1 ;Confirm that all P&Q parity are 0
+ por xp1, xp2
+ por xp1, xq2
+ por xp1, xp3
+ por xp1, xq3
+ ptest xp1, xp1
+ jnz return_fail
+ add pos, 48
+ cmp pos, len
+ jle loop48
+
+
+ ;; ------------------------------
+ ;; Do last 16 or 32 Bytes remaining
+ add len, 48 ;Undo the bias applied before loop48
+ cmp pos, len
+ je return_pass
+
+;;; 16-byte loop: same P/Q recomputation and compare, one 16-byte lane
+;;; per iteration, until pos reaches len. Falls through to return_pass.
+loop16:
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Dead load: tmp is overwritten with vec below before any use
+ XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src
+ pxor xq1, xq1 ;q = 0
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector; sets the flags used by jg below
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ pxor xq1, xs1 ; q ^= s
+ pxor xtmp1, xtmp1 ; xtmp = 0
+ pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp = poly or 0x00
+ pxor xp1, xs1 ; p ^= s
+ paddb xq1, xq1 ; q = q<<1
+ pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0 (flags still from "sub tmp, 1")
+
+ pxor xp1, xs1 ;p ^= s[0] - last source is already loaded
+ pxor xq1, xs1 ;q ^= 1 * s[0]
+
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ XLDR xtmp1, [tmp+pos] ;Load stored Q
+ pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved
+
+ por xp1, xq1 ;Confirm that all P&Q parity are = 0
+ ptest xp1, xp1
+ jnz return_fail
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+;;; Exit paths: return 0 when every checked block matched, 1 on a bad
+;;; argument or a parity mismatch. Both run FUNC_RESTORE before ret.
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+;;; 16 bytes of 0x1d: the per-byte constant XORed into Q after a left
+;;; shift whenever the shifted-out top bit was set. 16-byte aligned
+;;; because it is read with movdqa.
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_check_sse, 00, 06, 0033
diff --git a/src/isa-l/raid/pq_check_sse_i32.asm b/src/isa-l/raid/pq_check_sse_i32.asm
new file mode 100644
index 000000000..3271c035a
--- /dev/null
+++ b/src/isa-l/raid/pq_check_sse_i32.asm
@@ -0,0 +1,282 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq check of N source vectors using SSE3
+;;; int pq_check_sse(int vects, int len, void **array)
+
+;;; Checks the P+Q parity vectors against N (vects-2) sources in array of
+;;; pointers (**array). Last two pointers are the P and Q parity vectors
+;;; respectively. Returns 0 if both parities match, 1 otherwise.
+;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+;;; Per-output-format setup: map the abstract arg/tmp/return names onto
+;;; registers, set the pointer size PS, and define FUNC_SAVE/FUNC_RESTORE.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define return rax
+ %define PS 8
+ %define func(x) x: endbranch
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define return rax
+ %define PS 8
+ %define tmp r11
+ %define stack_size 2*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ ;; xmm6 and xmm7 are used by this routine (xtmp2, xs2): spill and reload them
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0 edx
+ %define arg1 ecx
+ %define return eax
+ %define PS 4
+ %define func(x) x: endbranch
+ %define arg(x) [ebp+8+PS*x] ; x-th stack argument once ebp is set up in FUNC_SAVE
+ %define arg2 edi ; must save/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ ;; Build a frame, save esi/edi/ebx, then load the three stack arguments
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg0, arg(0)
+ mov arg1, arg(1)
+ mov arg2, arg(2)
+ %endmacro
+
+ ;; Pop in the reverse order of the pushes in FUNC_SAVE
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp ;if has frame pointer?
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
+;;; Scalar roles. pos shares the return register: it is the running byte
+;;; offset until return_pass/return_fail overwrite it with the result.
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos return
+
+;;; Lane 1 (bytes pos .. pos+15): P accumulator, Q accumulator, scratch, source
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+;;; Lane 2 (bytes pos+16 .. pos+31)
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+;;; 64-bit code keeps the polynomial constant in xmm15; 32-bit code uses
+;;; it directly as the memory operand [poly].
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+ %define xpoly xmm15
+%elifidn PS,4 ; 32-bit code
+ %define xpoly [poly]
+%endif
+
+;;; Use non-temporal load/store unless NO_NT_LDST is defined
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+;;; pq_check_sse entry: validate arguments, load the polynomial constant
+;;; (64-bit only) and pick the 32-byte loop or, for len < 32, the
+;;; 16-byte loop.
+align 16
+mk_global pq_check_sse, function
+func(pq_check_sse)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass ;Nothing to check
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+%ifidn PS,8
+ movdqa xpoly, [poly] ;For 64-bit, load poly into high xmm reg
+%endif
+ cmp len, 32
+ jl loop16
+
+;;; 32-byte loop: recompute P and Q for two 16-byte lanes at once and
+;;; compare them with the stored parity. Falls through to loop16 when
+;;; 16 bytes are left over.
+len_aligned_32bytes:
+ sub len, 32 ;Bias len so "pos <= len" means a full 32B block remains
+
+loop32:
+ mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Dead load: tmp is overwritten with vec below before any use
+ XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src
+ XLDR xp2, [ptr+pos+16] ;Initialize xp2 with P2 src + 16B ahead
+ pxor xq1, xq1 ;q1 = 0
+ pxor xq2, xq2 ;q2 = 0
+
+ mov ptr, [arg2+vec*PS] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector; sets the flags used by jg below
+ mov ptr, [arg2+tmp*PS] ; get pointer to next vect
+ pxor xp1, xs1 ; p1 ^= s1
+ pxor xp2, xs2 ; p2 ^= s2
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xq2, xs2 ; q2 ^= s2
+ pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0
+ pxor xtmp2, xtmp2 ; xtmp2 = 0
+ pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp1 = poly or 0x00
+ pand xtmp2, xpoly ; xtmp2 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ paddb xq1, xq1 ; q1 = q1<<1
+ paddb xq2, xq2 ; q2 = q2<<1
+ pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0 (flags still from "sub tmp, 1")
+
+ pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ pxor xq1, xs1 ;q1 ^= 1 * s1[0]
+ pxor xp2, xs2 ;p2 ^= s2[0]
+ pxor xq2, xs2 ;q2 ^= 1 * s2[0]
+
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ XLDR xtmp1, [tmp+pos] ;Load stored Q1
+ XLDR xtmp2, [tmp+pos+16] ;Load stored Q2, 16B ahead
+
+ pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved
+ pxor xq2, xtmp2
+
+ por xp1, xq1 ;Confirm that all P&Q parity are 0
+ por xp1, xp2
+ por xp1, xq2
+ ptest xp1, xp1
+ jnz return_fail
+ add pos, 32
+ cmp pos, len
+ jle loop32
+
+
+ ;; ------------------------------
+ ;; Do last 16 Bytes remaining
+ add len, 32 ;Undo the bias applied before loop32
+ cmp pos, len
+ je return_pass
+
+;;; 16-byte loop: same P/Q recomputation and compare, one 16-byte lane
+;;; per iteration, until pos reaches len. Falls through to return_pass.
+loop16:
+ mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Dead load: tmp is overwritten with vec below before any use
+ XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src
+ pxor xq1, xq1 ;q = 0
+ mov ptr, [arg2+vec*PS] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector; sets the flags used by jg below
+ mov ptr, [arg2+tmp*PS] ; get pointer to next vect
+ pxor xq1, xs1 ; q ^= s
+ pxor xtmp1, xtmp1 ; xtmp = 0
+ pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp = poly or 0x00
+ pxor xp1, xs1 ; p ^= s
+ paddb xq1, xq1 ; q = q<<1
+ pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0 (flags still from "sub tmp, 1")
+
+ pxor xp1, xs1 ;p ^= s[0] - last source is already loaded
+ pxor xq1, xs1 ;q ^= 1 * s[0]
+
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ XLDR xtmp1, [tmp+pos] ;Load stored Q
+ pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved
+
+ por xp1, xq1 ;Confirm that all P&Q parity are = 0
+ ptest xp1, xp1
+ jnz return_fail
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+;;; Exit paths: return 0 when every checked block matched, 1 on a bad
+;;; argument or a parity mismatch. Both run FUNC_RESTORE before ret.
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+;;; 16 bytes of 0x1d: the per-byte constant XORed into Q after a left
+;;; shift whenever the shifted-out top bit was set. 16-byte aligned
+;;; because it is read with movdqa or used as a pand memory operand.
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_check_sse, 00, 06, 0033
diff --git a/src/isa-l/raid/pq_check_test.c b/src/isa-l/raid/pq_check_test.c
new file mode 100644
index 000000000..27d0203d2
--- /dev/null
+++ b/src/isa-l/raid/pq_check_test.c
@@ -0,0 +1,304 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN 1024
+#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+int ref_multi_pq(int vects, int len, void **array) // byte-wise reference P+Q generator; always returns 0
+{
+ int i, j;
+ unsigned char p, q, s;
+ unsigned char **src = (unsigned char **)array; // src[0..vects-3] are read, src[vects-2] and src[vects-1] are written
+
+ for (i = 0; i < len; i++) {
+ q = p = src[vects - 3][i]; // both accumulators start from the highest-index source
+
+ for (j = vects - 4; j >= 0; j--) {
+ p ^= s = src[j][i];
+ q = s ^ ((q << 1) ^ ((q & 0x80) ? 0x1d : 0)); // q = 2*q ^ s in GF(2^8): shift left, XOR 0x1d if bit 7 was set
+ }
+
+ src[vects - 2][i] = p; // second to last pointer is p
+ src[vects - 1][i] = q; // last pointer is q
+ }
+ return 0;
+}
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, long buffer_size) // fills buf[0..buffer_size-1] with values from rand()
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand(); // the int result is narrowed to unsigned char on assignment
+}
+
+int main(int argc, char *argv[]) // returns 0 when every check passed, non-zero otherwise
+{
+ int i, j, k, ret, fail = 0;
+ void *buffs[TEST_SOURCES + 2]; // source buffers followed by the P and Q parity buffers
+ char c; // original byte saved while an error is injected
+ char *tmp_buf[TEST_SOURCES + 2]; // offset views into buffs[] used by the end-of-buffer test
+ int serr, lerr; // buffer index and byte offset of an injected error
+
+ printf("Test pq_check_test %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ // Allocate the arrays: TEST_LEN bytes each, 16-byte aligned
+ for (i = 0; i < TEST_SOURCES + 2; i++) {
+ void *buf;
+ if (posix_memalign(&buf, 16, TEST_LEN)) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Test of all zeros
+ for (i = 0; i < TEST_SOURCES + 2; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ ref_multi_pq(TEST_SOURCES + 2, TEST_LEN, buffs);
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret != 0) {
+ fail++;
+ printf("\nfail zero test %d\n", ret);
+ }
+
+ ((char *)(buffs[0]))[TEST_LEN - 2] = 0x7; // corrupt buffer
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nfail corrupt buffer test %d\n", ret);
+ }
+ ((char *)(buffs[0]))[TEST_LEN - 2] = 0; // un-corrupt buffer
+
+ // Test a corrupted byte at every location of every buffer (sources, P and Q)
+ for (j = 0; j < TEST_SOURCES + 2; j++) {
+ for (i = TEST_LEN - 1; i >= 0; i--) {
+ ((char *)buffs[j])[i] = 0x5; // corrupt buffer
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nfail corrupt zero buffer test j=%d, i=%d\n", j, i);
+ return 1;
+ }
+ ((char *)buffs[j])[i] = 0; // un-corrupt buffer
+ }
+ putchar('.');
+ }
+
+ // Test rand1: random data with parity produced by ref_multi_pq
+ for (i = 0; i < TEST_SOURCES + 2; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ ref_multi_pq(TEST_SOURCES + 2, TEST_LEN, buffs);
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret != 0) {
+ fail++;
+ printf("fail first rand test %d\n", ret);
+ }
+
+ c = ((char *)(buffs[0]))[TEST_LEN - 2];
+ ((char *)(buffs[0]))[TEST_LEN - 2] = c ^ 0x1; // corrupt buffer: flip the lowest bit
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nFail corrupt buffer test, passed when should have failed\n");
+ }
+ ((char *)(buffs[0]))[TEST_LEN - 2] = c; // un-corrupt buffer
+
+ // Test corrupted buffer any location on all sources w/ random data
+ for (j = 0; j < TEST_SOURCES + 2; j++) {
+ for (i = TEST_LEN - 1; i >= 0; i--) {
+ // Check it still passes
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret != 0) { // should pass
+ fail++;
+ printf
+ ("\nFail rand test with un-corrupted buffer j=%d, i=%d\n",
+ j, i);
+ return 1;
+ }
+ c = ((char *)buffs[j])[i];
+ ((char *)buffs[j])[i] = c ^ 1; // corrupt buffer
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs);
+ if (ret == 0) { // Check it now fails
+ fail++;
+ printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i);
+ return 1;
+ }
+ ((char *)buffs[j])[i] = c; // un-corrupt buffer
+ }
+ putchar('.');
+ }
+
+ // Test various number of sources, full length
+ for (j = 4; j <= TEST_SOURCES + 2; j++) { // j counts all vectors: j - 2 sources plus P and Q
+ // New random data
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ // Generate p,q parity for this number of sources
+ ref_multi_pq(j, TEST_LEN, buffs);
+
+ // Set errors up in each source and len position
+ for (i = 0; i < j; i++) {
+ for (k = 0; k < TEST_LEN; k++) {
+ // See if it still passes
+ ret = pq_check(j, TEST_LEN, buffs);
+ if (ret != 0) { // Should pass
+ printf("\nfail rand fixed len test %d sources\n", j);
+ fail++;
+ return 1;
+ }
+
+ c = ((char *)buffs[i])[k];
+ ((char *)buffs[i])[k] = c ^ 1; // corrupt buffer
+
+ ret = pq_check(j, TEST_LEN, buffs);
+ if (ret == 0) { // Should fail
+ printf
+ ("\nfail rand fixed len test corrupted buffer %d sources\n",
+ j);
+ fail++;
+ return 1;
+ }
+ ((char *)buffs[i])[k] = c; // un-corrupt buffer
+ }
+ }
+ putchar('.');
+ }
+
+ fflush(0); // null stream argument: flush all open output streams
+
+ // Test various number of sources and len
+ k = 16; // lengths 16, 32, ..., TEST_LEN (k grows by 16 per pass)
+ while (k <= TEST_LEN) {
+ char *tmp;
+ for (j = 4; j <= TEST_SOURCES + 2; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], k);
+
+ // Generate p,q parity for this number of sources
+ ref_multi_pq(j, k, buffs);
+
+ // Inject errors at various source and len positions
+ for (lerr = 0; lerr < k; lerr++) {
+ for (serr = 0; serr < j; serr++) {
+ // See if it still passes
+ ret = pq_check(j, k, buffs);
+ if (ret != 0) { // Should pass
+ printf
+ ("\nfail rand var src, len test %d sources, len=%d\n",
+ j, k);
+ fail++;
+ return 1;
+ }
+
+ tmp = (char *)buffs[serr];
+ c = tmp[lerr];
+ ((char *)buffs[serr])[lerr] = c ^ 1; // corrupt buffer
+
+ ret = pq_check(j, k, buffs);
+ if (ret == 0) { // Should fail
+ printf
+ ("\nfail rand var src, len test corrupted buffer "
+ "%d sources, len=%d, ret=%d\n", j, k,
+ ret);
+ fail++;
+ return 1;
+ }
+ ((char *)buffs[serr])[lerr] = c; // un-corrupt buffer
+ }
+ }
+ putchar('.');
+ fflush(0);
+ }
+ k += 16;
+ }
+
+ // Test at the end of buffer: the window [i, TEST_LEN) of every allocation is checked
+ for (i = 0; i < TEST_LEN; i += 16) {
+ for (j = 0; j < TEST_SOURCES + 2; j++) {
+ rand_buffer(buffs[j], TEST_LEN - i); // NOTE(review): fills from the start of buffs[j], not from offset i
+ tmp_buf[j] = (char *)buffs[j] + i;
+ }
+
+ pq_gen_base(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf); // parity comes from pq_gen_base here, not ref_multi_pq
+
+ // Test good data
+ ret = pq_check(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf);
+ if (ret != 0) {
+ printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i);
+ fail++;
+ return 1;
+ }
+ // Test bad data
+ for (serr = 0; serr < TEST_SOURCES + 2; serr++) {
+ for (lerr = 0; lerr < (TEST_LEN - i); lerr++) {
+ c = tmp_buf[serr][lerr];
+ tmp_buf[serr][lerr] = c ^ 1;
+
+ ret =
+ pq_check(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf);
+ if (ret == 0) {
+ printf("fail end test corrupted buffer - "
+ "offset: %d, len: %d, ret: %d\n", i,
+ TEST_LEN - i, ret);
+ fail++;
+ return 1;
+ }
+
+ tmp_buf[serr][lerr] = c;
+ }
+ }
+
+ putchar('.');
+ fflush(0);
+ }
+
+ if (fail == 0)
+ printf("Pass\n");
+
+ return fail;
+
+}
diff --git a/src/isa-l/raid/pq_gen_avx.asm b/src/isa-l/raid/pq_gen_avx.asm
new file mode 100644
index 000000000..db4bcfb1c
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_avx.asm
@@ -0,0 +1,254 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX
+;;; int pq_gen_avx(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64 ; argument registers in System V AMD64 order
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x: endbranch
+ %define FUNC_SAVE ; empty on elf64: nothing is spilled
+ %define FUNC_RESTORE ; empty on elf64
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64 ; argument registers in Microsoft x64 order
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 8*16 + 8 ; 8 xmm save slots + 8; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6 ; spill every register from xmm6 upward that this file uses
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm14
+ vmovdqa [rsp + 7*16], xmm15
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16] ; reload in the same slot order as FUNC_SAVE
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm14, [rsp + 6*16]
+ vmovdqa xmm15, [rsp + 7*16]
+ add rsp, stack_size ; release the save area
+ %endmacro
+%endif
+
+%define vec arg0 ; vector count on entry; holds the index of the last source after 'sub vec, 3'
+%define len arg1 ; length in bytes
+%define ptr arg3 ; scratch: pointer to the buffer currently being accessed
+%define pos rax ; byte offset into every buffer (same register as 'return')
+
+%define xp1 xmm0 ; lane 1: P accumulator
+%define xq1 xmm1 ; lane 1: Q accumulator
+%define xtmp1 xmm2 ; lane 1: masked polynomial
+%define xs1 xmm3 ; lane 1: source data
+
+%define xp2 xmm4 ; lane 2 (bytes 16..31 of a 48-byte block)
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+%define xp3 xmm8 ; lane 3 (bytes 32..47 of a 48-byte block)
+%define xq3 xmm9
+%define xtmp3 xmm10
+%define xs3 xmm11
+
+%define xzero xmm14 ; all-zero constant
+%define xpoly xmm15 ; 0x1d in every byte, loaded from 'poly'
+
+;;; Use non-temporal load/store unless NO_NT_LDST is defined
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+mk_global pq_gen_avx, function
+func(pq_gen_avx)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass ;Zero length: nothing to do, return 0
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail ;Length not a multiple of 16: return 1
+ mov pos, 0
+ vmovdqa xpoly, [poly] ;xpoly = 0x1d in every byte
+ vpxor xzero, xzero, xzero ;xzero = 0
+ cmp len, 48
+ jl loop16 ;Less than one 48-byte block: use the 16-byte loop only
+
+len_aligned_32bytes: ;(label is not referenced in this file)
+ sub len, 48 ;Len points to last block
+
+loop48: ;Outer loop: 3 x 16 bytes of P and Q per pass
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+ XLDR xs3, [ptr+pos+32] ;Preload last vector (source)
+ vpxor xp1, xp1, xp1 ;p1 = 0
+ vpxor xp2, xp2, xp2 ;p2 = 0
+ vpxor xp3, xp3, xp3 ;p3 = 0
+ vpxor xq1, xq1, xq1 ;q1 = 0
+ vpxor xq2, xq2, xq2 ;q2 = 0
+ vpxor xq3, xq3, xq3 ;q3 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxor xq1, xq1, xs1 ; q1 ^= s1
+ vpxor xq2, xq2, xs2 ; q2 ^= s2
+ vpxor xq3, xq3, xs3 ; q3 ^= s3
+ vpxor xp1, xp1, xs1 ; p1 ^= s1
+ vpxor xp2, xp2, xs2 ; p2 ^= s2
+ vpxor xp3, xp3, xs3 ; p3 ^= s3
+ vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00, per byte by bit 7 of q1
+ vpblendvb xtmp2, xzero, xpoly, xq2 ; xtmp2 = poly or 0x00
+ vpblendvb xtmp3, xzero, xpoly, xq3 ; xtmp3 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ XLDR xs3, [ptr+pos+32] ; Get next vector (source data3)
+ vpaddb xq1, xq1, xq1 ; q1 = q1<<1
+ vpaddb xq2, xq2, xq2 ; q2 = q2<<1
+ vpaddb xq3, xq3, xq3 ; q3 = q3<<1
+ vpxor xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ vpxor xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ vpxor xq3, xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0 (uses flags of 'sub tmp, 1'; nothing in between writes flags)
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxor xp1, xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ vpxor xq1, xq1, xs1 ;q1 ^= 1 * s1[0]
+ vpxor xp2, xp2, xs2 ;p2 ^= s2[0]
+ vpxor xq2, xq2, xs2 ;q2 ^= 1 * s2[0]
+ vpxor xp3, xp3, xs3 ;p3 ^= s3[0]
+ vpxor xq3, xq3, xs3 ;q3 ^= 1 * s3[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+16], xp2 ;Write parity P2 vector
+ XSTR [ptr+pos+32], xp3 ;Write parity P3 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector
+ XSTR [tmp+pos+32], xq3 ;Write parity Q3 vector
+ add pos, 48
+ cmp pos, len
+ jle loop48 ;Continue while a full 48-byte block remains
+
+ ;; ------------------------------
+ ;; Do last 16 or 32 Bytes remaining
+ add len, 48 ;Restore the real length
+ cmp pos, len
+ je return_pass ;No remainder
+
+loop16: ;Remainder loop: 16 bytes of P and Q per pass
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ vpxor xp1, xp1, xp1 ;p = 0
+ vpxor xq1, xq1, xq1 ;q = 0
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxor xq1, xq1, xs1 ; q1 ^= s1
+ vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+ vpxor xp1, xp1, xs1 ; p ^= s
+ vpaddb xq1, xq1, xq1 ; q = q<<1
+ vpxor xq1, xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0 (uses flags of 'sub tmp, 1')
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxor xp1, xp1, xs1 ;p ^= s[0] - last source is already loaded
+ vpxor xq1, xq1, xs1 ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1 ;Write parity P vector
+ XSTR [tmp+pos], xq1 ;Write parity Q vector
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0 ;Success (also overwrites pos, which shares rax)
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1 ;Too few vectors or unaligned length
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly: ; 16 bytes of 0x1d; loaded into xpoly with vmovdqa, hence the 'align 16' above
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_gen_avx, 02, 0a, 0039
diff --git a/src/isa-l/raid/pq_gen_avx2.asm b/src/isa-l/raid/pq_gen_avx2.asm
new file mode 100644
index 000000000..a0bf0cc40
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_avx2.asm
@@ -0,0 +1,256 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX2
+;;; int pq_gen_avx2(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 32 bytes. Length must be 32 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64 ; argument registers in System V AMD64 order
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x: endbranch
+ %define FUNC_SAVE ; empty on elf64: nothing is spilled
+ %define FUNC_RESTORE ; empty on elf64
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64 ; argument registers in Microsoft x64 order
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 8*32 + 8 ; 8 ymm save slots + 8; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ ;; Until a sav_ymm256 is defined
+ vmovdqu [rsp + 0*32], ymm6 ; full 256-bit spill of every register from ymm6 upward that this file uses
+ vmovdqu [rsp + 1*32], ymm7
+ vmovdqu [rsp + 2*32], ymm8
+ vmovdqu [rsp + 3*32], ymm9
+ vmovdqu [rsp + 4*32], ymm10
+ vmovdqu [rsp + 5*32], ymm11
+ vmovdqu [rsp + 6*32], ymm14
+ vmovdqu [rsp + 7*32], ymm15
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqu ymm6, [rsp + 0*32] ; reload in the same slot order as FUNC_SAVE
+ vmovdqu ymm7, [rsp + 1*32]
+ vmovdqu ymm8, [rsp + 2*32]
+ vmovdqu ymm9, [rsp + 3*32]
+ vmovdqu ymm10, [rsp + 4*32]
+ vmovdqu ymm11, [rsp + 5*32]
+ vmovdqu ymm14, [rsp + 6*32]
+ vmovdqu ymm15, [rsp + 7*32]
+ add rsp, stack_size ; release the save area
+ %endmacro
+%endif
+
+%define vec arg0 ; vector count on entry; holds the index of the last source after 'sub vec, 3'
+%define len arg1 ; length in bytes
+%define ptr arg3 ; scratch: pointer to the buffer currently being accessed
+%define pos rax ; byte offset into every buffer (same register as 'return')
+
+%define xp1 ymm0 ; lane 1: P accumulator
+%define xq1 ymm1 ; lane 1: Q accumulator
+%define xtmp1 ymm2 ; lane 1: masked polynomial
+%define xs1 ymm3 ; lane 1: source data
+
+%define xp2 ymm4 ; lane 2 (bytes 32..63 of a 96-byte block)
+%define xq2 ymm5
+%define xtmp2 ymm6
+%define xs2 ymm7
+
+%define xp3 ymm8 ; lane 3 (bytes 64..95 of a 96-byte block)
+%define xq3 ymm9
+%define xtmp3 ymm10
+%define xs3 ymm11
+
+%define xzero ymm14 ; all-zero constant
+%define xpoly ymm15 ; 0x1d in every byte, loaded from 'poly'
+
+;;; Use non-temporal load/store unless NO_NT_LDST is defined
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+mk_global pq_gen_avx2, function
+func(pq_gen_avx2)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass ;Zero length: nothing to do, return 0
+ test len, (32-1) ;Check alignment of length
+ jnz return_fail ;Length not a multiple of 32: return 1
+ mov pos, 0
+ vmovdqa xpoly, [poly] ;xpoly = 0x1d in every byte
+ vpxor xzero, xzero, xzero ;xzero = 0
+ cmp len, 96
+ jl loop32 ;Less than one 96-byte block: use the 32-byte loop only
+
+len_aligned_32bytes: ;(label is not referenced in this file)
+ sub len, 3*32 ;Len points to last block
+
+loop96: ;Outer loop: 3 x 32 bytes of P and Q per pass
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+32] ;Preload last vector (source)
+ XLDR xs3, [ptr+pos+64] ;Preload last vector (source)
+ vpxor xp1, xp1, xp1 ;p1 = 0
+ vpxor xp2, xp2, xp2 ;p2 = 0
+ vpxor xp3, xp3, xp3 ;p3 = 0
+ vpxor xq1, xq1, xq1 ;q1 = 0
+ vpxor xq2, xq2, xq2 ;q2 = 0
+ vpxor xq3, xq3, xq3 ;q3 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxor xq1, xq1, xs1 ; q1 ^= s1
+ vpxor xq2, xq2, xs2 ; q2 ^= s2
+ vpxor xq3, xq3, xs3 ; q3 ^= s3
+ vpxor xp1, xp1, xs1 ; p1 ^= s1
+ vpxor xp2, xp2, xs2 ; p2 ^= s2
+ vpxor xp3, xp3, xs3 ; p3 ^= s3
+ vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00, per byte by bit 7 of q1
+ vpblendvb xtmp2, xzero, xpoly, xq2 ; xtmp2 = poly or 0x00
+ vpblendvb xtmp3, xzero, xpoly, xq3 ; xtmp3 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+32] ; Get next vector (source data2)
+ XLDR xs3, [ptr+pos+64] ; Get next vector (source data3)
+ vpaddb xq1, xq1, xq1 ; q1 = q1<<1
+ vpaddb xq2, xq2, xq2 ; q2 = q2<<1
+ vpaddb xq3, xq3, xq3 ; q3 = q3<<1
+ vpxor xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ vpxor xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ vpxor xq3, xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0 (uses flags of 'sub tmp, 1'; nothing in between writes flags)
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxor xp1, xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ vpxor xq1, xq1, xs1 ;q1 ^= 1 * s1[0]
+ vpxor xp2, xp2, xs2 ;p2 ^= s2[0]
+ vpxor xq2, xq2, xs2 ;q2 ^= 1 * s2[0]
+ vpxor xp3, xp3, xs3 ;p3 ^= s3[0]
+ vpxor xq3, xq3, xs3 ;q3 ^= 1 * s3[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+32], xp2 ;Write parity P2 vector
+ XSTR [ptr+pos+64], xp3 ;Write parity P3 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+32], xq2 ;Write parity Q2 vector
+ XSTR [tmp+pos+64], xq3 ;Write parity Q3 vector
+ add pos, 3*32
+ cmp pos, len
+ jle loop96 ;Continue while a full 96-byte block remains
+
+ ;; ------------------------------
+ ;; Do last 32 or 64 Bytes remaining
+ add len, 3*32 ;Restore the real length
+ cmp pos, len
+ je return_pass ;No remainder
+
+loop32: ;Remainder loop: 32 bytes of P and Q per pass
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ vpxor xp1, xp1, xp1 ;p = 0
+ vpxor xq1, xq1, xq1 ;q = 0
+
+next_vect32:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxor xq1, xq1, xs1 ; q1 ^= s1
+ vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+ vpxor xp1, xp1, xs1 ; p ^= s
+ vpaddb xq1, xq1, xq1 ; q = q<<1
+ vpxor xq1, xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect32 ; Loop for each vect except 0 (uses flags of 'sub tmp, 1')
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxor xp1, xp1, xs1 ;p ^= s[0] - last source is already loaded
+ vpxor xq1, xq1, xs1 ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1 ;Write parity P vector
+ XSTR [tmp+pos], xq1 ;Write parity Q vector
+ add pos, 32
+ cmp pos, len
+ jl loop32
+
+
+return_pass:
+ mov return, 0 ;Success (also overwrites pos, which shares rax)
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1 ;Too few vectors or unaligned length
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 32
+poly: ; 32 bytes of 0x1d; loaded into xpoly with vmovdqa, hence the 'align 32' above
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_gen_avx2, 04, 03, 0041
diff --git a/src/isa-l/raid/pq_gen_avx512.asm b/src/isa-l/raid/pq_gen_avx512.asm
new file mode 100644
index 000000000..179ad5c28
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_avx512.asm
@@ -0,0 +1,235 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX512
+;;; int pq_gen_avx512(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 64 bytes if NO_NT_LDST is not defined.
+;;; Length must be 32 byte multiple.
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64 ; argument registers in System V AMD64 order
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x: endbranch
+ %define FUNC_SAVE ; empty on elf64: nothing is spilled
+ %define FUNC_RESTORE ; empty on elf64
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64 ; argument registers in Microsoft x64 order
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 4*16 + 8 ; 4 xmm save slots + 8; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqu [rsp + 0*16], xmm6 ; xmm6-xmm9 are the low halves of zmm6-zmm9, the highest registers this file uses
+ vmovdqu [rsp + 1*16], xmm7
+ vmovdqu [rsp + 2*16], xmm8
+ vmovdqu [rsp + 3*16], xmm9
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqu xmm6, [rsp + 0*16] ; reload in the same slot order as FUNC_SAVE
+ vmovdqu xmm7, [rsp + 1*16]
+ vmovdqu xmm8, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 3*16]
+ add rsp, stack_size ; release the save area
+ %endmacro
+%endif
+
+%define vec arg0 ; vector count on entry; holds the index of the last source after 'sub vec, 3'
+%define len arg1 ; length in bytes
+%define ptr arg3 ; scratch: pointer to the buffer currently being accessed
+%define pos rax ; byte offset into every buffer (same register as 'return')
+
+%define xp1 zmm0 ; lane 1: P accumulator
+%define xq1 zmm1 ; lane 1: Q accumulator
+%define xtmp1 zmm2 ; lane 1: masked polynomial
+%define xs1 zmm3 ; lane 1: source data
+
+%define xp2 zmm4 ; lane 2 (bytes 64..127 of a 128-byte block)
+%define xq2 zmm5
+%define xtmp2 zmm6
+%define xs2 zmm7
+
+%define xzero zmm8 ; all-zero constant
+%define xpoly zmm9 ; 0x1d in every byte
+
+%define xp1y ymm0 ; 256-bit views of the lane 1 registers, used by the 32-byte remainder loop
+%define xq1y ymm1
+%define xtmp1y ymm2
+%define xs1y ymm3
+%define xzeroy ymm8
+%define xpolyy ymm9
+
+%define NO_NT_LDST
+;;; NO_NT_LDST is defined just above, so the unaligned vmovdqu8 forms are always selected
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqu8 ;u8
+ %define XSTR vmovdqu8
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+mk_global pq_gen_avx512, function
+func(pq_gen_avx512)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass ;Zero length: nothing to do, return 0
+ test len, (32-1) ;Check alignment of length
+ jnz return_fail ;Length not a multiple of 32: return 1
+ mov pos, 0
+ mov tmp, 0x1d
+ vpbroadcastb xpoly, tmp ;xpoly = 0x1d in every byte
+ vpxorq xzero, xzero, xzero ;xzero = 0
+ cmp len, 128
+ jl loop32 ;Less than one 128-byte block: use the 32-byte loop only
+
+len_aligned_32bytes: ;(label is not referenced in this file)
+ sub len, 2*64 ;Len points to last block
+
+loop128: ;Outer loop: 2 x 64 bytes of P and Q per pass
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+64] ;Preload last vector (source)
+ vpxorq xp1, xp1, xp1 ;p1 = 0
+ vpxorq xp2, xp2, xp2 ;p2 = 0
+ vpxorq xq1, xq1, xq1 ;q1 = 0
+ vpxorq xq2, xq2, xq2 ;q2 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxorq xq1, xq1, xs1 ; q1 ^= s1
+ vpxorq xq2, xq2, xs2 ; q2 ^= s2
+ vpxorq xp1, xp1, xs1 ; p1 ^= s1
+ vpxorq xp2, xp2, xs2 ; p2 ^= s2
+ vpcmpb k1, xq1, xzero, 1 ; k1 = bytes of q1 that are < 0 as signed, i.e. bit 7 set
+ vpcmpb k2, xq2, xzero, 1 ; k2 = same test on q2
+ vpblendmb xtmp1 {k1}, xzero, xpoly ; xtmp1 = poly where k1 is set, else 0x00
+ vpblendmb xtmp2 {k2}, xzero, xpoly ; xtmp2 = poly where k2 is set, else 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+64] ; Get next vector (source data2)
+ vpaddb xq1, xq1, xq1 ; q1 = q1<<1
+ vpaddb xq2, xq2, xq2 ; q2 = q2<<1
+ vpxorq xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ vpxorq xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0 (uses flags of 'sub tmp, 1'; nothing in between writes flags)
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxorq xp1, xp1, xs1 ;p1 ^= s1[0] - last source is already loaded
+ vpxorq xq1, xq1, xs1 ;q1 ^= 1 * s1[0]
+ vpxorq xp2, xp2, xs2 ;p2 ^= s2[0]
+ vpxorq xq2, xq2, xs2 ;q2 ^= 1 * s2[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+64], xp2 ;Write parity P2 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+64], xq2 ;Write parity Q2 vector
+ add pos, 2*64
+ cmp pos, len
+ jle loop128 ;Continue while a full 128-byte block remains
+
+ ;; ------------------------------
+ ;; Do the remaining bytes (fewer than 128) in 32-byte steps
+ add len, 2*64 ;Restore the real length
+ cmp pos, len
+ je return_pass ;No remainder
+
+loop32: ;Remainder loop: 32 bytes of P and Q per pass, using the ymm views
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1y, [ptr+pos] ;Preload last vector (source)
+ vpxorq xp1y, xp1y, xp1y ;p = 0
+ vpxorq xq1y, xq1y, xq1y ;q = 0
+
+next_vect32:
+ sub tmp, 1 ;Inner loop for each source vector
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ vpxorq xq1y, xq1y, xs1y ; q1 ^= s1
+ vpblendvb xtmp1y, xzeroy, xpolyy, xq1y ; xtmp1 = poly or 0x00
+ vpxorq xp1y, xp1y, xs1y ; p ^= s
+ vpaddb xq1y, xq1y, xq1y ; q = q<<1
+ vpxorq xq1y, xq1y, xtmp1y ; q = q<<1 ^ poly_masked
+ XLDR xs1y, [ptr+pos] ; Get next vector (source data)
+ jg next_vect32 ; Loop for each vect except 0 (uses flags of 'sub tmp, 1')
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ vpxorq xp1y, xp1y, xs1y ;p ^= s[0] - last source is already loaded
+ vpxorq xq1y, xq1y, xs1y ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1y ;Write parity P vector
+ XSTR [tmp+pos], xq1y ;Write parity Q vector
+ add pos, 32
+ cmp pos, len
+ jl loop32
+
+
+return_pass:
+ mov return, 0 ;Success (also overwrites pos, which shares rax)
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1 ;Too few vectors or unaligned length
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+%endif ; ifdef HAVE_AS_KNOWS_AVX512
diff --git a/src/isa-l/raid/pq_gen_perf.c b/src/isa-l/raid/pq_gen_perf.c
new file mode 100644
index 000000000..7315c82b3
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_perf.c
@@ -0,0 +1,88 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include<sys/time.h>
+#include "raid.h"
+#include "test.h"
+
+// Benchmark sizing.
+//  - CACHED_TEST defined: small buffers ("_warm" label).
+//  - default:             buffers sized so the working set exceeds GT_L3_CACHE
+//                         ("_cold" label); TEST_LEN is rounded down to a
+//                         multiple of 64 bytes.
+//  - TEST_CUSTOM defined: the builder must supply TEST_SOURCES and TEST_LEN
+//                         ("_cus" label).
+// Every arithmetic macro body is parenthesized so it expands safely inside
+// any surrounding expression.
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_SOURCES 10
+# define TEST_LEN (8*1024)
+# define TEST_TYPE_STR "_warm"
+#else
+# ifndef TEST_CUSTOM
+// Uncached test. Pull from large mem base.
+# define TEST_SOURCES 10
+# define GT_L3_CACHE (32*1024*1024) /* some number > last level cache */
+# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
+# define TEST_TYPE_STR "_cold"
+# else
+# define TEST_TYPE_STR "_cus"
+# endif
+#endif
+
+// Total bytes touched per pq_gen call: every source plus the P and Q outputs
+#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN))
+
+// Benchmark driver: allocates TEST_SOURCES + 2 zero-filled buffers, runs
+// pq_gen over them through the BENCHMARK macro and prints the result with
+// perf_print.  Returns 0 on completion, 1 if a buffer cannot be allocated.
+int main(int argc, char *argv[])
+{
+	void *buffs[TEST_SOURCES + 2];	// sources followed by the P and Q outputs
+	struct perf start;
+	int idx;
+
+	printf("Test pq_gen_perf %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN);
+
+	// One 64-byte aligned buffer of TEST_LEN bytes per vector
+	for (idx = 0; idx < TEST_SOURCES + 2; idx++) {
+		void *mem;
+
+		if (posix_memalign(&mem, 64, TEST_LEN) != 0) {
+			printf("alloc error: Fail");
+			return 1;
+		}
+		buffs[idx] = mem;
+	}
+
+	// Zero every buffer so the run works on defined data
+	idx = 0;
+	while (idx < TEST_SOURCES + 2)
+		memset(buffs[idx++], 0, TEST_LEN);
+
+	// Measure pq_gen and report against the total bytes processed
+	BENCHMARK(&start, BENCHMARK_TIME, pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs));
+	printf("pq_gen" TEST_TYPE_STR ": ");
+	perf_print(start, (long long)TEST_MEM);
+
+	return 0;
+}
diff --git a/src/isa-l/raid/pq_gen_sse.asm b/src/isa-l/raid/pq_gen_sse.asm
new file mode 100644
index 000000000..b6d51481b
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_sse.asm
@@ -0,0 +1,258 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using SSE3
+;;; int pq_gen_sse(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+;;; ELF64 output: arg0..arg5 name the System V AMD64 integer argument
+;;; registers, tmp is a scratch register and the result is returned in rax.
+;;; FUNC_SAVE and FUNC_RESTORE expand to nothing for this format, so the
+;;; routine has no prologue or epilogue of its own.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define return rax
+ %define func(x) x: endbranch
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; Win64 output: arg0..arg3 name the Microsoft x64 integer argument
+;;; registers.  The routine uses xmm6-xmm11 and xmm15, which this calling
+;;; convention treats as callee-saved, so FUNC_SAVE reserves stack_size bytes
+;;; and spills those seven registers; FUNC_RESTORE reloads them and releases
+;;; the stack space.
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define return rax
+ %define stack_size 7*16 + 8 ; seven 16-byte xmm slots + 8; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm15, 6*16
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm15, [rsp + 6*16]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+;;; Symbolic names used by pq_gen_sse below.
+;;;   vec - first argument; after the entry adjustment it holds vects-3,
+;;;         the index of the last source pointer
+;;;   len - second argument, byte length of each vector
+;;;   ptr - pointer to the vector currently being read or written
+;;;   pos - byte offset into every vector (shares rax with the return value)
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos rax
+
+;;; Three independent 16-byte lanes, each with a P accumulator (xp), a Q
+;;; accumulator (xq), a scratch mask (xtmp) and the current source data (xs).
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+%define xp3 xmm8
+%define xq3 xmm9
+%define xtmp3 xmm10
+%define xs3 xmm11
+
+;;; Holds the 16-byte constant loaded from [poly]
+%define xpoly xmm15
+
+;;; Use non-temporal load/store unless NO_NT_LDST is defined, in which case
+;;; plain aligned moves are used instead
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+;;; pq_gen_sse(vects, len, array)
+;;;   Reads the sources array[0] .. array[vects-3] and writes parity P through
+;;;   array[vects-2] and parity Q through array[vects-1].
+;;;   Returns 0 on success (len == 0 succeeds without touching memory) and 1
+;;;   when vects <= 3 or len is not a multiple of 16.
+;;;   Sources are visited from the last one down to source 0.  For every
+;;;   source except source 0 the step is  p ^= s ; q = (q ^ s) * 2  where the
+;;;   doubling is a per-byte shift left with poly XORed into each byte whose
+;;;   top bit was set.  Source 0 is then XORed into both p and q unshifted.
+;;;   The buffer is processed 48 bytes per pass (loop48) while at least 48
+;;;   bytes remain, then 16 bytes per pass (loop16).
+align 16
+mk_global pq_gen_sse, function
+func(pq_gen_sse)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+ movdqa xpoly, [poly]
+ cmp len, 48
+ jl loop16 ;Fewer than 48 bytes: 16-byte loop only
+
+len_aligned_32bytes: ;Reached whenever len >= 48, whatever the label name suggests
+ sub len, 48 ;Len points to last block
+
+loop48:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+ XLDR xs3, [ptr+pos+32] ;Preload last vector (source)
+ pxor xp1, xp1 ;p1 = 0
+ pxor xp2, xp2 ;p2 = 0
+ pxor xp3, xp3 ;p3 = 0
+ pxor xq1, xq1 ;q1 = 0
+ pxor xq2, xq2 ;q2 = 0
+ pxor xq3, xq3 ;q3 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector; sets the flags tested by jg below
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xq2, xs2 ; q2 ^= s2
+ pxor xq3, xs3 ; q3 ^= s3
+ pxor xp1, xs1 ; p1 ^= s1
+ pxor xp2, xs2 ; p2 ^= s2
+ pxor xp3, xs3 ; p3 ^= s3
+ pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0
+ pxor xtmp2, xtmp2 ; xtmp2 = 0
+ pxor xtmp3, xtmp3 ; xtmp3 = 0
+ pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp3, xq3 ; xtmp3 = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp1 = poly or 0x00
+ pand xtmp2, xpoly ; xtmp2 = poly or 0x00
+ pand xtmp3, xpoly ; xtmp3 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ XLDR xs3, [ptr+pos+32] ; Get next vector (source data3)
+ paddb xq1, xq1 ; q1 = q1<<1
+ paddb xq2, xq2 ; q2 = q2<<1
+ paddb xq3, xq3 ; q3 = q3<<1
+ pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ pxor xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0 (flags from "sub tmp, 1"; mov and SSE ops leave them alone)
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ pxor xp1, xs1 ;p1 ^= s1[0] - source 0 is already loaded
+ pxor xq1, xs1 ;q1 ^= 1 * s1[0]
+ pxor xp2, xs2 ;p2 ^= s2[0]
+ pxor xq2, xs2 ;q2 ^= 1 * s2[0]
+ pxor xp3, xs3 ;p3 ^= s3[0]
+ pxor xq3, xs3 ;q3 ^= 1 * s3[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+16], xp2 ;Write parity P2 vector
+ XSTR [ptr+pos+32], xp3 ;Write parity P3 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector
+ XSTR [tmp+pos+32], xq3 ;Write parity Q3 vector
+ add pos, 48
+ cmp pos, len ;len was reduced by 48, so <= means a full 48-byte block remains
+ jle loop48
+
+ ;; ------------------------------
+ ;; Do last 16 or 32 Bytes remaining
+ add len, 48 ;Restore the true length
+ cmp pos, len
+ je return_pass
+
+loop16:
+ mov ptr, [arg2+vec*8] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ pxor xp1, xp1 ;p = 0
+ pxor xq1, xq1 ;q = 0
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector; sets the flags tested by jg below
+ mov ptr, [arg2+tmp*8] ; get pointer to next vect
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xtmp1, xtmp1 ; xtmp = 0
+ pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp = poly or 0x00
+ pxor xp1, xs1 ; p ^= s
+ paddb xq1, xq1 ; q = q<<1
+ pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0
+
+ mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
+ mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
+ pxor xp1, xs1 ;p ^= s[0] - source 0 is already loaded
+ pxor xq1, xs1 ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1 ;Write parity P vector
+ XSTR [tmp+pos], xq1 ;Write parity Q vector
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+;;; Sixteen bytes of 0x1d, loaded into xpoly by pq_gen_sse.  It is XORed into
+;;; every Q byte whose top bit was set before the byte was doubled.
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_gen_sse, 00, 09, 0032
diff --git a/src/isa-l/raid/pq_gen_sse_i32.asm b/src/isa-l/raid/pq_gen_sse_i32.asm
new file mode 100644
index 000000000..8dabb783f
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_sse_i32.asm
@@ -0,0 +1,264 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using SSE3
+;;; int pq_gen_sse(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+;;; Per-output-format register map and prologue/epilogue.
+;;;   PS           - pointer size in bytes, used to index the pointer array
+;;;   FUNC_SAVE    - prologue: preserves callee-saved state the routine uses
+;;;   FUNC_RESTORE - matching epilogue, run before every ret
+
+;;; ELF64: arguments arrive in the System V AMD64 registers and nothing
+;;; needs saving.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define return rax
+ %define PS 8
+ %define func(x) x: endbranch
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+;;; Win64: xmm6, xmm7 and xmm15 are callee-saved under this calling
+;;; convention and the routine writes all three (xmm15 is xpoly when PS is 8),
+;;; so each one gets a 16-byte spill slot.
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define return rax
+ %define PS 8
+ %define tmp r10
+ %define stack_size 3*16 + 8 ; three xmm slots + 8; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm15, 2*16
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm15, [rsp + 2*16]
+ add rsp, stack_size
+ %endmacro
+
+;;; ELF32: arguments are read from the stack relative to ebp.  FUNC_SAVE
+;;; builds a frame, preserves esi, edi and ebx, and loads the three arguments
+;;; into edx, ecx and edi.
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0 edx
+ %define arg1 ecx
+ %define return eax
+ %define PS 4
+ %define func(x) x: endbranch
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2 edi ; must sav/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg0, arg(0)
+ mov arg1, arg(1)
+ mov arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp ;if has frame pointer?
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
+;;; Symbolic names used by pq_gen_sse below.
+;;;   vec - first argument; after the entry adjustment it holds vects-3,
+;;;         the index of the last source pointer
+;;;   len - second argument, byte length of each vector
+;;;   ptr - pointer to the vector currently being read or written
+;;;   pos - byte offset into every vector (shares the return register)
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos return
+
+;;; Two independent 16-byte lanes, each with a P accumulator (xp), a Q
+;;; accumulator (xq), a scratch mask (xtmp) and the current source data (xs).
+%define xp1 xmm0
+%define xq1 xmm1
+%define xtmp1 xmm2
+%define xs1 xmm3
+
+%define xp2 xmm4
+%define xq2 xmm5
+%define xtmp2 xmm6
+%define xs2 xmm7
+
+;;; 64-bit builds keep the poly constant in xmm15; 32-bit builds use it as a
+;;; memory operand straight from [poly].
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+ %define xpoly xmm15
+%elifidn PS,4 ; 32-bit code
+ %define xpoly [poly]
+%endif
+
+;;; Use non-temporal load/store unless NO_NT_LDST is defined, in which case
+;;; plain aligned moves are used instead
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+;;; pq_gen_sse(vects, len, array)
+;;;   Reads the sources array[0] .. array[vects-3] and writes parity P through
+;;;   array[vects-2] and parity Q through array[vects-1].
+;;;   Returns 0 on success (len == 0 succeeds without touching memory) and 1
+;;;   when vects <= 3 or len is not a multiple of 16.
+;;;   Sources are visited from the last one down to source 0.  For every
+;;;   source except source 0 the step is  p ^= s ; q = (q ^ s) * 2  where the
+;;;   doubling is a per-byte shift left with poly XORed into each byte whose
+;;;   top bit was set.  Source 0 is then XORed into both p and q unshifted.
+;;;   The buffer is processed 32 bytes per pass (loop32) while at least 32
+;;;   bytes remain, then 16 bytes per pass (loop16).
+align 16
+mk_global pq_gen_sse, function
+func(pq_gen_sse)
+ FUNC_SAVE
+ sub vec, 3 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (16-1) ;Check alignment of length
+ jnz return_fail
+ mov pos, 0
+%ifidn PS,8
+ movdqa xpoly, [poly] ;For 64-bit, load poly into high xmm reg
+%endif
+ cmp len, 32
+ jl loop16 ;Fewer than 32 bytes: 16-byte loop only
+
+len_aligned_32bytes:
+ sub len, 32 ;Len points to last block; pos still walks forward from 0
+
+loop32:
+ mov ptr, [arg2+vec*PS] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ XLDR xs2, [ptr+pos+16] ;Preload last vector (source)
+ pxor xp1, xp1 ;p1 = 0
+ pxor xq1, xq1 ;q1 = 0
+ pxor xp2, xp2 ;p2 = 0
+ pxor xq2, xq2 ;q2 = 0
+
+next_vect:
+ sub tmp, 1 ;Inner loop for each source vector; sets the flags tested by jg below
+ mov ptr, [arg2+tmp*PS] ; get pointer to next vect
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xq2, xs2 ; q2 ^= s2
+ pxor xp1, xs1 ; p1 ^= s1
+ pxor xp2, xs2 ; p2 ^= s2
+ pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0
+ pxor xtmp2, xtmp2 ; xtmp2 = 0
+ pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set
+ pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp1 = poly or 0x00
+ pand xtmp2, xpoly ; xtmp2 = poly or 0x00
+ XLDR xs1, [ptr+pos] ; Get next vector (source data1)
+ XLDR xs2, [ptr+pos+16] ; Get next vector (source data2)
+ paddb xq1, xq1 ; q1 = q1<<1
+ paddb xq2, xq2 ; q2 = q2<<1
+ pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
+ pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
+ jg next_vect ; Loop for each vect except 0 (flags from "sub tmp, 1"; mov and SSE ops leave them alone)
+
+ mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ pxor xp1, xs1 ;p1 ^= s1[0] - source 0 is already loaded
+ pxor xq1, xs1 ;q1 ^= 1 * s1[0]
+ pxor xp2, xs2 ;p2 ^= s2[0]
+ pxor xq2, xs2 ;q2 ^= 1 * s2[0]
+ XSTR [ptr+pos], xp1 ;Write parity P1 vector
+ XSTR [ptr+pos+16], xp2 ;Write parity P2 vector
+ XSTR [tmp+pos], xq1 ;Write parity Q1 vector
+ XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector
+ add pos, 32
+ cmp pos, len ;len was reduced by 32, so <= means a full 32-byte block remains
+ jle loop32
+
+ ;; ------------------------------
+ ;; Do last 16 Bytes remaining
+ add len, 32 ;Restore the true length
+ cmp pos, len
+ je return_pass
+
+loop16:
+ mov ptr, [arg2+vec*PS] ;Fetch last source pointer
+ mov tmp, vec ;Set tmp to point back to last vector
+ XLDR xs1, [ptr+pos] ;Preload last vector (source)
+ pxor xp1, xp1 ;p = 0
+ pxor xq1, xq1 ;q = 0
+
+next_vect16:
+ sub tmp, 1 ;Inner loop for each source vector; sets the flags tested by jg below
+ mov ptr, [arg2+tmp*PS] ; get pointer to next vect
+ pxor xq1, xs1 ; q1 ^= s1
+ pxor xtmp1, xtmp1 ; xtmp = 0
+ pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set
+ pand xtmp1, xpoly ; xtmp = poly or 0x00
+ pxor xp1, xs1 ; p ^= s
+ paddb xq1, xq1 ; q = q<<1
+ pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked
+ XLDR xs1, [ptr+pos] ; Get next vector (source data)
+ jg next_vect16 ; Loop for each vect except 0
+
+ mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector
+ mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+ pxor xp1, xs1 ;p ^= s[0] - source 0 is already loaded
+ pxor xq1, xs1 ;q ^= 1 * s[0]
+ XSTR [ptr+pos], xp1 ;Write parity P vector
+ XSTR [tmp+pos], xq1 ;Write parity Q vector
+ add pos, 16
+ cmp pos, len
+ jl loop16
+
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+;;; Sixteen bytes of 0x1d.  64-bit builds load it into xpoly once; 32-bit
+;;; builds reference it directly as the pand operand.  It is XORed into every
+;;; Q byte whose top bit was set before the byte was doubled.
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver, snum
+slversion pq_gen_sse, 00, 08, 0032
diff --git a/src/isa-l/raid/pq_gen_test.c b/src/isa-l/raid/pq_gen_test.c
new file mode 100644
index 000000000..3469f7e50
--- /dev/null
+++ b/src/isa-l/raid/pq_gen_test.c
@@ -0,0 +1,194 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include<limits.h>
+#include "raid.h"
+#include "types.h"
+
+// Maximum number of source vectors exercised (P and Q are two more)
+#define TEST_SOURCES 16
+// Size in bytes of every allocated vector
+#define TEST_LEN 1024
+// Combined size of all vectors
+#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN))
+// Seed passed to srand(); can be overridden at build time
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Fill the first buffer_size bytes of buf with values drawn from rand(),
+// one call per byte.  A buffer_size of zero or less writes nothing.
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+	long remaining = buffer_size;
+
+	while (remaining-- > 0)
+		*buf++ = rand();
+}
+
+// Print len bytes of buf in hex, sixteen per line, followed by one closing
+// newline.  Always returns 0.
+int dump(unsigned char *buf, int len)
+{
+	int idx;
+
+	for (idx = 0; idx < len; idx++) {
+		printf(" %2x", buf[idx]);
+		if ((idx + 1) % 16 == 0)	// end of a 16-byte row
+			printf("\n");
+	}
+	printf("\n");
+	return 0;
+}
+
+// Functional test for pq_gen.  Each stage generates P and Q with pq_gen and
+// validates them with pq_check_base:
+//   1. all-zero sources must give all-zero P and Q
+//   2. random data with the full set of vectors
+//   3. random data with every vector count from 4 to TEST_SOURCES + 2
+//   4. every vector count combined with lengths 0..TEST_LEN in steps of 32
+//   5. buffers offset so that they finish at the end of their allocation
+// Prints a '.' per passing step; returns 0 on success and 1 on the first
+// failure or on allocation error.
+int main(int argc, char *argv[])
+{
+	const int nvects = TEST_SOURCES + 2;	// sources plus the P and Q destinations
+	void *buffs[TEST_SOURCES + 2];	// Pointers to src and dest
+	char *tmp_buf[TEST_SOURCES + 2];	// Offset views for the end-of-buffer stage
+	const char *p_out, *q_out;
+	int fail = 0;
+	int ret, v, n, len, off;
+
+	printf("Test pq_gen_test ");
+
+	srand(TEST_SEED);
+
+	// Allocate the arrays
+	for (v = 0; v < nvects; v++) {
+		void *mem;
+
+		ret = posix_memalign(&mem, 32, TEST_LEN);
+		if (ret != 0) {
+			printf("alloc error: Fail");
+			return 1;
+		}
+		buffs[v] = mem;
+	}
+
+	// Stage 1: all zeros in, all zeros expected out
+	for (v = 0; v < nvects; v++)
+		memset(buffs[v], 0, TEST_LEN);
+
+	pq_gen(nvects, TEST_LEN, buffs);
+
+	p_out = buffs[TEST_SOURCES];
+	q_out = buffs[TEST_SOURCES + 1];
+	for (n = 0; n < TEST_LEN; n++)
+		fail += (p_out[n] != 0) + (q_out[n] != 0);	// one count per bad byte
+
+	if (fail > 0) {
+		printf("fail zero test %d\n", fail);
+		return 1;
+	}
+	putchar('.');
+
+	// Stage 2: random data, full vector count
+	for (v = 0; v < nvects; v++)
+		rand_buffer(buffs[v], TEST_LEN);
+
+	ret = pq_gen(nvects, TEST_LEN, buffs);
+	fail |= pq_check_base(nvects, TEST_LEN, buffs);
+
+	if (fail > 0) {
+		printf(" Fail rand test1 fail=%d, ret=%d\n", fail, ret);
+		for (v = 0; v < nvects; v++)
+			dump(buffs[v], 15);
+
+		// Show what the reference implementation produces for comparison
+		printf(" reference function p,q\n");
+		pq_gen_base(nvects, TEST_LEN, buffs);
+		for (v = TEST_SOURCES; v < nvects; v++)
+			dump(buffs[v], 15);
+
+		return 1;
+	}
+	putchar('.');
+
+	// Stage 3: various number of sources
+	for (n = 4; n <= nvects; n++) {
+		for (v = 0; v < n; v++)
+			rand_buffer(buffs[v], TEST_LEN);
+
+		pq_gen(n, TEST_LEN, buffs);
+		fail |= pq_check_base(n, TEST_LEN, buffs);
+
+		if (fail > 0) {
+			printf("fail rand test %d sources\n", n);
+			return 1;
+		}
+		putchar('.');
+	}
+
+	fflush(0);
+
+	// Stage 4: various number of sources and len
+	for (len = 0; len <= TEST_LEN; len += 32) {
+		for (n = 4; n <= nvects; n++) {
+			for (v = 0; v < n; v++)
+				rand_buffer(buffs[v], len);
+
+			ret = pq_gen(n, len, buffs);
+			fail |= pq_check_base(n, len, buffs);
+
+			if (fail > 0) {
+				printf("fail rand test %d sources, len=%d, fail="
+				       "%d, ret=%d\n", n, len, fail, ret);
+				return 1;
+			}
+		}
+		putchar('.');
+	}
+
+	// Stage 5: vectors that finish at the end of their buffer
+	for (off = 0; off <= TEST_LEN; off += 32) {
+		len = TEST_LEN - off;
+
+		for (v = 0; v < nvects; v++) {
+			rand_buffer(buffs[v], len);
+			tmp_buf[v] = (char *)buffs[v] + off;
+		}
+
+		ret = pq_gen(nvects, len, (void *)tmp_buf);
+		fail |= pq_check_base(nvects, len, (void *)tmp_buf);
+
+		if (fail > 0) {
+			printf("fail end test - offset: %d, len: %d, fail: %d, "
+			       "ret: %d\n", off, len, fail, ret);
+			return 1;
+		}
+
+		putchar('.');
+		fflush(0);
+	}
+
+	if (!fail)
+		printf(" done: Pass\n");
+
+	return fail;
+}
diff --git a/src/isa-l/raid/raid_base.c b/src/isa-l/raid/raid_base.c
new file mode 100644
index 000000000..e066eb851
--- /dev/null
+++ b/src/isa-l/raid/raid_base.c
@@ -0,0 +1,147 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <limits.h>
+#include <stdint.h>
+
+// Byte-replicated masks as wide as unsigned long.  ULONG_MAX / 0xff has 0x01
+// in every byte, so multiplying by a byte value copies it into every byte.
+// Deriving the width from ULONG_MAX keeps the masks matched to the type the
+// word loop below really uses, whatever the platform defines.
+#define notbit0 (ULONG_MAX / 0xff * 0xfe)	// clears bit 0 of every byte
+#define bit7 (ULONG_MAX / 0xff * 0x80)	// selects bit 7 of every byte
+#define gf8poly (ULONG_MAX / 0xff * 0x1d)	// reduction constant in every byte
+
+/**
+ * Reference P+Q parity generation.
+ *
+ * @param vects  total number of pointers in array: the sources followed by
+ *               the P destination and then the Q destination
+ * @param len    number of bytes in each vector
+ * @param array  array of vects pointers
+ * @return 0 on success, 1 if vects < 3 (no room for a source plus P and Q)
+ *
+ * Sources are combined from the last one down to source 0: P is the XOR of
+ * all sources, and Q is doubled byte-wise before each further source is
+ * XORed in.  Whole unsigned long words are processed first and any trailing
+ * bytes are handled one at a time, so every byte below len is written.
+ * A len of zero or less writes nothing.
+ * NOTE(review): the word loop reads the vectors through unsigned long
+ * pointers, so it assumes they are suitably aligned - confirm with callers.
+ */
+int pq_gen_base(int vects, int len, void **array)
+{
+	int i, j;
+	unsigned long p, q, s;
+	unsigned long **src = (unsigned long **)array;
+	unsigned char **bsrc = (unsigned char **)array;
+	int blocks = (len > 0) ? len / (int)sizeof(long) : 0;
+
+	if (vects < 3)
+		return 1;	// would index before the start of array
+
+	for (i = 0; i < blocks; i++) {
+		q = p = src[vects - 3][i];
+
+		for (j = vects - 4; j >= 0; j--) {
+			p ^= s = src[j][i];
+			q = s ^ (((q << 1) & notbit0) ^	// shift each byte
+				 ((((q & bit7) << 1) - ((q & bit7) >> 7))	// mask out bytes
+				  & gf8poly));	// apply poly
+		}
+
+		src[vects - 2][i] = p;	// second to last pointer is p
+		src[vects - 1][i] = q;	// last pointer is q
+	}
+
+	// Bytes left over after the last whole word, same recurrence per byte
+	for (i = blocks * (int)sizeof(long); i < len; i++) {
+		unsigned char pb, qb, sb;
+
+		qb = pb = bsrc[vects - 3][i];
+
+		for (j = vects - 4; j >= 0; j--) {
+			sb = bsrc[j][i];
+			pb ^= sb;
+			qb = sb ^ ((qb << 1) ^ ((qb & 0x80) ? 0x1d : 0));
+		}
+
+		bsrc[vects - 2][i] = pb;
+		bsrc[vects - 1][i] = qb;
+	}
+	return 0;
+}
+
+/**
+ * Reference P+Q parity check, one byte at a time.
+ *
+ * @param vects  total number of pointers in array: the sources followed by
+ *               the P vector and then the Q vector
+ * @param len    number of bytes in each vector
+ * @param array  array of vects pointers
+ * @return 0 when the stored P and Q match the values recomputed from the
+ *         sources; non-zero otherwise.  On a mismatch at byte i the value is
+ *         i | 1 when P is wrong and i | 2 when only Q is wrong.  Returns 1
+ *         when vects < 3, since there is no source plus P and Q to check.
+ */
+int pq_check_base(int vects, int len, void **array)
+{
+	int i, j;
+	unsigned char p, q, s;
+	unsigned char **src = (unsigned char **)array;
+
+	if (vects < 3)
+		return 1;	// would index before the start of array
+
+	for (i = 0; i < len; i++) {
+		q = p = src[vects - 3][i];
+
+		for (j = vects - 4; j >= 0; j--) {
+			s = src[j][i];
+			p ^= s;
+
+			// double q: shift left, XOR in 0x1d when the top bit
+			// was set, then add the next source
+			q = s ^ ((q << 1) ^ ((q & 0x80) ? 0x1d : 0));
+		}
+
+		if (src[vects - 2][i] != p)	// second to last pointer is p
+			return i | 1;
+		if (src[vects - 1][i] != q)	// last pointer is q
+			return i | 2;
+	}
+	return 0;
+}
+
+/**
+ * Reference XOR parity generation, one byte at a time.
+ *
+ * @param vects  total number of pointers in array: the sources followed by
+ *               the destination
+ * @param len    number of bytes in each vector
+ * @param array  array of vects pointers
+ * @return always 0
+ *
+ * Every destination byte becomes the XOR of the matching byte of
+ * array[0] .. array[vects-2].
+ */
+int xor_gen_base(int vects, int len, void **array)
+{
+	unsigned char **vec = (unsigned char **)array;
+	int pos;
+
+	for (pos = 0; pos < len; pos++) {
+		unsigned char acc = vec[0][pos];
+		int v = 1;
+
+		while (v < vects - 1)
+			acc ^= vec[v++][pos];
+
+		vec[vects - 1][pos] = acc;	// last pointer is dest
+	}
+
+	return 0;
+}
+
+/**
+ * Reference XOR parity check, one byte at a time.
+ *
+ * @param vects  number of pointers in array (sources and parity together)
+ * @param len    number of bytes in each vector
+ * @param array  array of vects pointers
+ * @return 0 when the XOR across all vects vectors is zero at every byte
+ *         position; len as soon as any position XORs to a non-zero value
+ */
+int xor_check_base(int vects, int len, void **array)
+{
+	unsigned char **vec = (unsigned char **)array;
+	int pos, v;
+
+	for (pos = 0; pos < len; pos++) {
+		unsigned char acc = 0;
+
+		for (v = 0; v < vects; v++)
+			acc ^= vec[v][pos];
+
+		// The loop only runs when len > 0, so a mismatch always reports len
+		if (acc != 0)
+			return len;
+	}
+	return 0;
+}
+
+// Version record attached to each base function: a serial number, a version
+// and a core id.
+struct slver {
+	unsigned short snum;
+	unsigned char ver;
+	unsigned char core;
+};
+
+// For each function there are two objects: one whose name ends in the hex
+// digits of core, ver and snum, and one initialized with { snum, ver, core }.
+struct slver pq_gen_base_slver_0001012a;
+struct slver pq_gen_base_slver = { 0x012a, 0x01, 0x00 };
+
+struct slver xor_gen_base_slver_0001012b;
+struct slver xor_gen_base_slver = { 0x012b, 0x01, 0x00 };
+
+struct slver pq_check_base_slver_0001012c;
+struct slver pq_check_base_slver = { 0x012c, 0x01, 0x00 };
+
+struct slver xor_check_base_slver_0001012d;
+struct slver xor_check_base_slver = { 0x012d, 0x01, 0x00 };
diff --git a/src/isa-l/raid/raid_base_aliases.c b/src/isa-l/raid/raid_base_aliases.c
new file mode 100644
index 000000000..f81792a00
--- /dev/null
+++ b/src/isa-l/raid/raid_base_aliases.c
@@ -0,0 +1,50 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "raid.h"
+
+// Public pq_gen entry point for this build: forwards unchanged to the
+// portable pq_gen_base implementation and returns its result.
+int pq_gen(int vects, int len, void **array)
+{
+ return pq_gen_base(vects, len, array);
+}
+
+// Public pq_check entry point for this build: forwards unchanged to the
+// portable pq_check_base implementation and returns its result.
+int pq_check(int vects, int len, void **array)
+{
+ return pq_check_base(vects, len, array);
+}
+
+// Public xor_gen entry point for this build: forwards unchanged to the
+// portable xor_gen_base implementation and returns its result.
+int xor_gen(int vects, int len, void **array)
+{
+ return xor_gen_base(vects, len, array);
+}
+
+// Public xor_check entry point for this build: forwards unchanged to the
+// portable xor_check_base implementation and returns its result.
+int xor_check(int vects, int len, void **array)
+{
+ return xor_check_base(vects, len, array);
+}
diff --git a/src/isa-l/raid/raid_multibinary.asm b/src/isa-l/raid/raid_multibinary.asm
new file mode 100644
index 000000000..47ef1e369
--- /dev/null
+++ b/src/isa-l/raid/raid_multibinary.asm
@@ -0,0 +1,143 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+default rel
+[bits 64]
+
+; Candidate implementations for the four public RAID entry points.
+; None of them is defined in this file; they are resolved at link time.
+extern pq_gen_base
+extern pq_gen_sse
+extern pq_gen_avx
+extern pq_gen_avx2
+
+extern xor_gen_base
+extern xor_gen_sse
+extern xor_gen_avx
+
+extern pq_check_base
+extern pq_check_sse
+
+extern xor_check_base
+extern xor_check_sse
+
+; The AVX512 variants are only declared when HAVE_AS_KNOWS_AVX512 is defined.
+; NOTE(review): the mbin_dispatch_init6 lines below name these symbols
+; unconditionally; presumably the macro (multibinary.asm) drops that argument
+; when HAVE_AS_KNOWS_AVX512 is not defined - confirm.
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern xor_gen_avx512
+ extern pq_gen_avx512
+%endif
+
+; xor_gen and pq_gen get their public stubs and init code from the shared
+; multibinary macros; xor_check and pq_check are written out by hand below.
+mbin_interface xor_gen
+mbin_interface pq_gen
+
+
+; xor_gen passes xor_gen_avx in two consecutive slots: no xor_gen_avx2 is
+; declared in this file, whereas pq_gen supplies a distinct pq_gen_avx2.
+mbin_dispatch_init6 xor_gen, xor_gen_base, xor_gen_sse, xor_gen_avx, xor_gen_avx, xor_gen_avx512
+mbin_dispatch_init6 pq_gen, pq_gen_base, pq_gen_sse, pq_gen_avx, pq_gen_avx2, pq_gen_avx512
+
+section .data
+
+; Function pointers that the xor_check / pq_check stubs jump through.
+; Each initially holds the address of its *_mbinit routine; the matching
+; *_dispatch_init routine overwrites it with the selected implementation.
+xor_check_dispatched:
+ dq xor_check_mbinit
+pq_check_dispatched:
+ dq pq_check_mbinit
+
+section .text
+
+;;;;
+; pq_check multibinary function
+;;;;
+; Flow: pq_check always jumps through pq_check_dispatched. That pointer
+; initially holds pq_check_mbinit, so the first call lands there, runs
+; pq_check_dispatch_init (which rewrites the pointer), then falls through
+; into pq_check again and jumps to the implementation that was selected.
+mk_global pq_check, function
+pq_check_mbinit:
+ endbranch
+ call pq_check_dispatch_init
+pq_check:
+ endbranch
+ jmp qword [pq_check_dispatched]
+
+; Chooses between pq_check_base and pq_check_sse and stores the choice in
+; pq_check_dispatched. Every register it touches is saved and restored.
+pq_check_dispatch_init:
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ lea rsi, [pq_check_base WRT_OPT] ; Default
+
+ ; cpuid leaf 1 overwrites eax, ebx, ecx and edx
+ mov eax, 1
+ cpuid
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ ; lea leaves the flags alone, so cmovne still acts on the test above:
+ ; rsi is replaced only when the SSE4.1 feature bit is set
+ lea rbx, [pq_check_sse WRT_OPT]
+ cmovne rsi, rbx
+
+ mov [pq_check_dispatched], rsi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+ ret
+
+
+;;;;
+; xor_check multibinary function
+;;;;
+; Flow: xor_check always jumps through xor_check_dispatched. That pointer
+; initially holds xor_check_mbinit, so the first call lands there, runs
+; xor_check_dispatch_init (which rewrites the pointer), then falls through
+; into xor_check again and jumps to the implementation that was selected.
+mk_global xor_check, function
+xor_check_mbinit:
+ endbranch
+ call xor_check_dispatch_init
+xor_check:
+ endbranch
+ jmp qword [xor_check_dispatched]
+
+; Chooses between xor_check_base and xor_check_sse and stores the choice in
+; xor_check_dispatched. Every register it touches is saved and restored.
+xor_check_dispatch_init:
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ lea rsi, [xor_check_base WRT_OPT] ; Default
+
+ ; cpuid leaf 1 overwrites eax, ebx, ecx and edx
+ mov eax, 1
+ cpuid
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ ; lea leaves the flags alone, so cmovne still acts on the test above:
+ ; rsi is replaced only when the SSE4.1 feature bit is set
+ lea rbx, [xor_check_sse WRT_OPT]
+ cmovne rsi, rbx
+
+ mov [xor_check_dispatched], rsi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+ ret
+
+;;; Version records for the four public entry points.
+;;; slversion is not defined in this file (presumably it comes from one of
+;;; the %include'd files - confirm). Columns:
+;;; func core, ver, snum
+slversion xor_gen, 00, 03, 0126
+slversion xor_check, 00, 03, 0127
+slversion pq_gen, 00, 03, 0128
+slversion pq_check, 00, 03, 0129
diff --git a/src/isa-l/raid/raid_multibinary_i32.asm b/src/isa-l/raid/raid_multibinary_i32.asm
new file mode 100644
index 000000000..eee7fd5a1
--- /dev/null
+++ b/src/isa-l/raid/raid_multibinary_i32.asm
@@ -0,0 +1,52 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+[bits 32]
+
+; 32-bit build: only a base and an SSE implementation are declared for each
+; of the four entry points. None of them is defined in this file.
+extern xor_gen_base
+extern xor_gen_sse
+extern pq_gen_base
+extern pq_gen_sse
+extern xor_check_base
+extern xor_check_sse
+extern pq_check_base
+extern pq_check_sse
+
+mbin_interface xor_gen
+mbin_interface pq_gen
+mbin_interface xor_check
+mbin_interface pq_check
+
+; Every slot after the base one is given the same *_sse routine, so whatever
+; tier mbin_dispatch_init5 picks above base resolves to the SSE code.
+mbin_dispatch_init5 xor_gen, xor_gen_base, xor_gen_sse, xor_gen_sse, xor_gen_sse
+mbin_dispatch_init5 pq_gen, pq_gen_base, pq_gen_sse, pq_gen_sse, pq_gen_sse
+mbin_dispatch_init5 xor_check, xor_check_base, xor_check_sse, xor_check_sse, xor_check_sse
+mbin_dispatch_init5 pq_check, pq_check_base, pq_check_sse, pq_check_sse, pq_check_sse
diff --git a/src/isa-l/raid/xor_check_sse.asm b/src/isa-l/raid/xor_check_sse.asm
new file mode 100644
index 000000000..a5fe0b2e0
--- /dev/null
+++ b/src/isa-l/raid/xor_check_sse.asm
@@ -0,0 +1,285 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor parity check of N vectors using SSE
+;;; int xor_check_sse(int vects, int len, void **array)
+
+;;; Checks that the xor of all N (vects) vectors in the array of pointers
+;;; (**array) is zero. Returns 0 if the parity is consistent, 1 otherwise.
+;;; Vectors must be aligned to 16 bytes. Length can be any value.
+
+%include "reg_sizes.asm"
+
+;;; Per-output-format register assignment and prologue/epilogue macros.
+;;; Note that tmp2 and return name the same register in every format: tmp2
+;;; is used as the running xor in the scalar loops, and the return value is
+;;; only written on the exit paths.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 rax
+ %define tmp2.b al
+ %define tmp3 arg4
+ %define return rax
+ %define PS 8
+ %define func(x) x: endbranch
+ ; No prologue/epilogue work is done for elf64: both macros expand to nothing
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define return rax
+ %define tmp2 rax
+ %define tmp2.b al
+ %define PS 8
+ %define tmp r11
+ %define tmp3 r10
+ %define stack_size 2*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ ; xmm6 and xmm7 are spilled here and reloaded in FUNC_RESTORE; the
+ ; 128-byte loop of the function uses xmm0-xmm7
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ ; arg0 stays in its stack slot; arg1 and arg2 are loaded into registers
+ ; by FUNC_SAVE below
+ %define arg0 arg(0)
+ %define arg1 ecx
+ %define tmp2 eax
+ %define tmp2.b al
+ %define tmp3 edx
+ %define return eax
+ %define PS 4
+ %define func(x) x: endbranch
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2 edi ; must save/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ ; Sets up an ebp frame and preserves esi, edi and ebx, which are used
+ ; as arg3, arg2 and tmp respectively
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ mov arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp ;if has frame pointer
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
+
+;;; Symbolic names used by the function body
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos tmp3
+
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+%endif
+
+;;; Use Non-temporal load/stor
+;;; (XSTR is defined here but not referenced by the code in this file)
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+;;; xor_check_sse(vects, len, array)
+;;; Xors all `vects` vectors together and returns 0 when the result is zero
+;;; over `len` bytes (or when len == 0). Returns 1 on the first non-zero
+;;; result found, or when vects < 2.
+;;; Lengths that are not a multiple of 128 are reduced from the END of the
+;;; buffers, first one byte and then PS bytes at a time, until the remaining
+;;; length is a multiple of 128; the remainder is handled by the SSE loop.
+align 16
+mk_global xor_check_sse, function
+func(xor_check_sse)
+ FUNC_SAVE
+%ifidn PS,8 ;64-bit code
+ sub vec, 1 ; Keep as index of the last vector
+%else ;32-bit code
+ mov tmp, arg(0) ; Update vec length arg to index of last vector
+ sub tmp, 1
+ mov arg(0), tmp ; mov does not touch flags; jng below uses the sub
+%endif
+
+ jng return_fail ;Must have at least 2 vectors
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;Check alignment of length
+ jnz len_not_aligned
+
+
+len_aligned_128bytes:
+ sub len, 128 ;len now holds the offset of the last 128-byte block
+ mov pos, 0
+ mov tmp, vec ;Preset to last vector
+
+loop128:
+ mov tmp2, [arg2+tmp*PS] ;Fetch last pointer in array
+ sub tmp, 1 ;Next vect
+ XLDR xmm0, [tmp2+pos] ;Start with end of array in last vector
+ XLDR xmm1, [tmp2+pos+16] ;Keep xor parity in xmm0-7
+ XLDR xmm2, [tmp2+pos+(2*16)]
+ XLDR xmm3, [tmp2+pos+(3*16)]
+ XLDR xmm4, [tmp2+pos+(4*16)]
+ XLDR xmm5, [tmp2+pos+(5*16)]
+ XLDR xmm6, [tmp2+pos+(6*16)]
+ XLDR xmm7, [tmp2+pos+(7*16)]
+
+next_vect:
+ mov ptr, [arg2+tmp*PS]
+ sub tmp, 1 ;Sets the flags tested by jge at the end of the loop
+ xorpd xmm0, [ptr+pos] ;Get next vector (source)
+ xorpd xmm1, [ptr+pos+16]
+ xorpd xmm2, [ptr+pos+(2*16)]
+ xorpd xmm3, [ptr+pos+(3*16)]
+ xorpd xmm4, [ptr+pos+(4*16)]
+ xorpd xmm5, [ptr+pos+(5*16)]
+ xorpd xmm6, [ptr+pos+(6*16)]
+ xorpd xmm7, [ptr+pos+(7*16)]
+;;; prefetch [ptr+pos+(8*16)]
+ jge next_vect ;Loop for each vect (xorpd leaves the flags intact)
+
+ ;; End of vects, check that all parity regs = 0
+ mov tmp, vec ;Back to last vector
+ por xmm0, xmm1 ;Fold the eight accumulators into xmm0
+ por xmm0, xmm2
+ por xmm0, xmm3
+ por xmm0, xmm4
+ por xmm0, xmm5
+ por xmm0, xmm6
+ por xmm0, xmm7
+ ptest xmm0, xmm0 ;ZF is set only if every bit of xmm0 is zero
+ jnz return_fail
+
+ add pos, 128
+ cmp pos, len
+ jle loop128
+
+return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+
+
+;;; Do one byte at a time for no alignment case
+;;; (despite the label name, this path only verifies parity; nothing is
+;;; written to any vector)
+
+xor_gen_byte:
+ mov tmp, vec ;Preset to last vector
+
+loop_1byte:
+ mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
+ mov tmp2.b, [ptr+len-1] ;Get array n
+ sub tmp, 1
+nextvect_1byte:
+ mov ptr, [arg2+tmp*PS]
+ xor tmp2.b, [ptr+len-1]
+ sub tmp, 1
+ jge nextvect_1byte
+
+ mov tmp, vec ;Back to last vector
+ cmp tmp2.b, 0
+ jne return_fail
+ sub len, 1
+ test len, (8-1) ;Keep going until len is a multiple of 8
+ jnz loop_1byte
+
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;If not 0 and a multiple of 128 bytes
+ jz len_aligned_128bytes ; then do aligned case. len = y * 128
+
+ ;; else we are 8-byte aligned so fall through to recheck
+
+
+ ;; Unaligned length cases
+len_not_aligned:
+ test len, (PS-1)
+ jne xor_gen_byte
+ mov tmp3, len
+ and tmp3, (128-1) ;Do the unaligned bytes 4-8 at a time
+ mov tmp, vec ;Preset to last vector
+
+ ;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes
+loopN_bytes:
+ mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
+ mov tmp2, [ptr+len-PS] ;Get array n
+ sub tmp, 1
+nextvect_Nbytes:
+ mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
+ xor tmp2, [ptr+len-PS]
+ sub tmp, 1
+ jge nextvect_Nbytes ;Loop for each source
+
+ mov tmp, vec ;Back to last vector
+ cmp tmp2, 0
+ jne return_fail
+ sub len, PS
+ sub tmp3, PS
+ jg loopN_bytes
+
+ cmp len, 128 ;Now len is aligned to 128B
+ jge len_aligned_128bytes ;We can do the rest aligned
+
+ cmp len, 0
+ je return_pass
+ ;; Anything else (len neither >= 128 nor 0) falls through to the failure exit
+
+return_fail:
+ mov return, 1 ;FUNC_RESTORE below does not touch the return register
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+;;; func core, ver, snum
+slversion xor_check_sse, 00, 03, 0031
+
diff --git a/src/isa-l/raid/xor_check_test.c b/src/isa-l/raid/xor_check_test.c
new file mode 100644
index 000000000..c7532076f
--- /dev/null
+++ b/src/isa-l/raid/xor_check_test.c
@@ -0,0 +1,280 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+// Test dimensions: TEST_SOURCES + 1 buffers of TEST_LEN bytes each are
+// allocated by main().
+#define TEST_SOURCES 16
+#define TEST_LEN 1024
+// Total bytes across all buffers; not referenced elsewhere in this file.
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))
+// Seed passed to srand(); may be overridden at compile time.
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Fill the first buffer_size bytes of buf with successive rand() values,
+// each truncated to an unsigned char. Nothing is written when
+// buffer_size is zero or negative.
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+    unsigned char *cursor = buf;
+    long remaining = buffer_size;
+
+    while (remaining > 0) {
+        *cursor = rand();
+        cursor++;
+        remaining--;
+    }
+}
+
+// Functional test for xor_check().
+// Parity is always produced with xor_gen_base() and then verified with
+// xor_check(): a return of 0 is treated as "parity consistent", non-zero as
+// "mismatch". Each phase checks that clean data passes and that flipping
+// data in any buffer is detected.
+// Returns 0 when every phase passes. Most phases return 1 on their first
+// failure (so the fail++ just before those returns has no effect); only the
+// early one-shot checks accumulate into the final `return fail`.
+// NOTE: the buffers obtained from posix_memalign() are not freed before exit.
+int main(int argc, char *argv[])
+{
+ int i, j, k, ret, fail = 0;
+ void *buffs[TEST_SOURCES + 1];
+ char c;
+ int serr, lerr;
+ char *tmp_buf[TEST_SOURCES + 1];
+
+ printf("Test xor_check_test %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ // Allocate the arrays
+ // (16-byte aligned, TEST_LEN bytes each)
+ for (i = 0; i < TEST_SOURCES + 1; i++) {
+ void *buf;
+ if (posix_memalign(&buf, 16, TEST_LEN)) {
+ printf("alloc error: Fail");
+ return 1;
+ }
+ buffs[i] = buf;
+ }
+
+ // Test of all zeros
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ memset(buffs[i], 0, TEST_LEN);
+
+ xor_gen_base(TEST_SOURCES + 1, TEST_LEN, buffs);
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret != 0) {
+ fail++;
+ printf("\nfail zero test %d\n", ret);
+ }
+
+ ((char *)(buffs[0]))[TEST_LEN - 2] = 0x7; // corrupt buffer
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nfail corrupt buffer test %d\n", ret);
+ }
+ ((char *)(buffs[0]))[TEST_LEN - 2] = 0; // un-corrupt buffer
+
+ // Test corrupted buffer any location on all sources
+ // (buffers are still all-zero here, so writing 0x5 is always a change)
+ for (j = 0; j < TEST_SOURCES + 1; j++) {
+ for (i = TEST_LEN - 1; i >= 0; i--) {
+ ((char *)buffs[j])[i] = 0x5; // corrupt buffer
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i);
+ return 1;
+ }
+ ((char *)buffs[j])[i] = 0; // un-corrupt buffer
+ }
+ putchar('.');
+ }
+
+ // Test rand1
+ for (i = 0; i < TEST_SOURCES + 1; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ xor_gen_base(TEST_SOURCES + 1, TEST_LEN, buffs);
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret != 0) {
+ fail++;
+ printf("fail first rand test %d\n", ret);
+ }
+
+ // Flip a single bit and expect the check to fail
+ c = ((char *)(buffs[0]))[TEST_LEN - 2];
+ ((char *)(buffs[0]))[TEST_LEN - 2] = c ^ 0x1;
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret == 0) {
+ fail++;
+ printf("\nFail corrupt buffer test, passed when should have failed\n");
+ }
+ ((char *)(buffs[0]))[TEST_LEN - 2] = c; // un-corrupt buffer
+
+ // Test corrupted buffer any location on all sources w/ random data
+ for (j = 0; j < TEST_SOURCES + 1; j++) {
+ for (i = TEST_LEN - 1; i >= 0; i--) {
+ // Check it still passes
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret != 0) { // should pass
+ fail++;
+ printf
+ ("\nFail rand test with un-corrupted buffer j=%d, i=%d\n",
+ j, i);
+ return 1;
+ }
+ c = ((char *)buffs[j])[i];
+ ((char *)buffs[j])[i] = c ^ 1; // corrupt buffer
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+ if (ret == 0) { // Check it now fails
+ fail++;
+ printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i);
+ return 1;
+ }
+ ((char *)buffs[j])[i] = c; // un-corrupt buffer
+ }
+ putchar('.');
+ }
+
+ // Test various number of sources, full length
+ // (j is the total vector count handed to xor_gen_base/xor_check)
+ for (j = 3; j <= TEST_SOURCES + 1; j++) {
+ // New random data
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], TEST_LEN);
+
+ // Generate xor parity for this number of sources
+ xor_gen_base(j, TEST_LEN, buffs);
+
+ // Set errors up in each source and len position
+ for (i = 0; i < j; i++) {
+ for (k = 0; k < TEST_LEN; k++) {
+ // See if it still passes
+ ret = xor_check(j, TEST_LEN, buffs);
+ if (ret != 0) { // Should pass
+ printf("\nfail rand test %d sources\n", j);
+ fail++;
+ return 1;
+ }
+
+ c = ((char *)buffs[i])[k];
+ ((char *)buffs[i])[k] = c ^ 1; // corrupt buffer
+
+ ret = xor_check(j, TEST_LEN, buffs);
+ if (ret == 0) { // Should fail
+ printf
+ ("\nfail rand test corrupted buffer %d sources\n",
+ j);
+ fail++;
+ return 1;
+ }
+ ((char *)buffs[i])[k] = c; // un-corrupt buffer
+ }
+ }
+ putchar('.');
+ }
+
+ fflush(0);
+
+ // Test various number of sources and len
+ // (k is the length in bytes, stepped through every value 1..TEST_LEN)
+ k = 1;
+ while (k <= TEST_LEN) {
+ for (j = 3; j <= TEST_SOURCES + 1; j++) {
+ for (i = 0; i < j; i++)
+ rand_buffer(buffs[i], k);
+
+ // Generate xor parity for this number of sources
+ xor_gen_base(j, k, buffs);
+
+ // Inject errors at various source and len positions
+ // (every 10th byte offset, in each of the j vectors)
+ for (lerr = 0; lerr < k; lerr += 10) {
+ for (serr = 0; serr < j; serr++) {
+
+ // See if it still passes
+ ret = xor_check(j, k, buffs);
+ if (ret != 0) { // Should pass
+ printf("\nfail rand test %d sources\n", j);
+ fail++;
+ return 1;
+ }
+
+ c = ((char *)buffs[serr])[lerr];
+ ((char *)buffs[serr])[lerr] = c ^ 1; // corrupt buffer
+
+ ret = xor_check(j, k, buffs);
+ if (ret == 0) { // Should fail
+ printf("\nfail rand test corrupted buffer "
+ "%d sources, len=%d, ret=%d\n", j, k,
+ ret);
+ fail++;
+ return 1;
+ }
+ ((char *)buffs[serr])[lerr] = c; // un-corrupt buffer
+ }
+ }
+ }
+ putchar('.');
+ fflush(0);
+ k += 1;
+ }
+
+ // Test at the end of buffer
+ // Each tmp_buf[j] starts i bytes into buffs[j] and is used with length
+ // TEST_LEN - i, so the tested region ends exactly at the end of the
+ // allocation. i advances in steps of 32.
+ for (i = 0; i < TEST_LEN; i += 32) {
+ for (j = 0; j < TEST_SOURCES + 1; j++) {
+ rand_buffer(buffs[j], TEST_LEN - i);
+ tmp_buf[j] = (char *)buffs[j] + i;
+ }
+
+ xor_gen_base(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+
+ // Test good data
+ ret = xor_check(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+ if (ret != 0) {
+ printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i);
+ fail++;
+ return 1;
+ }
+ // Test bad data
+ for (serr = 0; serr < TEST_SOURCES + 1; serr++) {
+ for (lerr = 0; lerr < (TEST_LEN - i); lerr++) {
+ c = tmp_buf[serr][lerr];
+ tmp_buf[serr][lerr] = c ^ 1;
+
+ ret =
+ xor_check(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+ if (ret == 0) {
+ printf("fail end test corrupted buffer - "
+ "offset: %d, len: %d, ret: %d\n", i,
+ TEST_LEN - i, ret);
+ fail++;
+ return 1;
+ }
+
+ tmp_buf[serr][lerr] = c;
+ }
+ }
+
+ putchar('.');
+ fflush(0);
+ }
+
+ if (fail == 0)
+ printf("Pass\n");
+
+ return fail;
+
+}
diff --git a/src/isa-l/raid/xor_example.c b/src/isa-l/raid/xor_example.c
new file mode 100644
index 000000000..48145ac90
--- /dev/null
+++ b/src/isa-l/raid/xor_example.c
@@ -0,0 +1,70 @@
+/**********************************************************************
+ Copyright(c) 2011-2013 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+/* Number of source buffers; one extra buffer holds the xor parity. */
+#define TEST_SOURCES 16
+/* Bytes per buffer. Parenthesised so the macro expands as a single value
+ * when it appears next to /, %, or a unary operator. */
+#define TEST_LEN (16*1024)
+
+/*
+ * XOR parity example.
+ *
+ * Allocates TEST_SOURCES + 1 buffers of TEST_LEN bytes, fills them with
+ * rand() data, generates parity with xor_gen(), verifies it with
+ * xor_check(), then flips one bit and verifies that xor_check() reports
+ * the mismatch.
+ *
+ * Returns 0 only when both checks behave as expected; returns 1 on an
+ * allocation failure or when either check gives the wrong answer. All
+ * buffers are released on every exit path.
+ */
+int main(int argc, char *argv[])
+{
+    int i, j, should_pass, should_fail;
+    int status = 0;
+    /* Zero-initialised so the cleanup loop can free() every slot safely */
+    void *buffs[TEST_SOURCES + 1] = { 0 };
+
+    printf("XOR example\n");
+    for (i = 0; i < TEST_SOURCES + 1; i++) {
+        void *buf;
+        if (posix_memalign(&buf, 32, TEST_LEN)) {
+            printf("alloc error: Fail");
+            status = 1;
+            goto cleanup;
+        }
+        buffs[i] = buf;
+    }
+
+    printf("Make random data\n");
+    for (i = 0; i < TEST_SOURCES + 1; i++)
+        for (j = 0; j < TEST_LEN; j++)
+            ((char *)buffs[i])[j] = rand();
+
+    printf("Generate xor parity\n");
+    xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+    printf("Check parity: ");
+    should_pass = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);
+    printf("%s\n", should_pass == 0 ? "Pass" : "Fail");
+    if (should_pass != 0)
+        status = 1;    /* clean data must verify */
+
+    printf("Find corruption: ");
+    ((char *)buffs[TEST_SOURCES / 2])[TEST_LEN / 2] ^= 1;    // flip one bit
+    should_fail = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs);    //recheck
+    printf("%s\n", should_fail != 0 ? "Pass" : "Fail");
+    if (should_fail == 0)
+        status = 1;    /* corruption must be detected */
+
+cleanup:
+    for (i = 0; i < TEST_SOURCES + 1; i++)
+        free(buffs[i]);    /* free(NULL) is a no-op for unallocated slots */
+
+    return status;
+}
diff --git a/src/isa-l/raid/xor_gen_avx.asm b/src/isa-l/raid/xor_gen_avx.asm
new file mode 100644
index 000000000..b5527b204
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_avx.asm
@@ -0,0 +1,228 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor of N source vectors using AVX
+;;; int xor_gen_avx(int vects, int len, void **array)
+
+;;; Generates xor parity vector from N (vects-1) sources in array of pointers
+;;; (**array). Last pointer is the dest.
+;;; Vectors must be aligned to 32 bytes. Length can be any value.
+
+%include "reg_sizes.asm"
+
+;;; Per-output-format register assignment and prologue/epilogue macros.
+;;; Only elf64 and win64 are handled; there is no 32-bit branch in this file.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define func(x) x: endbranch
+ %define return rax
+ ; No prologue/epilogue work is done for elf64: both macros expand to nothing
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define func(x) proc_frame x
+ %define return rax
+ %define stack_size 2*32 + 8 ;must be an odd multiple of 8
+
+ ; ymm6 and ymm7 are spilled here and reloaded in FUNC_RESTORE; the
+ ; 128-byte loop of the function uses ymm0-ymm7. Unaligned moves are used
+ ; for the spill slots.
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqu [rsp + 0*32], ymm6
+ vmovdqu [rsp + 1*32], ymm7
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ vmovdqu ymm6, [rsp + 0*32]
+ vmovdqu ymm7, [rsp + 1*32]
+ add rsp, stack_size
+ %endmacro
+
+%endif ;output formats
+
+
+;;; Symbolic names used by the function body.
+;;; tmp2 and return are both rax: tmp2 is the running xor in the scalar
+;;; loops, and the return value is only written on the exit paths.
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define tmp2 rax
+%define tmp2.b al
+%define pos tmp3
+%define PS 8
+
+;;; Use Non-temporal load/stor
+;;; (both branches define XLDR as vmovdqa; only the store, XSTR, differs)
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovdqa
+ %define XSTR vmovntdq
+%endif
+
+
+default rel
+[bits 64]
+
+section .text
+
+;;; xor_gen_avx(vects, len, array)
+;;; Xors array[0] .. array[vects-2] together and writes the result to
+;;; array[vects-1]. Returns 0 on success (including len == 0) and 1 when
+;;; vects < 3, i.e. fewer than two sources plus the destination.
+;;; Lengths that are not a multiple of 128 are reduced from the END of the
+;;; buffers, first one byte and then 8 bytes at a time, until the remaining
+;;; length is a multiple of 128; the remainder is handled by the AVX loop.
+align 16
+mk_global xor_gen_avx, function
+func(xor_gen_avx)
+
+ FUNC_SAVE
+ sub vec, 2 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;Check alignment of length
+ jnz len_not_aligned
+
+
+len_aligned_128bytes:
+ sub len, 128 ;len now holds the offset of the last 128-byte block
+ mov pos, 0
+
+loop128:
+ mov tmp, vec ;Back to last vector
+ mov tmp2, [arg2+vec*PS] ;Fetch last pointer in array
+ sub tmp, 1 ;Next vect
+ XLDR ymm0, [tmp2+pos] ;Start with end of array in last vector
+ XLDR ymm1, [tmp2+pos+32] ;Keep xor parity in ymm0-3
+ XLDR ymm2, [tmp2+pos+(2*32)]
+ XLDR ymm3, [tmp2+pos+(3*32)]
+
+next_vect:
+ mov ptr, [arg2+tmp*PS]
+ sub tmp, 1 ;Sets the flags tested by jge at the end of the loop
+ XLDR ymm4, [ptr+pos] ;Get next vector (source)
+ XLDR ymm5, [ptr+pos+32]
+ XLDR ymm6, [ptr+pos+(2*32)]
+ XLDR ymm7, [ptr+pos+(3*32)]
+ vxorpd ymm0, ymm0, ymm4 ;Add to xor parity
+ vxorpd ymm1, ymm1, ymm5
+ vxorpd ymm2, ymm2, ymm6
+ vxorpd ymm3, ymm3, ymm7
+ jge next_vect ;Loop for each source (AVX ops leave the flags intact)
+
+ mov ptr, [arg2+PS+vec*PS] ;Address of parity vector
+ XSTR [ptr+pos], ymm0 ;Write parity xor vector
+ XSTR [ptr+pos+(1*32)], ymm1
+ XSTR [ptr+pos+(2*32)], ymm2
+ XSTR [ptr+pos+(3*32)], ymm3
+ add pos, 128
+ cmp pos, len
+ jle loop128
+
+return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+
+;;; Do one byte at a time for no alignment case
+loop_1byte:
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+vec*PS] ;Fetch last pointer in array
+ mov tmp2.b, [ptr+len-1] ;Get array n
+ sub tmp, 1
+nextvect_1byte:
+ mov ptr, [arg2+tmp*PS]
+ xor tmp2.b, [ptr+len-1]
+ sub tmp, 1
+ jge nextvect_1byte
+
+ mov tmp, vec
+ add tmp, 1 ;Add back to point to last vec
+ mov ptr, [arg2+tmp*PS]
+ mov [ptr+len-1], tmp2.b ;Write parity
+ sub len, 1
+ test len, (PS-1) ;Keep going until len is a multiple of 8
+ jnz loop_1byte
+
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;If not 0 and a multiple of 128 bytes
+ jz len_aligned_128bytes ; then do aligned case. len = y * 128
+
+ ;; else we are 8-byte aligned so fall through to recheck
+
+
+ ;; Unaligned length cases
+len_not_aligned:
+ test len, (PS-1)
+ jne loop_1byte
+ mov tmp3, len
+ and tmp3, (128-1) ;Do the unaligned bytes 8 at a time
+
+ ;; Run backwards 8 bytes at a time for (tmp3) bytes
+loop8_bytes:
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+vec*PS] ;Fetch last pointer in array
+ mov tmp2, [ptr+len-PS] ;Get array n
+ sub tmp, 1
+nextvect_8bytes:
+ mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
+ xor tmp2, [ptr+len-PS]
+ sub tmp, 1
+ jge nextvect_8bytes ;Loop for each source
+
+ mov tmp, vec
+ add tmp, 1 ;Add back to point to last vec
+ mov ptr, [arg2+tmp*PS]
+ mov [ptr+len-PS], tmp2 ;Write parity
+ sub len, PS
+ sub tmp3, PS
+ jg loop8_bytes
+
+ cmp len, 128 ;Now len is aligned to 128B
+ jge len_aligned_128bytes ;We can do the rest aligned
+
+ cmp len, 0
+ je return_pass
+ ;; Anything else (len neither >= 128 nor 0) falls through to the failure exit
+
+return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+section .data
+
+;;; func core, ver, snum
+slversion xor_gen_avx, 02, 05, 0037
+
diff --git a/src/isa-l/raid/xor_gen_avx512.asm b/src/isa-l/raid/xor_gen_avx512.asm
new file mode 100644
index 000000000..5b078682a
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_avx512.asm
@@ -0,0 +1,217 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor of N source vectors using AVX512
+;;; int xor_gen_avx512(int vects, int len, void **array)
+
+;;; Generates xor parity vector from N (vects-1) sources in array of pointers
+;;; (**array). Last pointer is the dest.
+;;; Vectors must be aligned to 32 bytes. Length can be any value.
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ;; elf64: arguments are taken from rdi, rsi, rdx. No registers need to be
+ ;; preserved here, so FUNC_SAVE and FUNC_RESTORE expand to nothing.
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp3 arg4
+ %define func(x) x: endbranch
+ %define return rax
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ ;; win64: arguments are taken from rcx, rdx, r8. xmm6 and xmm7 are spilled
+ ;; to a local stack frame by FUNC_SAVE and reloaded by FUNC_RESTORE.
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define tmp r11
+ %define tmp3 r10
+ %define func(x) proc_frame x
+ %define return rax
+ %define stack_size 2*16 + 8 ;must be an odd multiple of 8
+
+ ;; Frame layout: [rsp + 0*16] = xmm6, [rsp + 1*16] = xmm7
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm7
+ end_prolog
+ %endmacro
+ ;; Must read back exactly the slots FUNC_SAVE wrote; the xmm7 offset was
+ ;; previously 1*316, which lies outside the stack_size-byte frame.
+ %macro FUNC_RESTORE 0
+ vmovdqu xmm6, [rsp + 0*16]
+ vmovdqu xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+%endif ;output formats
+
+
+;;; Working aliases used by the code below (same for both output formats)
+;;; vec  - vector count argument; after "sub vec, 2" it indexes the last source
+;;; len  - length argument in bytes
+;;; ptr  - scratch pointer to the vector currently being read or written
+;;; tmp2 - scratch; shares rax with the return value
+;;; pos  - byte offset within the vectors; shares a register with tmp3
+;;; PS   - pointer size in bytes, used to index the array of pointers
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define tmp2 rax
+%define tmp2.b al
+%define pos tmp3
+%define PS 8
+
+;;; NO_NT_LDST is defined unconditionally here, so XLDR/XSTR always resolve
+;;; to vmovdqu8 in this file.
+%define NO_NT_LDST
+;;; Select plain or non-temporal load/store
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+
+default rel
+[bits 64]
+
+section .text
+
+align 16
+;;; xor_gen_avx512(vects, len, array)
+;;; XORs array[0..vects-2] together and writes the result to array[vects-1].
+;;; Returns 0 on success and 1 on failure (vects - 2 <= 0).
+;;; The length is consumed from the end: single bytes until len is a
+;;; multiple of PS, then PS-byte words until len is a multiple of 128,
+;;; then 128-byte blocks from offset 0 upwards.
+mk_global xor_gen_avx512, function
+func(xor_gen_avx512)
+ FUNC_SAVE
+ sub vec, 2 ;Keep as offset to last source
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;Check alignment of length
+ jnz len_not_aligned
+
+len_aligned_128bytes:
+ sub len, 128 ;len now holds the offset of the last 128-byte block
+ mov pos, 0
+
+loop128:
+ mov tmp, vec ;Back to last vector
+ mov tmp2, [arg2+vec*PS] ;Fetch last pointer in array
+ sub tmp, 1 ;Next vect
+ XLDR zmm0, [tmp2+pos] ;Start with end of array in last vector
+ XLDR zmm1, [tmp2+pos+64] ;Keep xor parity in zmm0-1
+
+next_vect:
+ mov ptr, [arg2+tmp*PS]
+ sub tmp, 1 ;Sets the flags tested by jge below
+ XLDR zmm4, [ptr+pos] ;Get next vector (source)
+ XLDR zmm5, [ptr+pos+64]
+ vpxorq zmm0, zmm0, zmm4 ;Add to xor parity
+ vpxorq zmm1, zmm1, zmm5
+ jge next_vect ;Loop for each source
+
+ mov ptr, [arg2+PS+vec*PS] ;Address of parity vector
+ XSTR [ptr+pos], zmm0 ;Write parity xor vector
+ XSTR [ptr+pos+64], zmm1
+ add pos, 128
+ cmp pos, len
+ jle loop128 ;Continue until pos passes the last block offset
+
+return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+
+;;; Do one byte at a time for no alignment case
+;;; Each pass handles the byte at offset len-1, then shrinks len by one.
+loop_1byte:
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+vec*PS] ;Fetch last pointer in array
+ mov tmp2.b, [ptr+len-1] ;Get array n
+ sub tmp, 1
+nextvect_1byte:
+ mov ptr, [arg2+tmp*PS]
+ xor tmp2.b, [ptr+len-1]
+ sub tmp, 1
+ jge nextvect_1byte
+
+ mov tmp, vec
+ add tmp, 1 ;Add back to point to last vec
+ mov ptr, [arg2+tmp*PS]
+ mov [ptr+len-1], tmp2.b ;Write parity
+ sub len, 1
+ test len, (PS-1)
+ jnz loop_1byte ;Repeat until len is a multiple of PS
+
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;If not 0 and 128-byte aligned
+ jz len_aligned_128bytes ; then do aligned case. len = y * 128
+
+ ;; else we are 8-byte aligned so fall through to recheck
+
+
+ ;; Unaligned length cases
+len_not_aligned:
+ test len, (PS-1)
+ jne loop_1byte
+ mov tmp3, len
+ and tmp3, (128-1) ;Do the unaligned bytes 8 at a time
+
+ ;; Run backwards 8 bytes at a time for (tmp3) bytes
+loop8_bytes:
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+vec*PS] ;Fetch last pointer in array
+ mov tmp2, [ptr+len-PS] ;Get array n
+ sub tmp, 1
+nextvect_8bytes:
+ mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
+ xor tmp2, [ptr+len-PS]
+ sub tmp, 1
+ jge nextvect_8bytes ;Loop for each source
+
+ mov tmp, vec
+ add tmp, 1 ;Add back to point to last vec
+ mov ptr, [arg2+tmp*PS]
+ mov [ptr+len-PS], tmp2 ;Write parity
+ sub len, PS
+ sub tmp3, PS
+ jg loop8_bytes ;Until the (len mod 128) remainder is used up
+
+ cmp len, 128 ;Now len is aligned to 128B
+ jge len_aligned_128bytes ;We can do the rest aligned
+
+ cmp len, 0
+ je return_pass
+
+ ;; Also reached by falling through when len is neither >= 128 nor 0
+return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+%endif ; ifdef HAVE_AS_KNOWS_AVX512
diff --git a/src/isa-l/raid/xor_gen_perf.c b/src/isa-l/raid/xor_gen_perf.c
new file mode 100644
index 000000000..717e0ada7
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_perf.c
@@ -0,0 +1,90 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include<sys/time.h>
+#include "raid.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Loop many times over same
+# define TEST_SOURCES 10
+# define TEST_LEN 8*1024
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define TEST_SOURCES 10
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN GT_L3_CACHE / TEST_SOURCES
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))
+
+int main(int argc, char *argv[])
+{
+	// Times xor_gen() over TEST_SOURCES zero-filled sources plus one parity
+	// buffer and prints the result via perf_print(). Always returns 0 once
+	// the allocations have succeeded.
+	const int nvects = TEST_SOURCES + 1;
+	void *table;
+	void **buffs;
+	struct perf start;
+	int idx;
+
+	printf("Test xor_gen_perf\n");
+
+	// Pointer table handed to xor_gen()
+	if (posix_memalign(&table, 8, sizeof(int *) * (TEST_SOURCES + 6)) != 0) {
+		printf("alloc error: Fail");
+		return 1;
+	}
+	buffs = table;
+
+	// One 64-byte-aligned buffer of TEST_LEN bytes per vector
+	for (idx = 0; idx < nvects; idx++) {
+		void *chunk;
+
+		if (posix_memalign(&chunk, 64, TEST_LEN) != 0) {
+			printf("alloc error: Fail");
+			return 1;
+		}
+		buffs[idx] = chunk;
+	}
+
+	// Zero every buffer before timing
+	idx = 0;
+	while (idx < nvects) {
+		memset(buffs[idx], 0, TEST_LEN);
+		idx++;
+	}
+
+	BENCHMARK(&start, BENCHMARK_TIME, xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs));
+	printf("xor_gen" TEST_TYPE_STR ": ");
+	perf_print(start, (long long)TEST_MEM);
+
+	return 0;
+}
diff --git a/src/isa-l/raid/xor_gen_sse.asm b/src/isa-l/raid/xor_gen_sse.asm
new file mode 100644
index 000000000..f31ae63e4
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_sse.asm
@@ -0,0 +1,284 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized xor of N source vectors using SSE
+;;; int xor_gen_sse(int vects, int len, void **array)
+
+;;; Generates xor parity vector from N (vects-1) sources in array of pointers
+;;; (**array). Last pointer is the dest.
+;;; Vectors must be aligned to 16 bytes. Length can be any value.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ;; elf64: arguments are taken from rdi, rsi, rdx; pointer size is 8.
+ ;; FUNC_SAVE and FUNC_RESTORE expand to nothing.
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 rax
+ %define tmp2.b al
+ %define tmp3 arg4
+ %define return rax
+ %define PS 8
+ %define func(x) x: endbranch
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ ;; win64: arguments are taken from rcx, rdx, r8; pointer size is 8.
+ ;; xmm6 and xmm7 (both used by the 128-byte loop) are saved to a local
+ ;; frame at offsets 0*16 and 1*16 and reloaded from the same offsets.
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define return rax
+ %define tmp2 rax
+ %define tmp2.b al
+ %define PS 8
+ %define tmp r11
+ %define tmp3 r10
+ %define stack_size 2*16 + 8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ add rsp, stack_size
+ %endmacro
+
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ ;; elf32: arguments live on the stack at [ebp+8+PS*x]; pointer size is 4.
+ ;; arg0 stays a memory operand, while FUNC_SAVE loads arg(1) into ecx and
+ ;; arg(2) into edi. esi, edi and ebx are pushed by FUNC_SAVE and popped
+ ;; in reverse order by FUNC_RESTORE.
+ %define arg0 arg(0)
+ %define arg1 ecx
+ %define tmp2 eax
+ %define tmp2.b al
+ %define tmp3 edx
+ %define return eax
+ %define PS 4
+ %define func(x) x: endbranch
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2 edi ; must save/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ mov arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp ;if has frame pointer
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
+
+;;; Working aliases used by the code below
+;;; vec - vector count argument; holds the index of the last source after
+;;;       2 is subtracted at function entry
+;;; len - length argument in bytes
+;;; ptr - scratch pointer to the vector currently being read or written
+;;; pos - byte offset within the vectors; shares a register with tmp3
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos tmp3
+
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+%endif
+
+;;; Select plain or non-temporal load/store: non-temporal forms are used
+;;; unless NO_NT_LDST is defined
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+align 16
+;;; xor_gen_sse(vects, len, array)
+;;; XORs array[0..vects-2] together and writes the result to array[vects-1].
+;;; Returns 0 on success and 1 on failure (vects - 2 <= 0).
+;;; The length is consumed from the end: single bytes first, then PS-byte
+;;; words until len is a multiple of 128, then 128-byte blocks from offset
+;;; 0 upwards.
+mk_global xor_gen_sse, function
+func(xor_gen_sse)
+ FUNC_SAVE
+%ifidn PS,8 ;64-bit code
+ sub vec, 2 ; Keep as offset to last source
+%else ;32-bit code
+ mov tmp, arg(0) ; Update vec length arg to last source
+ sub tmp, 2
+ mov arg(0), tmp
+%endif
+
+ ;; Flags tested here come from the "sub ..., 2" above; the 32-bit
+ ;; "mov arg(0), tmp" in between does not alter them.
+ jng return_fail ;Must have at least 2 sources
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;Check alignment of length
+ jnz len_not_aligned
+
+
+len_aligned_128bytes:
+ sub len, 128 ;len now holds the offset of the last 128-byte block
+ mov pos, 0
+ mov tmp, vec ;Preset to last vector
+
+loop128:
+ mov tmp2, [arg2+tmp*PS] ;Fetch last pointer in array
+ sub tmp, 1 ;Next vect
+ XLDR xmm0, [tmp2+pos] ;Start with end of array in last vector
+ XLDR xmm1, [tmp2+pos+16] ;Keep xor parity in xmm0-7
+ XLDR xmm2, [tmp2+pos+(2*16)]
+ XLDR xmm3, [tmp2+pos+(3*16)]
+ XLDR xmm4, [tmp2+pos+(4*16)]
+ XLDR xmm5, [tmp2+pos+(5*16)]
+ XLDR xmm6, [tmp2+pos+(6*16)]
+ XLDR xmm7, [tmp2+pos+(7*16)]
+
+next_vect:
+ mov ptr, [arg2+tmp*PS]
+ sub tmp, 1 ;Sets the flags tested by jge below
+ xorpd xmm0, [ptr+pos] ;Get next vector (source)
+ xorpd xmm1, [ptr+pos+16]
+ xorpd xmm2, [ptr+pos+(2*16)]
+ xorpd xmm3, [ptr+pos+(3*16)]
+ xorpd xmm4, [ptr+pos+(4*16)]
+ xorpd xmm5, [ptr+pos+(5*16)]
+ xorpd xmm6, [ptr+pos+(6*16)]
+ xorpd xmm7, [ptr+pos+(7*16)]
+;;; prefetch [ptr+pos+(8*16)]
+ jge next_vect ;Loop for each vect
+
+
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+PS+tmp*PS] ;Address of parity vector
+ XSTR [ptr+pos], xmm0 ;Write parity xor vector
+ XSTR [ptr+pos+(1*16)], xmm1
+ XSTR [ptr+pos+(2*16)], xmm2
+ XSTR [ptr+pos+(3*16)], xmm3
+ XSTR [ptr+pos+(4*16)], xmm4
+ XSTR [ptr+pos+(5*16)], xmm5
+ XSTR [ptr+pos+(6*16)], xmm6
+ XSTR [ptr+pos+(7*16)], xmm7
+ add pos, 128
+ cmp pos, len
+ jle loop128 ;Continue until pos passes the last block offset
+
+return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+
+
+;;; Do one byte at a time for no alignment case
+;;; Each pass handles the byte at offset len-1, then shrinks len by one.
+
+xor_gen_byte:
+ mov tmp, vec ;Preset to last vector
+
+loop_1byte:
+ mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
+ mov tmp2.b, [ptr+len-1] ;Get array n
+ sub tmp, 1
+nextvect_1byte:
+ mov ptr, [arg2+tmp*PS]
+ xor tmp2.b, [ptr+len-1]
+ sub tmp, 1
+ jge nextvect_1byte
+
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+PS+tmp*PS] ;Get last vec
+ mov [ptr+len-1], tmp2.b ;Write parity
+ sub len, 1
+ ;; NOTE(review): the mask below is a literal 8-1 while len_not_aligned
+ ;; tests PS-1, so with PS=4 the byte loop runs until len is a multiple
+ ;; of 8 rather than of 4 - confirm whether PS-1 was intended.
+ test len, (8-1)
+ jnz loop_1byte
+
+ cmp len, 0
+ je return_pass
+ test len, (128-1) ;If not 0 and 128-byte aligned
+ jz len_aligned_128bytes ; then do aligned case. len = y * 128
+
+ ;; else we are 8-byte aligned so fall through to recheck
+
+
+ ;; Unaligned length cases
+len_not_aligned:
+ test len, (PS-1)
+ jne xor_gen_byte
+ mov tmp3, len
+ and tmp3, (128-1) ;Do the unaligned bytes 4-8 at a time
+ mov tmp, vec ;Preset to last vector
+
+ ;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes
+loopN_bytes:
+ mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array
+ mov tmp2, [ptr+len-PS] ;Get array n
+ sub tmp, 1
+nextvect_Nbytes:
+ mov ptr, [arg2+tmp*PS] ;Get pointer to next vector
+ xor tmp2, [ptr+len-PS]
+ sub tmp, 1
+ jge nextvect_Nbytes ;Loop for each source
+
+ mov tmp, vec ;Back to last vector
+ mov ptr, [arg2+PS+tmp*PS] ;Get last vec
+ mov [ptr+len-PS], tmp2 ;Write parity
+ sub len, PS
+ sub tmp3, PS
+ jg loopN_bytes ;Until the (len mod 128) remainder is used up
+
+ cmp len, 128 ;Now len is aligned to 128B
+ jge len_aligned_128bytes ;We can do the rest aligned
+
+ cmp len, 0
+ je return_pass
+
+ ;; Also reached by falling through when len is neither >= 128 nor 0
+return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+;;; func core, ver, snum
+slversion xor_gen_sse, 00, 0c, 0030
+
diff --git a/src/isa-l/raid/xor_gen_test.c b/src/isa-l/raid/xor_gen_test.c
new file mode 100644
index 000000000..ee922bfaf
--- /dev/null
+++ b/src/isa-l/raid/xor_gen_test.c
@@ -0,0 +1,165 @@
+/**********************************************************************
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include<stdio.h>
+#include<stdint.h>
+#include<string.h>
+#include<stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN 1024
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+	// Fill buf[0 .. buffer_size) front to back with one rand() result per
+	// byte; a non-positive buffer_size writes nothing.
+	unsigned char *cursor = buf;
+	long remaining = buffer_size;
+
+	while (remaining > 0) {
+		*cursor++ = rand();
+		remaining--;
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	// Runs xor_gen() over all-zero data, random data, every vector count
+	// from 3 to TEST_SOURCES + 1, every length from 0 to TEST_LEN, and
+	// buffers that end at the allocation boundary. Results are checked with
+	// xor_check_base(). Returns 1 on the first detected failure.
+	int fail = 0;
+	int v, nvects, len, off;
+	void *vects[TEST_SOURCES + 1];
+	char *shifted[TEST_SOURCES + 1];
+
+	printf("Test xor_gen_test ");
+
+	srand(TEST_SEED);
+
+	// One 32-byte-aligned buffer per source plus one for the parity
+	for (v = 0; v <= TEST_SOURCES; v++) {
+		void *mem;
+
+		if (posix_memalign(&mem, 32, TEST_LEN) != 0) {
+			printf("alloc error: Fail");
+			return 1;
+		}
+		vects[v] = mem;
+	}
+
+	// All-zero sources must give all-zero parity
+	for (v = 0; v <= TEST_SOURCES; v++)
+		memset(vects[v], 0, TEST_LEN);
+
+	xor_gen(TEST_SOURCES + 1, TEST_LEN, vects);
+
+	for (off = 0; off < TEST_LEN; off++)
+		fail += (((char *)vects[TEST_SOURCES])[off] != 0);
+
+	if (fail > 0) {
+		printf("fail zero test");
+		return 1;
+	}
+	putchar('.');
+
+	// Random data, full vector count
+	for (v = 0; v <= TEST_SOURCES; v++)
+		rand_buffer(vects[v], TEST_LEN);
+
+	xor_gen(TEST_SOURCES + 1, TEST_LEN, vects);
+
+	fail |= xor_check_base(TEST_SOURCES + 1, TEST_LEN, vects);
+
+	if (fail > 0) {
+		printf("fail rand test %d\n", fail);
+		return 1;
+	}
+	putchar('.');
+
+	// Random data, every vector count at full length
+	for (nvects = 3; nvects <= TEST_SOURCES + 1; nvects++) {
+		for (v = 0; v < nvects; v++)
+			rand_buffer(vects[v], TEST_LEN);
+
+		xor_gen(nvects, TEST_LEN, vects);
+		fail |= xor_check_base(nvects, TEST_LEN, vects);
+
+		if (fail > 0) {
+			printf("fail rand test %d sources\n", nvects);
+			return 1;
+		}
+		putchar('.');
+	}
+
+	fflush(0);
+
+	// Random data, every vector count at every length 0..TEST_LEN
+	for (len = 0; len <= TEST_LEN; len++) {
+		for (nvects = 3; nvects <= TEST_SOURCES + 1; nvects++) {
+			for (v = 0; v < nvects; v++)
+				rand_buffer(vects[v], len);
+
+			xor_gen(nvects, len, vects);
+			fail |= xor_check_base(nvects, len, vects);
+
+			if (fail > 0) {
+				printf("fail rand test %d sources, len=%d, ret=%d\n", nvects, len,
+				       fail);
+				return 1;
+			}
+		}
+		putchar('.');
+	}
+
+	// Shift the start in 32-byte steps so each run ends at the buffer end
+	for (off = 0; off < TEST_LEN; off += 32) {
+		const int tail = TEST_LEN - off;
+
+		for (v = 0; v < TEST_SOURCES + 1; v++) {
+			shifted[v] = (char *)vects[v] + off;
+			rand_buffer((unsigned char *)vects[v] + off, tail);
+		}
+
+		xor_gen(TEST_SOURCES + 1, tail, (void *)shifted);
+		fail |= xor_check_base(TEST_SOURCES + 1, tail, (void *)shifted);
+
+		if (fail > 0) {
+			printf("fail end test - offset: %d, len: %d\n", off, tail);
+			return 1;
+		}
+
+		putchar('.');
+		fflush(0);
+	}
+
+	if (!fail)
+		printf(" done: Pass\n");
+
+	return fail;
+}