| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
| commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
| tree | 64f88b554b444a49f656b6c656111a145cbbaa28 | /src/isa-l/raid |
| parent | Initial commit. (diff) | |
| download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz, ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip | |
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat
30 files changed, 5781 insertions, 0 deletions
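The diff below adds the ISA-L RAID kernels (xor_gen, xor_check, pq_gen, pq_check) with SSE/AVX/AVX2/AVX-512 and aarch64 NEON variants plus a scalar base and dispatchers. For orientation, the P/Q math these kernels accelerate is: P is the byte-wise XOR of all data sources, and Q accumulates the sources in GF(2^8) where "multiply by 2" is a left shift reduced by the 0x1D polynomial, matching the 0x1D constants in the SSE and NEON code and the ref_multi_pq reference in pq_check_test.c further down. A minimal scalar sketch of that recurrence, with illustrative names (this is not the ISA-L public API):

```c
/*
 * Minimal scalar sketch of the P/Q parity computation these kernels
 * accelerate (illustrative only; mirrors the ref_multi_pq reference in
 * pq_check_test.c, not the ISA-L public API).  The array holds vects
 * pointers: vects-2 data buffers followed by the P and Q outputs.
 */
#include <stdint.h>

static void pq_gen_scalar(int vects, int len, uint8_t **array)
{
	uint8_t *p = array[vects - 2];	/* second-to-last pointer: P parity */
	uint8_t *q = array[vects - 1];	/* last pointer: Q parity */

	for (int i = 0; i < len; i++) {
		uint8_t pv = 0, qv = 0;

		/* Walk the data sources from last to first (Horner order). */
		for (int j = vects - 3; j >= 0; j--) {
			uint8_t s = array[j][i];

			pv ^= s;	/* P: plain XOR of all sources */
			/* Q: qv = 2*qv + s in GF(2^8); the "*2" is a left
			 * shift reduced by 0x1D when bit 7 was set, the same
			 * polynomial constant used by the SSE/NEON code. */
			qv = (uint8_t)((qv << 1) ^ ((qv & 0x80) ? 0x1d : 0)) ^ s;
		}
		p[i] = pv;
		q[i] = qv;
	}
}
```

The SIMD versions in the diff compute the same recurrence a full vector register at a time (pcmpgtb/cmhs builds the 0x1D mask from the high bit, paddb/shl doubles, pxor/eor folds in the next source), which is why the assembly routines require lengths that are a multiple of 16 bytes.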
diff --git a/src/isa-l/raid/Makefile.am b/src/isa-l/raid/Makefile.am new file mode 100644 index 000000000..5f98668d5 --- /dev/null +++ b/src/isa-l/raid/Makefile.am @@ -0,0 +1,67 @@ +######################################################################## +# Copyright(c) 2011-2015 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +include raid/aarch64/Makefile.am + +lsrc += raid/raid_base.c + +lsrc_base_aliases += raid/raid_base_aliases.c +lsrc_ppc64le += raid/raid_base_aliases.c + +lsrc_x86_64 += \ + raid/xor_gen_sse.asm \ + raid/pq_gen_sse.asm \ + raid/xor_check_sse.asm \ + raid/pq_check_sse.asm \ + raid/pq_gen_avx.asm \ + raid/xor_gen_avx.asm \ + raid/pq_gen_avx2.asm \ + raid/xor_gen_avx512.asm \ + raid/pq_gen_avx512.asm \ + raid/raid_multibinary.asm + +lsrc_x86_32 += \ + raid/xor_gen_sse.asm \ + raid/pq_gen_sse_i32.asm \ + raid/xor_check_sse.asm \ + raid/pq_check_sse_i32.asm \ + raid/raid_multibinary_i32.asm + + +extern_hdrs += include/raid.h + +other_src += include/test.h include/types.h + +check_tests += raid/xor_gen_test raid/pq_gen_test raid/xor_check_test raid/pq_check_test + +perf_tests += raid/xor_gen_perf raid/pq_gen_perf + +examples += raid/xor_example + +lsrc32 += xor_gen_sse.asm pq_gen_sse_i32.asm xor_check_sse.asm pq_check_sse_i32.asm raid_base.c diff --git a/src/isa-l/raid/aarch64/Makefile.am b/src/isa-l/raid/aarch64/Makefile.am new file mode 100644 index 000000000..d08c8d67a --- /dev/null +++ b/src/isa-l/raid/aarch64/Makefile.am @@ -0,0 +1,36 @@ +######################################################################## +# Copyright(c) 2019 Arm Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Arm Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################### + +lsrc_aarch64 += \ + raid/aarch64/xor_gen_neon.S \ + raid/aarch64/pq_gen_neon.S \ + raid/aarch64/xor_check_neon.S \ + raid/aarch64/pq_check_neon.S \ + raid/aarch64/raid_multibinary_arm.S \ + raid/aarch64/raid_aarch64_dispatcher.c diff --git a/src/isa-l/raid/aarch64/pq_check_neon.S b/src/isa-l/raid/aarch64/pq_check_neon.S new file mode 100644 index 000000000..55ad79829 --- /dev/null +++ b/src/isa-l/raid/aarch64/pq_check_neon.S @@ -0,0 +1,341 @@ +######################################################################## +# Copyright(c) 2019 Arm Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Arm Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+######################################################################### + +.text + +.global pq_check_neon +.type pq_check_neon, %function + +/* int pq_check_neon(int vects, int len, void **src) */ + +/* arguments */ +w_vects .req w0 /* MUST >= 3 */ +x_vects .req x0 +w_len .req w1 /* MUST be 16x bytes */ +x_len .req x1 +x_src .req x2 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_dst_p .req x3 +x_dst_q .req x4 +x_dst_q_end .req x5 +w_col .req w6 +x_col .req x6 +x_src_ptr .req x7 +x_src_ptr_end .req x9 +x_src_last .req x10 +x_srcn .req x11 +w_min .req w12 +/* vectors */ +/* v0 ~ v7 : temporary p */ +/* v8 ~ v15: temporary q */ +/* v16 ~ v23: next 128 bytes */ +v_mask0 .req v24 +v_mask1 .req v25 +v_mask2 .req v26 +v_mask3 .req v27 +v_gf8poly .req v28 +v_0x80 .req v29 + +/* + * src_ptr_end --> + * -------+----------+ + * . | src[0] | + * . +----------+ +------------------+ + * src_ptr --> | src[1] | - srcn -> | buffer | + * . +----------+ +------------------+ + * . | ...... | + * . +----------+ + * . | src[v-4] | + * -------+----------+ src_last +------------------+ + * src --> | src[v-3] | ---------> | buffer | + * +----------+ +------------------+ + * | src[v-2] | - dst_p -> | buffer | + * +----------+ +------------------+ + * | src[v-1] | - dst_q -> | buffer | dst_q_end + * +----------+ +------------------+ + */ + +pq_check_neon: + sub x_src_ptr_end, x_src, #8 + + sub w_vects, w_vects, #3 + add x_src, x_src, x_vects, lsl #3 + + ldr x_src_last, [x_src] + ldp x_dst_p, x_dst_q, [x_src, #8] + + add x_dst_q_end, x_dst_q, x_len + + mov w_min, #-1 + mov w_col, #0 + movi v_gf8poly.16b, #0x1D + movi v_0x80.16b, #0x80 + +.Lloop128_init: + /* less than 128 byts? */ + cmp w_len, #128 + blo .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_dst_q_end, x_dst_q_end, #128 + + /* batch process (vects-2)*128 bytes */ + /* v0~v7: p; v8~v15: q; v16~v23: in */ +.Lloop128: + ldr q0, [x_src_last, #16*0] + ldr q1, [x_src_last, #16*1] + ldr q2, [x_src_last, #16*2] + ldr q3, [x_src_last, #16*3] + ldr q4, [x_src_last, #16*4] + ldr q5, [x_src_last, #16*5] + ldr q6, [x_src_last, #16*6] + ldr q7, [x_src_last, #16*7] + add x_src_last, x_src_last, #128 + + mov v8.16b, v0.16b + mov v9.16b, v1.16b + mov v10.16b, v2.16b + mov v11.16b, v3.16b + mov v12.16b, v4.16b + mov v13.16b, v5.16b + mov v14.16b, v6.16b + mov v15.16b, v7.16b + + cbz w_vects, .Lloop128_vects_end + + sub x_src_ptr, x_src, #8 +.Lloop128_vects: + ldr x_srcn, [x_src_ptr], #-8 + add x_srcn, x_srcn, x_col + cmp x_src_ptr, x_src_ptr_end + + ldr q16, [x_srcn, #16*0] + ldr q17, [x_srcn, #16*1] + ldr q18, [x_srcn, #16*2] + ldr q19, [x_srcn, #16*3] + ldr q20, [x_srcn, #16*4] + ldr q21, [x_srcn, #16*5] + ldr q22, [x_srcn, #16*6] + ldr q23, [x_srcn, #16*7] + + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + eor v4.16b, v4.16b, v20.16b + eor v5.16b, v5.16b, v21.16b + eor v6.16b, v6.16b, v22.16b + eor v7.16b, v7.16b, v23.16b + + cmhs v_mask0.16b, v8.16b, v_0x80.16b + cmhs v_mask1.16b, v9.16b, v_0x80.16b + cmhs v_mask2.16b, v10.16b, v_0x80.16b + cmhs v_mask3.16b, v11.16b, v_0x80.16b + and v_mask0.16b, v_mask0.16b, v_gf8poly.16b + and v_mask1.16b, v_mask1.16b, v_gf8poly.16b + and v_mask2.16b, v_mask2.16b, v_gf8poly.16b + and v_mask3.16b, v_mask3.16b, v_gf8poly.16b + shl v8.16b, v8.16b, #1 + shl v9.16b, v9.16b, #1 + shl v10.16b, v10.16b, #1 + shl v11.16b, v11.16b, #1 + 
eor v8.16b, v8.16b, v_mask0.16b + eor v9.16b, v9.16b, v_mask1.16b + eor v10.16b, v10.16b, v_mask2.16b + eor v11.16b, v11.16b, v_mask3.16b + eor v8.16b, v8.16b, v16.16b + eor v9.16b, v9.16b, v17.16b + eor v10.16b, v10.16b, v18.16b + eor v11.16b, v11.16b, v19.16b + + cmhs v_mask0.16b, v12.16b, v_0x80.16b + cmhs v_mask1.16b, v13.16b, v_0x80.16b + cmhs v_mask2.16b, v14.16b, v_0x80.16b + cmhs v_mask3.16b, v15.16b, v_0x80.16b + and v_mask0.16b, v_mask0.16b, v_gf8poly.16b + and v_mask1.16b, v_mask1.16b, v_gf8poly.16b + and v_mask2.16b, v_mask2.16b, v_gf8poly.16b + and v_mask3.16b, v_mask3.16b, v_gf8poly.16b + shl v12.16b, v12.16b, #1 + shl v13.16b, v13.16b, #1 + shl v14.16b, v14.16b, #1 + shl v15.16b, v15.16b, #1 + eor v12.16b, v12.16b, v_mask0.16b + eor v13.16b, v13.16b, v_mask1.16b + eor v14.16b, v14.16b, v_mask2.16b + eor v15.16b, v15.16b, v_mask3.16b + eor v12.16b, v12.16b, v20.16b + eor v13.16b, v13.16b, v21.16b + eor v14.16b, v14.16b, v22.16b + eor v15.16b, v15.16b, v23.16b + + bne .Lloop128_vects + +.Lloop128_vects_end: + /* v16~v23: true p, q */ + ldr q16, [x_dst_p, #16*0] + ldr q17, [x_dst_p, #16*1] + ldr q18, [x_dst_p, #16*2] + ldr q19, [x_dst_p, #16*3] + ldr q20, [x_dst_p, #16*4] + ldr q21, [x_dst_p, #16*5] + ldr q22, [x_dst_p, #16*6] + ldr q23, [x_dst_p, #16*7] + + cmeq v0.16b, v0.16b, v16.16b + cmeq v1.16b, v1.16b, v17.16b + cmeq v2.16b, v2.16b, v18.16b + cmeq v3.16b, v3.16b, v19.16b + cmeq v4.16b, v4.16b, v20.16b + cmeq v5.16b, v5.16b, v21.16b + cmeq v6.16b, v6.16b, v22.16b + cmeq v7.16b, v7.16b, v23.16b + + ldr q16, [x_dst_q, #16*0] + ldr q17, [x_dst_q, #16*1] + ldr q18, [x_dst_q, #16*2] + ldr q19, [x_dst_q, #16*3] + ldr q20, [x_dst_q, #16*4] + ldr q21, [x_dst_q, #16*5] + ldr q22, [x_dst_q, #16*6] + ldr q23, [x_dst_q, #16*7] + + and v0.16b, v0.16b, v1.16b + and v2.16b, v2.16b, v3.16b + and v4.16b, v4.16b, v5.16b + and v6.16b, v6.16b, v7.16b + and v0.16b, v0.16b, v2.16b + and v4.16b, v4.16b, v6.16b + and v0.16b, v0.16b, v4.16b + + cmeq v8.16b, v8.16b, v16.16b + cmeq v9.16b, v9.16b, v17.16b + cmeq v10.16b, v10.16b, v18.16b + cmeq v11.16b, v11.16b, v19.16b + cmeq v12.16b, v12.16b, v20.16b + cmeq v13.16b, v13.16b, v21.16b + cmeq v14.16b, v14.16b, v22.16b + cmeq v15.16b, v15.16b, v23.16b + + and v8.16b, v8.16b, v9.16b + and v10.16b, v10.16b, v11.16b + and v12.16b, v12.16b, v13.16b + and v14.16b, v14.16b, v15.16b + and v8.16b, v8.16b, v10.16b + and v12.16b, v12.16b, v14.16b + and v8.16b, v8.16b, v12.16b + + and v0.16b, v0.16b, v8.16b + + uminv b0, v0.16b + umov w_min, v0.b[0] + cbz w_min, .Lloop128_end + + add x_dst_p, x_dst_p, #128 + add x_dst_q, x_dst_q, #128 + cmp x_dst_q, x_dst_q_end + add w_col, w_col, #128 + bls .Lloop128 + +.Lloop128_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + cbz w_min, .Lerror + + add x_dst_q_end, x_dst_q_end, #128 + +.Lloop16_init: + tst w_len, #0x7F + beq .Lloop16_end + sub x_dst_q_end, x_dst_q_end, #16 + + /* batch process (vects-2)*16 bytes */ + /* v0: p; v1: q; v2: in; v3: mask */ +.Lloop16: + ldr q0, [x_src_last], #16 + mov v1.16b, v0.16b + + cbz w_vects, .Lloop16_vects_end + + sub x_src_ptr, x_src, #8 +.Lloop16_vects: + ldr x_srcn, [x_src_ptr], #-8 + ldr q2, [x_srcn, x_col] + cmp x_src_ptr, x_src_ptr_end + + eor v0.16b, v0.16b, v2.16b + + cmhs v3.16b, v1.16b, v_0x80.16b + and v3.16b, v3.16b, v_gf8poly.16b + + shl v1.16b, v1.16b, #1 + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v3.16b + + bne .Lloop16_vects + +.Lloop16_vects_end: + /* v4: true p; 
v5: true q */ + ldr q4, [x_dst_p], #16 + ldr q5, [x_dst_q], #16 + cmp x_dst_q, x_dst_q_end + + cmeq v0.16b, v0.16b, v4.16b + cmeq v1.16b, v1.16b, v5.16b + and v0.16b, v0.16b, v1.16b + + uminv b0, v0.16b + umov w_min, v0.b[0] + cbz w_min, .Lerror + + add w_col, w_col, #16 + bls .Lloop16 + +.Lloop16_end: + mov w_ret, #0 + ret + +.Lerror: + mov w_ret, #1 + ret diff --git a/src/isa-l/raid/aarch64/pq_gen_neon.S b/src/isa-l/raid/aarch64/pq_gen_neon.S new file mode 100644 index 000000000..f60ad1211 --- /dev/null +++ b/src/isa-l/raid/aarch64/pq_gen_neon.S @@ -0,0 +1,282 @@ +######################################################################## +# Copyright(c) 2019 Arm Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Arm Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################### + +.text + +.global pq_gen_neon +.type pq_gen_neon, %function + +/* int pq_gen_neon(int vects, int len, void **src) */ + +/* arguments */ +w_vects .req w0 /* MUST >= 3 */ +x_vects .req x0 +w_len .req w1 /* MUST be 16x bytes */ +x_len .req x1 +x_src .req x2 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_dst_p .req x3 +x_dst_q .req x4 +x_dst_q_end .req x5 +w_col .req w6 +x_col .req x6 +x_src_ptr .req x7 +x_src_ptr_end .req x9 +x_src_last .req x10 +x_srcn .req x11 +/* vectors */ +/* v0 ~ v7 : temporary p */ +/* v8 ~ v15: temporary q */ +/* v16 ~ v23: next 128 bytes */ +v_mask0 .req v24 +v_mask1 .req v25 +v_mask2 .req v26 +v_mask3 .req v27 +v_gf8poly .req v28 +v_0x80 .req v29 + +/* + * src_ptr_end --> + * -------+----------+ + * . | src[0] | + * . +----------+ +------------------+ + * src_ptr --> | src[1] | - srcn -> | buffer | + * . +----------+ +------------------+ + * . | ...... | + * . +----------+ + * . 
| src[v-4] | + * -------+----------+ src_last +------------------+ + * src --> | src[v-3] | ---------> | buffer | + * +----------+ +------------------+ + * | src[v-2] | - dst_p -> | buffer | + * +----------+ +------------------+ + * | src[v-1] | - dst_q -> | buffer | dst_q_end + * +----------+ +------------------+ + */ + +pq_gen_neon: + sub x_src_ptr_end, x_src, #8 + + sub w_vects, w_vects, #3 + add x_src, x_src, x_vects, lsl #3 + + ldr x_src_last, [x_src] + ldp x_dst_p, x_dst_q, [x_src, #8] + + add x_dst_q_end, x_dst_q, x_len + + mov w_col, #0 + movi v_gf8poly.16b, #0x1D + movi v_0x80.16b, #0x80 + +.Lloop128_init: + /* less than 128 byts? */ + cmp w_len, #128 + blo .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_dst_q_end, x_dst_q_end, #128 + + /* batch process (vects-2)*128 bytes */ + /* v0~v7: p; v8~v15: q; v16~v23: in */ +.Lloop128: + ldr q0, [x_src_last, #16*0] + ldr q1, [x_src_last, #16*1] + ldr q2, [x_src_last, #16*2] + ldr q3, [x_src_last, #16*3] + ldr q4, [x_src_last, #16*4] + ldr q5, [x_src_last, #16*5] + ldr q6, [x_src_last, #16*6] + ldr q7, [x_src_last, #16*7] + add x_src_last, x_src_last, #128 + + mov v8.16b, v0.16b + mov v9.16b, v1.16b + mov v10.16b, v2.16b + mov v11.16b, v3.16b + mov v12.16b, v4.16b + mov v13.16b, v5.16b + mov v14.16b, v6.16b + mov v15.16b, v7.16b + + cbz w_vects, .Lloop128_vects_end + + sub x_src_ptr, x_src, #8 +.Lloop128_vects: + ldr x_srcn, [x_src_ptr], #-8 + add x_srcn, x_srcn, x_col + cmp x_src_ptr, x_src_ptr_end + + ldr q16, [x_srcn, #16*0] + ldr q17, [x_srcn, #16*1] + ldr q18, [x_srcn, #16*2] + ldr q19, [x_srcn, #16*3] + ldr q20, [x_srcn, #16*4] + ldr q21, [x_srcn, #16*5] + ldr q22, [x_srcn, #16*6] + ldr q23, [x_srcn, #16*7] + + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + eor v4.16b, v4.16b, v20.16b + eor v5.16b, v5.16b, v21.16b + eor v6.16b, v6.16b, v22.16b + eor v7.16b, v7.16b, v23.16b + + cmhs v_mask0.16b, v8.16b, v_0x80.16b + cmhs v_mask1.16b, v9.16b, v_0x80.16b + cmhs v_mask2.16b, v10.16b, v_0x80.16b + cmhs v_mask3.16b, v11.16b, v_0x80.16b + and v_mask0.16b, v_mask0.16b, v_gf8poly.16b + and v_mask1.16b, v_mask1.16b, v_gf8poly.16b + and v_mask2.16b, v_mask2.16b, v_gf8poly.16b + and v_mask3.16b, v_mask3.16b, v_gf8poly.16b + shl v8.16b, v8.16b, #1 + shl v9.16b, v9.16b, #1 + shl v10.16b, v10.16b, #1 + shl v11.16b, v11.16b, #1 + eor v8.16b, v8.16b, v_mask0.16b + eor v9.16b, v9.16b, v_mask1.16b + eor v10.16b, v10.16b, v_mask2.16b + eor v11.16b, v11.16b, v_mask3.16b + eor v8.16b, v8.16b, v16.16b + eor v9.16b, v9.16b, v17.16b + eor v10.16b, v10.16b, v18.16b + eor v11.16b, v11.16b, v19.16b + + cmhs v_mask0.16b, v12.16b, v_0x80.16b + cmhs v_mask1.16b, v13.16b, v_0x80.16b + cmhs v_mask2.16b, v14.16b, v_0x80.16b + cmhs v_mask3.16b, v15.16b, v_0x80.16b + and v_mask0.16b, v_mask0.16b, v_gf8poly.16b + and v_mask1.16b, v_mask1.16b, v_gf8poly.16b + and v_mask2.16b, v_mask2.16b, v_gf8poly.16b + and v_mask3.16b, v_mask3.16b, v_gf8poly.16b + shl v12.16b, v12.16b, #1 + shl v13.16b, v13.16b, #1 + shl v14.16b, v14.16b, #1 + shl v15.16b, v15.16b, #1 + eor v12.16b, v12.16b, v_mask0.16b + eor v13.16b, v13.16b, v_mask1.16b + eor v14.16b, v14.16b, v_mask2.16b + eor v15.16b, v15.16b, v_mask3.16b + eor v12.16b, v12.16b, v20.16b + eor v13.16b, v13.16b, v21.16b + eor v14.16b, v14.16b, v22.16b + eor v15.16b, v15.16b, v23.16b + + bne .Lloop128_vects + +.Lloop128_vects_end: + str q0, 
[x_dst_p, #16*0] + str q1, [x_dst_p, #16*1] + str q2, [x_dst_p, #16*2] + str q3, [x_dst_p, #16*3] + str q4, [x_dst_p, #16*4] + str q5, [x_dst_p, #16*5] + str q6, [x_dst_p, #16*6] + str q7, [x_dst_p, #16*7] + + str q8, [x_dst_q, #16*0] + str q9, [x_dst_q, #16*1] + str q10, [x_dst_q, #16*2] + str q11, [x_dst_q, #16*3] + str q12, [x_dst_q, #16*4] + str q13, [x_dst_q, #16*5] + str q14, [x_dst_q, #16*6] + str q15, [x_dst_q, #16*7] + + add x_dst_p, x_dst_p, #128 + add x_dst_q, x_dst_q, #128 + cmp x_dst_q, x_dst_q_end + add w_col, w_col, #128 + bls .Lloop128 + +.Lloop128_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + add x_dst_q_end, x_dst_q_end, #128 + +.Lloop16_init: + tst w_len, #0x7F + beq .Lloop16_end + sub x_dst_q_end, x_dst_q_end, #16 + + /* batch process (vects-2)*16 bytes */ + /* v0: p; v1: q; v2: in; v3: mask */ +.Lloop16: + ldr q0, [x_src_last], #16 + mov v1.16b, v0.16b + + cbz w_vects, .Lloop16_vects_end + + sub x_src_ptr, x_src, #8 +.Lloop16_vects: + ldr x_srcn, [x_src_ptr], #-8 + ldr q2, [x_srcn, x_col] + cmp x_src_ptr, x_src_ptr_end + + eor v0.16b, v0.16b, v2.16b + + cmhs v3.16b, v1.16b, v_0x80.16b + and v3.16b, v3.16b, v_gf8poly.16b + + shl v1.16b, v1.16b, #1 + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v3.16b + + bne .Lloop16_vects + +.Lloop16_vects_end: + str q0, [x_dst_p], #16 + str q1, [x_dst_q], #16 + cmp x_dst_q, x_dst_q_end + add w_col, w_col, #16 + bls .Lloop16 + +.Lloop16_end: + mov w_ret, #0 + ret diff --git a/src/isa-l/raid/aarch64/raid_aarch64_dispatcher.c b/src/isa-l/raid/aarch64/raid_aarch64_dispatcher.c new file mode 100644 index 000000000..c81bd8c98 --- /dev/null +++ b/src/isa-l/raid/aarch64/raid_aarch64_dispatcher.c @@ -0,0 +1,61 @@ +/********************************************************************** + Copyright(c) 2019 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include <aarch64_multibinary.h> + +DEFINE_INTERFACE_DISPATCHER(xor_gen) +{ + if (getauxval(AT_HWCAP) & HWCAP_ASIMD) + return PROVIDER_INFO(xor_gen_neon); + return PROVIDER_BASIC(xor_gen); + +} + +DEFINE_INTERFACE_DISPATCHER(xor_check) +{ + if (getauxval(AT_HWCAP) & HWCAP_ASIMD) + return PROVIDER_INFO(xor_check_neon); + return PROVIDER_BASIC(xor_check); + +} + +DEFINE_INTERFACE_DISPATCHER(pq_gen) +{ + if (getauxval(AT_HWCAP) & HWCAP_ASIMD) + return PROVIDER_INFO(pq_gen_neon); + return PROVIDER_BASIC(pq_gen); + +} + +DEFINE_INTERFACE_DISPATCHER(pq_check) +{ + if (getauxval(AT_HWCAP) & HWCAP_ASIMD) + return PROVIDER_INFO(pq_check_neon); + return PROVIDER_BASIC(pq_check); + +} diff --git a/src/isa-l/raid/aarch64/raid_multibinary_arm.S b/src/isa-l/raid/aarch64/raid_multibinary_arm.S new file mode 100644 index 000000000..0316239ec --- /dev/null +++ b/src/isa-l/raid/aarch64/raid_multibinary_arm.S @@ -0,0 +1,36 @@ +######################################################################## +# Copyright(c) 2019 Arm Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Arm Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################### + +#include "aarch64_multibinary.h" + + +mbin_interface xor_gen +mbin_interface xor_check +mbin_interface pq_gen +mbin_interface pq_check diff --git a/src/isa-l/raid/aarch64/xor_check_neon.S b/src/isa-l/raid/aarch64/xor_check_neon.S new file mode 100644 index 000000000..95cb7d1d1 --- /dev/null +++ b/src/isa-l/raid/aarch64/xor_check_neon.S @@ -0,0 +1,271 @@ +######################################################################## +# Copyright(c) 2019 Arm Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Arm Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################### + +.text + +.global xor_check_neon +.type xor_check_neon, %function + +/* int xor_check_neon(int vects, int len, void **src) */ + +/* arguments */ +w_vects .req w0 /* MUST >= 2 */ +x_vects .req x0 +w_len .req w1 +x_len .req x1 +x_src .req x2 + +/* returns */ +w_ret .req w0 + +/* local variables */ +w_in .req w1 /* share w_len */ +x_src0 .req x3 +x_src0_end .req x4 +w_len256 .req w5 /* share w_len16 */ +x_len256 .req x5 +w_len16 .req w5 +x_len16 .req x5 +w_col .req w6 +x_col .req x6 +x_src_ptr .req x7 +x_srcn .req x9 +x_src_ptr_end .req x10 +w_xor .req w11 +/* v0 ~ v15: temporary results */ +/* v16 ~ v31: next 256 bytes */ + +/* + * +----------+ +------------------+ + * src --> | src[0] | - src0 -> | buffer | src0_end + * --------+----------+ +------------------+ + * . | ...... | + * . +----------+ +------------------+ + * src_ptr ~~> | src[n] | - srcn ~> | buffer | + * . +----------+ +------------------+ + * . | ...... | + * . +----------+ + * . | src[v-1] | + * --------+----------+ + * src_ptr_end --> + */ + +xor_check_neon: + add x_src_ptr_end, x_src, x_vects, lsl #3 + ldr x_src0, [x_src] + add x_src0_end, x_src0, x_len + + sub w_vects, w_vects, #1 + mov w_col, #0 + mov w_xor, #0 + +.Lloop256_init: + /* len256 = len - len%256; len %= 256 */ + mov w_len256, w_len + and w_len, w_len, #0xFF + sub w_len256, w_len256, w_len + + /* less than 256 byts? 
*/ + cbz w_len256, .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src0_end, x_src0_end, #256 + + /* batch process vects*256 bytes */ +.Lloop256: + ldr q0, [x_src0, #16*0] + ldr q1, [x_src0, #16*1] + ldr q2, [x_src0, #16*2] + ldr q3, [x_src0, #16*3] + ldr q4, [x_src0, #16*4] + ldr q5, [x_src0, #16*5] + ldr q6, [x_src0, #16*6] + ldr q7, [x_src0, #16*7] + ldr q8, [x_src0, #16*8] + ldr q9, [x_src0, #16*9] + ldr q10, [x_src0, #16*10] + ldr q11, [x_src0, #16*11] + ldr q12, [x_src0, #16*12] + ldr q13, [x_src0, #16*13] + ldr q14, [x_src0, #16*14] + ldr q15, [x_src0, #16*15] + add x_src0, x_src0, #256 + + cbz w_vects, .Lloop256_vects_end + + add x_src_ptr, x_src, #8 +.Lloop256_vects: + ldr x_srcn, [x_src_ptr], #8 + add x_srcn, x_srcn, x_col + cmp x_src_ptr, x_src_ptr_end + + ldr q16, [x_srcn, #16*0] + ldr q17, [x_srcn, #16*1] + ldr q18, [x_srcn, #16*2] + ldr q19, [x_srcn, #16*3] + ldr q20, [x_srcn, #16*4] + ldr q21, [x_srcn, #16*5] + ldr q22, [x_srcn, #16*6] + ldr q23, [x_srcn, #16*7] + ldr q24, [x_srcn, #16*8] + ldr q25, [x_srcn, #16*9] + ldr q26, [x_srcn, #16*10] + ldr q27, [x_srcn, #16*11] + ldr q28, [x_srcn, #16*12] + ldr q29, [x_srcn, #16*13] + ldr q30, [x_srcn, #16*14] + ldr q31, [x_srcn, #16*15] + + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + eor v4.16b, v4.16b, v20.16b + eor v5.16b, v5.16b, v21.16b + eor v6.16b, v6.16b, v22.16b + eor v7.16b, v7.16b, v23.16b + eor v8.16b, v8.16b, v24.16b + eor v9.16b, v9.16b, v25.16b + eor v10.16b, v10.16b, v26.16b + eor v11.16b, v11.16b, v27.16b + eor v12.16b, v12.16b, v28.16b + eor v13.16b, v13.16b, v29.16b + eor v14.16b, v14.16b, v30.16b + eor v15.16b, v15.16b, v31.16b + + bne .Lloop256_vects + +.Lloop256_vects_end: + orr v0.16b, v0.16b, v1.16b + orr v2.16b, v2.16b, v3.16b + orr v4.16b, v4.16b, v5.16b + orr v6.16b, v6.16b, v7.16b + orr v8.16b, v8.16b, v9.16b + orr v10.16b, v10.16b, v11.16b + orr v12.16b, v12.16b, v13.16b + orr v14.16b, v14.16b, v15.16b + orr v0.16b, v0.16b, v2.16b + orr v4.16b, v4.16b, v6.16b + orr v8.16b, v8.16b, v10.16b + orr v12.16b, v12.16b, v14.16b + orr v0.16b, v0.16b, v4.16b + orr v8.16b, v8.16b, v12.16b + orr v0.16b, v0.16b, v8.16b + umaxv b0, v0.16b + umov w_xor, v0.b[0] + cbnz w_xor, .Lloop256_end + + cmp x_src0, x_src0_end + add w_col, w_col, #256 + bls .Lloop256 + +.Lloop256_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + cbnz w_xor, .Lerror + + add x_src0_end, x_src0_end, #256 + +.Lloop16_init: + /* len16 = len - len%16; len %= 16 */ + mov w_len16, w_len + and w_len, w_len, #0xF + sub w_len16, w_len16, w_len + + /* less than 16 bytes? 
*/ + cbz w_len16, .Lloop1_init + + sub x_src0_end, x_src0_end, #16 + + /* batch process vects*16 bytes */ +.Lloop16: + ldr q0, [x_src0], #16 + cbz w_vects, .Lloop16_vects_end + + add x_src_ptr, x_src, #8 +.Lloop16_vects: + ldr x_srcn, [x_src_ptr], #8 + cmp x_src_ptr, x_src_ptr_end + ldr q1, [x_srcn, x_col] + eor v0.16b, v0.16b, v1.16b + bne .Lloop16_vects + +.Lloop16_vects_end: + umaxv b0, v0.16b + umov w_xor, v0.b[0] + cbnz w_xor, .Lerror + cmp x_src0, x_src0_end + add w_col, w_col, #16 + bls .Lloop16 + +.Lloop16_end: + add x_src0_end, x_src0_end, #16 + +.Lloop1_init: + cbnz w_len, .Lloop1 + mov w_ret, #0 + ret + + /* batch process vects*1 bytes */ +.Lloop1: + ldrb w_xor, [x_src0], #1 + cbz w_vects, .Lloop1_vects_end + + add x_src_ptr, x_src, #8 +.Lloop1_vects: + ldr x_srcn, [x_src_ptr], #8 + cmp x_src_ptr, x_src_ptr_end + ldrb w_in, [x_srcn, x_col] + eor w_xor, w_xor, w_in + bne .Lloop1_vects + +.Lloop1_vects_end: + cbnz w_xor, .Lerror + cmp x_src0, x_src0_end + add w_col, w_col, #1 + bne .Lloop1 + +.Lloop1_end: + mov w_ret, #0 + ret + +.Lerror: + mov w_ret, #1 + ret diff --git a/src/isa-l/raid/aarch64/xor_gen_neon.S b/src/isa-l/raid/aarch64/xor_gen_neon.S new file mode 100644 index 000000000..00f65a2ef --- /dev/null +++ b/src/isa-l/raid/aarch64/xor_gen_neon.S @@ -0,0 +1,264 @@ +######################################################################## +# Copyright(c) 2019 Arm Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Arm Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+######################################################################### + +.text + +.global xor_gen_neon +.type xor_gen_neon, %function + +/* int xor_gen_neon(int vects, int len, void **src) */ + +/* arguments */ +w_vects .req w0 /* MUST >= 2 */ +x_vects .req x0 +w_len .req w1 +x_len .req x1 +x_src .req x2 + +/* returns */ +w_ret .req w0 + +/* local variables */ +w_in .req w1 /* share w_len */ +x_src0 .req x3 +x_src0_end .req x4 +w_len256 .req w5 /* share w_len16, w_xor */ +x_len256 .req x5 +w_len16 .req w5 +x_len16 .req x5 +w_xor .req w5 +w_col .req w6 +x_col .req x6 +x_src_ptr .req x7 +x_srcn .req x9 +x_dst .req x10 +x_dst_ptr .req x11 +/* v0 ~ v15: temporary results */ +/* v16 ~ v31: next 256 bytes */ + +/* + * +----------+ +------------------+ + * src --> | src[0] | - src0 -> | buffer | src0_end + * --------+----------+ +------------------+ + * . | ...... | + * . +----------+ +------------------+ + * src_ptr ~~> | src[n] | - srcn ~> | buffer | + * . +----------+ +------------------+ + * . | ...... | + * . +----------+ + * . | src[v-2] | + * --------+----------+ +------------------+ + * dst_ptr --> | src[v-1] | -- dst --> | buffer | + * +----------+ +------------------+ + */ + +xor_gen_neon: + add x_dst_ptr, x_src, x_vects, lsl #3 + ldr x_dst, [x_dst_ptr, #-8]! + ldr x_src0, [x_src] + add x_src0_end, x_src0, x_len + + sub w_vects, w_vects, #2 + mov w_col, #0 + +.Loop256_init: + /* len256 = len - len%256; len %= 256 */ + mov w_len256, w_len + and w_len, w_len, #0xFF + sub w_len256, w_len256, w_len + + /* less than 256 byts? */ + cbz w_len256, .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src0_end, x_src0_end, #256 + + /* batch process (vects-1)*256 bytes */ +.Lloop256: + ldr q0, [x_src0, #16*0] + ldr q1, [x_src0, #16*1] + ldr q2, [x_src0, #16*2] + ldr q3, [x_src0, #16*3] + ldr q4, [x_src0, #16*4] + ldr q5, [x_src0, #16*5] + ldr q6, [x_src0, #16*6] + ldr q7, [x_src0, #16*7] + ldr q8, [x_src0, #16*8] + ldr q9, [x_src0, #16*9] + ldr q10, [x_src0, #16*10] + ldr q11, [x_src0, #16*11] + ldr q12, [x_src0, #16*12] + ldr q13, [x_src0, #16*13] + ldr q14, [x_src0, #16*14] + ldr q15, [x_src0, #16*15] + add x_src0, x_src0, #256 + + cbz w_vects, .Lloop256_vects_end + + add x_src_ptr, x_src, #8 +.Lloop256_vects: + ldr x_srcn, [x_src_ptr], #8 + add x_srcn, x_srcn, x_col + cmp x_src_ptr, x_dst_ptr + + ldr q16, [x_srcn, #16*0] + ldr q17, [x_srcn, #16*1] + ldr q18, [x_srcn, #16*2] + ldr q19, [x_srcn, #16*3] + ldr q20, [x_srcn, #16*4] + ldr q21, [x_srcn, #16*5] + ldr q22, [x_srcn, #16*6] + ldr q23, [x_srcn, #16*7] + ldr q24, [x_srcn, #16*8] + ldr q25, [x_srcn, #16*9] + ldr q26, [x_srcn, #16*10] + ldr q27, [x_srcn, #16*11] + ldr q28, [x_srcn, #16*12] + ldr q29, [x_srcn, #16*13] + ldr q30, [x_srcn, #16*14] + ldr q31, [x_srcn, #16*15] + + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + eor v4.16b, v4.16b, v20.16b + eor v5.16b, v5.16b, v21.16b + eor v6.16b, v6.16b, v22.16b + eor v7.16b, v7.16b, v23.16b + eor v8.16b, v8.16b, v24.16b + eor v9.16b, v9.16b, v25.16b + eor v10.16b, v10.16b, v26.16b + eor v11.16b, v11.16b, v27.16b + eor v12.16b, v12.16b, v28.16b + eor v13.16b, v13.16b, v29.16b + eor v14.16b, v14.16b, v30.16b + eor v15.16b, v15.16b, v31.16b + + bne .Lloop256_vects + +.Lloop256_vects_end: + str q0, [x_dst, #16*0] + str q1, [x_dst, #16*1] + str q2, [x_dst, #16*2] + str q3, [x_dst, #16*3] + str q4, [x_dst, 
#16*4] + str q5, [x_dst, #16*5] + str q6, [x_dst, #16*6] + str q7, [x_dst, #16*7] + str q8, [x_dst, #16*8] + str q9, [x_dst, #16*9] + str q10, [x_dst, #16*10] + str q11, [x_dst, #16*11] + str q12, [x_dst, #16*12] + str q13, [x_dst, #16*13] + str q14, [x_dst, #16*14] + str q15, [x_dst, #16*15] + + cmp x_src0, x_src0_end + add x_dst, x_dst, #256 + add w_col, w_col, #256 + bls .Lloop256 + +.Lloop256_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + add x_src0_end, x_src0_end, #256 + +.Lloop16_init: + /* len16 = len - len%16; len %= 16 */ + mov w_len16, w_len + and w_len, w_len, #0xF + sub w_len16, w_len16, w_len + + /* less than 16 bytes? */ + cbz w_len16, .Lloop1_init + + sub x_src0_end, x_src0_end, #16 + + /* batch process (vects-1)*16 bytes */ +.Lloop16: + ldr q0, [x_src0], #16 + cbz w_vects, .Lloop16_vects_end + + add x_src_ptr, x_src, #8 +.Lloop16_vects: + ldr x_srcn, [x_src_ptr], #8 + cmp x_src_ptr, x_dst_ptr + ldr q1, [x_srcn, x_col] + eor v0.16b, v0.16b, v1.16b + bne .Lloop16_vects + +.Lloop16_vects_end: + cmp x_src0, x_src0_end + str q0, [x_dst], #16 + add w_col, w_col, #16 + bls .Lloop16 + +.Loop16_end: + add x_src0_end, x_src0_end, #16 + +.Lloop1_init: + cbnz w_len, .Lloop1 + mov w_ret, #0 + ret + + /* batch process (vects-1)*1 bytes */ +.Lloop1: + ldrb w_xor, [x_src0], #1 + cbz w_vects, .Lloop1_vects_end + + add x_src_ptr, x_src, #8 +.Lloop1_vects: + ldr x_srcn, [x_src_ptr], #8 + cmp x_src_ptr, x_dst_ptr + ldrb w_in, [x_srcn, x_col] + eor w_xor, w_xor, w_in + bne .Lloop1_vects + +.Lloop1_vects_end: + cmp x_src0, x_src0_end + strb w_xor, [x_dst], #1 + add w_col, w_col, #1 + bne .Lloop1 + +.Loop1_end: + mov w_ret, #0 + ret diff --git a/src/isa-l/raid/pq_check_sse.asm b/src/isa-l/raid/pq_check_sse.asm new file mode 100644 index 000000000..f2bc8a6cd --- /dev/null +++ b/src/isa-l/raid/pq_check_sse.asm @@ -0,0 +1,277 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized pq of N source vectors using SSE3 +;;; int pq_check_sse(int vects, int len, void **array) + +;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers +;;; (**array). Last two pointers are the P and Q destinations respectively. +;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned. + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp3 arg4 + %define return rax + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define tmp r11 + %define tmp3 r10 + %define return rax + %define stack_size 7*16 + 8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm15, 6*16 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm15, [rsp + 9*16] + add rsp, stack_size + %endmacro +%endif + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define pos return + +%define xp1 xmm0 +%define xq1 xmm1 +%define xtmp1 xmm2 +%define xs1 xmm3 + +%define xp2 xmm4 +%define xq2 xmm5 +%define xtmp2 xmm6 +%define xs2 xmm7 + +%define xp3 xmm8 +%define xq3 xmm9 +%define xtmp3 xmm10 +%define xs3 xmm11 + +%define xpoly xmm15 + +;;; Use Non-temporal load/stor +%ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa +%else + %define XLDR movdqa + %define XSTR movntdq +%endif + +default rel + +[bits 64] +section .text + +align 16 +mk_global pq_check_sse, function +func(pq_check_sse) + FUNC_SAVE + sub vec, 3 ;Keep as offset to last source + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass + test len, (16-1) ;Check alignment of length + jnz return_fail + mov pos, 0 + movdqa xpoly, [poly] + cmp len, 48 + jl loop16 + +len_aligned_32bytes: + sub len, 48 ;Do end of vec first and run backward + +loop48: + mov ptr, [arg2+8+vec*8] ;Get address of P parity vector + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src + XLDR xp2, [ptr+pos+16] ;Initialize xp2 with P2 src + 16B ahead + XLDR xp3, [ptr+pos+32] ;Initialize xp3 with P2 src + 32B ahead + pxor xq1, xq1 ;q1 = 0 + pxor xq2, xq2 ;q2 = 0 + pxor xq3, xq3 ;q3 = 0 + + mov ptr, [arg2+vec*8] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + XLDR xs2, [ptr+pos+16] ;Preload last 
vector (source) + XLDR xs3, [ptr+pos+32] ;Preload last vector (source) + +next_vect: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*8] ; get pointer to next vect + pxor xp1, xs1 ; p1 ^= s1 + pxor xp2, xs2 ; p2 ^= s2 + pxor xp3, xs3 ; p3 ^= s2 + pxor xq1, xs1 ; q1 ^= s1 + pxor xq2, xs2 ; q2 ^= s2 + pxor xq3, xs3 ; q3 ^= s3 + pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0 + pxor xtmp2, xtmp2 ; xtmp2 = 0 + pxor xtmp3, xtmp3 ; xtmp3 = 0 + pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set + pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set + pcmpgtb xtmp3, xq3 ; xtmp3 = mask 0xff or 0x00 if bit7 set + pand xtmp1, xpoly ; xtmp1 = poly or 0x00 + pand xtmp2, xpoly ; xtmp2 = poly or 0x00 + pand xtmp3, xpoly ; xtmp3 = poly or 0x00 + XLDR xs1, [ptr+pos] ; Get next vector (source data1) + XLDR xs2, [ptr+pos+16] ; Get next vector (source data2) + XLDR xs3, [ptr+pos+32] ; Get next vector (source data3) + paddb xq1, xq1 ; q1 = q1<<1 + paddb xq2, xq2 ; q2 = q2<<1 + paddb xq3, xq3 ; q3 = q3<<1 + pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked + pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked + pxor xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked + jg next_vect ; Loop for each vect except 0 + + pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded + pxor xq1, xs1 ;q1 ^= 1 * s1[0] + pxor xp2, xs2 ;p2 ^= s2[0] + pxor xq2, xs2 ;q2 ^= 1 * s2[0] + pxor xp3, xs3 ;p3 ^= s3[0] + pxor xq3, xs3 ;q3 ^= 1 * s3[0] + + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + XLDR xtmp1, [tmp+pos] ;re-init xq1 with Q1 src + XLDR xtmp2, [tmp+pos+16] ;re-init xq2 with Q2 src + 16B ahead + XLDR xtmp3, [tmp+pos+32] ;re-init xq3 with Q2 src + 32B ahead + + pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved + pxor xq2, xtmp2 + pxor xq3, xtmp3 + + por xp1, xq1 ;Confirm that all P&Q parity are 0 + por xp1, xp2 + por xp1, xq2 + por xp1, xp3 + por xp1, xq3 + ptest xp1, xp1 + jnz return_fail + add pos, 48 + cmp pos, len + jle loop48 + + + ;; ------------------------------ + ;; Do last 16 or 32 Bytes remaining + add len, 48 + cmp pos, len + je return_pass + +loop16: + mov ptr, [arg2+8+vec*8] ;Get address of P parity vector + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src + pxor xq1, xq1 ;q = 0 + mov ptr, [arg2+vec*8] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + +next_vect16: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*8] ; get pointer to next vect + pxor xq1, xs1 ; q ^= s + pxor xtmp1, xtmp1 ; xtmp = 0 + pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set + pand xtmp1, xpoly ; xtmp = poly or 0x00 + pxor xp1, xs1 ; p ^= s + paddb xq1, xq1 ; q = q<<1 + pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked + XLDR xs1, [ptr+pos] ; Get next vector (source data) + jg next_vect16 ; Loop for each vect except 0 + + pxor xp1, xs1 ;p ^= s[0] - last source is already loaded + pxor xq1, xs1 ;q ^= 1 * s[0] + + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + XLDR xtmp1, [tmp+pos] ;re-init tmp with Q1 src + pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved + + por xp1, xq1 ;Confirm that all P&Q parity are = 0 + ptest xp1, xp1 + jnz return_fail + add pos, 16 + cmp pos, len + jl loop16 + + +return_pass: + mov return, 0 + FUNC_RESTORE + ret + +return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 16 +poly: +dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d + +;;; func core, ver, snum +slversion pq_check_sse, 00, 
06, 0033 diff --git a/src/isa-l/raid/pq_check_sse_i32.asm b/src/isa-l/raid/pq_check_sse_i32.asm new file mode 100644 index 000000000..3271c035a --- /dev/null +++ b/src/isa-l/raid/pq_check_sse_i32.asm @@ -0,0 +1,282 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized pq of N source vectors using SSE3 +;;; int pq_gen_sse(int vects, int len, void **array) + +;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers +;;; (**array). Last two pointers are the P and Q destinations respectively. +;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned. + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define return rax + %define PS 8 + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE + +%elifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define return rax + %define PS 8 + %define tmp r11 + %define stack_size 2*16 + 8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + end_prolog + %endmacro + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + add rsp, stack_size + %endmacro + + +%elifidn __OUTPUT_FORMAT__, elf32 + %define arg0 edx + %define arg1 ecx + %define return eax + %define PS 4 + %define func(x) x: endbranch + %define arg(x) [ebp+8+PS*x] + %define arg2 edi ; must sav/restore + %define arg3 esi + %define tmp ebx + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + push esi + push edi + push ebx + mov arg0, arg(0) + mov arg1, arg(1) + mov arg2, arg(2) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + mov esp, ebp ;if has frame pointer? 
+ pop ebp + %endmacro + +%endif ; output formats + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define pos return + +%define xp1 xmm0 +%define xq1 xmm1 +%define xtmp1 xmm2 +%define xs1 xmm3 + +%define xp2 xmm4 +%define xq2 xmm5 +%define xtmp2 xmm6 +%define xs2 xmm7 + +%ifidn PS,8 ; 64-bit code + default rel + [bits 64] + %define xpoly xmm15 +%elifidn PS,4 ; 32-bit code + %define xpoly [poly] +%endif + +;;; Use Non-temporal load/stor +%ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa +%else + %define XLDR movntdqa + %define XSTR movntdq +%endif + +section .text + +align 16 +mk_global pq_check_sse, function +func(pq_check_sse) + FUNC_SAVE + sub vec, 3 ;Keep as offset to last source + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass + test len, (16-1) ;Check alignment of length + jnz return_fail + mov pos, 0 +%ifidn PS,8 + movdqa xpoly, [poly] ;For 64-bit, load poly into high xmm reg +%endif + cmp len, 32 + jl loop16 + +len_aligned_32bytes: + sub len, 32 ;Do end of vec first and run backward + +loop32: + mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector + mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector + XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src + XLDR xp2, [ptr+pos+16] ;Initialize xp2 with P2 src + 16B ahead + pxor xq1, xq1 ;q1 = 0 + pxor xq2, xq2 ;q2 = 0 + + mov ptr, [arg2+vec*PS] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + XLDR xs2, [ptr+pos+16] ;Preload last vector (source) + +next_vect: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*PS] ; get pointer to next vect + pxor xp1, xs1 ; p1 ^= s1 + pxor xp2, xs2 ; p2 ^= s2 + pxor xq1, xs1 ; q1 ^= s1 + pxor xq2, xs2 ; q2 ^= s2 + pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0 + pxor xtmp2, xtmp2 ; xtmp2 = 0 + pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set + pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set + pand xtmp1, xpoly ; xtmp1 = poly or 0x00 + pand xtmp2, xpoly ; xtmp2 = poly or 0x00 + XLDR xs1, [ptr+pos] ; Get next vector (source data1) + XLDR xs2, [ptr+pos+16] ; Get next vector (source data2) + paddb xq1, xq1 ; q1 = q1<<1 + paddb xq2, xq2 ; q2 = q2<<1 + pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked + pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked + jg next_vect ; Loop for each vect except 0 + + pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded + pxor xq1, xs1 ;q1 ^= 1 * s1[0] + pxor xp2, xs2 ;p2 ^= s2[0] + pxor xq2, xs2 ;q2 ^= 1 * s2[0] + + mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector + XLDR xtmp1, [tmp+pos] ;re-init xq1 with Q1 src + XLDR xtmp2, [tmp+pos+16] ;re-init xq2 with Q2 src + 16B ahead + + pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved + pxor xq2, xtmp2 + + por xp1, xq1 ;Confirm that all P&Q parity are 0 + por xp1, xp2 + por xp1, xq2 + ptest xp1, xp1 + jnz return_fail + add pos, 32 + cmp pos, len + jle loop32 + + + ;; ------------------------------ + ;; Do last 16 Bytes remaining + add len, 32 + cmp pos, len + je return_pass + +loop16: + mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector + mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector + XLDR xp1, [ptr+pos] ;Initialize xp1 with P1 src + pxor xq1, xq1 ;q = 0 + mov ptr, [arg2+vec*PS] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + +next_vect16: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*PS] ; get pointer to next vect + 
pxor xq1, xs1 ; q ^= s + pxor xtmp1, xtmp1 ; xtmp = 0 + pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set + pand xtmp1, xpoly ; xtmp = poly or 0x00 + pxor xp1, xs1 ; p ^= s + paddb xq1, xq1 ; q = q<<1 + pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked + XLDR xs1, [ptr+pos] ; Get next vector (source data) + jg next_vect16 ; Loop for each vect except 0 + + pxor xp1, xs1 ;p ^= s[0] - last source is already loaded + pxor xq1, xs1 ;q ^= 1 * s[0] + + mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector + XLDR xtmp1, [tmp+pos] ;re-init tmp with Q1 src + pxor xq1, xtmp1 ;xq1 = q1 calculated ^ q1 saved + + por xp1, xq1 ;Confirm that all P&Q parity are = 0 + ptest xp1, xp1 + jnz return_fail + add pos, 16 + cmp pos, len + jl loop16 + + +return_pass: + mov return, 0 + FUNC_RESTORE + ret + + +return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 16 +poly: +dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d + +;;; func core, ver, snum +slversion pq_check_sse, 00, 06, 0033 diff --git a/src/isa-l/raid/pq_check_test.c b/src/isa-l/raid/pq_check_test.c new file mode 100644 index 000000000..27d0203d2 --- /dev/null +++ b/src/isa-l/raid/pq_check_test.c @@ -0,0 +1,304 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include<stdio.h> +#include<stdint.h> +#include<string.h> +#include<stdlib.h> +#include "raid.h" +#include "types.h" + +#define TEST_SOURCES 16 +#define TEST_LEN 1024 +#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN)) +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +int ref_multi_pq(int vects, int len, void **array) +{ + int i, j; + unsigned char p, q, s; + unsigned char **src = (unsigned char **)array; + + for (i = 0; i < len; i++) { + q = p = src[vects - 3][i]; + + for (j = vects - 4; j >= 0; j--) { + p ^= s = src[j][i]; + q = s ^ ((q << 1) ^ ((q & 0x80) ? 
0x1d : 0)); // mult by GF{2} + } + + src[vects - 2][i] = p; // second to last pointer is p + src[vects - 1][i] = q; // last pointer is q + } + return 0; +} + +// Generates pseudo-random data + +void rand_buffer(unsigned char *buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(int argc, char *argv[]) +{ + int i, j, k, ret, fail = 0; + void *buffs[TEST_SOURCES + 2]; + char c; + char *tmp_buf[TEST_SOURCES + 2]; + int serr, lerr; + + printf("Test pq_check_test %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN); + + srand(TEST_SEED); + + // Allocate the arrays + for (i = 0; i < TEST_SOURCES + 2; i++) { + void *buf; + if (posix_memalign(&buf, 16, TEST_LEN)) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + // Test of all zeros + for (i = 0; i < TEST_SOURCES + 2; i++) + memset(buffs[i], 0, TEST_LEN); + + ref_multi_pq(TEST_SOURCES + 2, TEST_LEN, buffs); + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret != 0) { + fail++; + printf("\nfail zero test %d\n", ret); + } + + ((char *)(buffs[0]))[TEST_LEN - 2] = 0x7; // corrupt buffer + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret == 0) { + fail++; + printf("\nfail corrupt buffer test %d\n", ret); + } + ((char *)(buffs[0]))[TEST_LEN - 2] = 0; // un-corrupt buffer + + // Test corrupted buffer any location on all sources + for (j = 0; j < TEST_SOURCES + 2; j++) { + for (i = TEST_LEN - 1; i >= 0; i--) { + ((char *)buffs[j])[i] = 0x5; // corrupt buffer + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret == 0) { + fail++; + printf("\nfail corrupt zero buffer test j=%d, i=%d\n", j, i); + return 1; + } + ((char *)buffs[j])[i] = 0; // un-corrupt buffer + } + putchar('.'); + } + + // Test rand1 + for (i = 0; i < TEST_SOURCES + 2; i++) + rand_buffer(buffs[i], TEST_LEN); + + ref_multi_pq(TEST_SOURCES + 2, TEST_LEN, buffs); + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret != 0) { + fail++; + printf("fail first rand test %d\n", ret); + } + + c = ((char *)(buffs[0]))[TEST_LEN - 2]; + ((char *)(buffs[0]))[TEST_LEN - 2] = c ^ 0x1; + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret == 0) { + fail++; + printf("\nFail corrupt buffer test, passed when should have failed\n"); + } + ((char *)(buffs[0]))[TEST_LEN - 2] = c; // un-corrupt buffer + + // Test corrupted buffer any location on all sources w/ random data + for (j = 0; j < TEST_SOURCES + 2; j++) { + for (i = TEST_LEN - 1; i >= 0; i--) { + // Check it still passes + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret != 0) { // should pass + fail++; + printf + ("\nFail rand test with un-corrupted buffer j=%d, i=%d\n", + j, i); + return 1; + } + c = ((char *)buffs[j])[i]; + ((char *)buffs[j])[i] = c ^ 1; // corrupt buffer + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret == 0) { // Check it now fails + fail++; + printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i); + return 1; + } + ((char *)buffs[j])[i] = c; // un-corrupt buffer + } + putchar('.'); + } + + // Test various number of sources, full length + for (j = 4; j <= TEST_SOURCES + 2; j++) { + // New random data + for (i = 0; i < j; i++) + rand_buffer(buffs[i], TEST_LEN); + + // Generate p,q parity for this number of sources + ref_multi_pq(j, TEST_LEN, buffs); + + // Set errors up in each source and len position + for (i = 0; i < j; i++) { + for (k = 0; k < TEST_LEN; k++) { + // See if it still passes + ret = pq_check(j, TEST_LEN, buffs); + if (ret != 0) { // Should pass + printf("\nfail rand fixed 
len test %d sources\n", j); + fail++; + return 1; + } + + c = ((char *)buffs[i])[k]; + ((char *)buffs[i])[k] = c ^ 1; // corrupt buffer + + ret = pq_check(j, TEST_LEN, buffs); + if (ret == 0) { // Should fail + printf + ("\nfail rand fixed len test corrupted buffer %d sources\n", + j); + fail++; + return 1; + } + ((char *)buffs[i])[k] = c; // un-corrupt buffer + } + } + putchar('.'); + } + + fflush(0); + + // Test various number of sources and len + k = 16; + while (k <= TEST_LEN) { + char *tmp; + for (j = 4; j <= TEST_SOURCES + 2; j++) { + for (i = 0; i < j; i++) + rand_buffer(buffs[i], k); + + // Generate p,q parity for this number of sources + ref_multi_pq(j, k, buffs); + + // Inject errors at various source and len positions + for (lerr = 0; lerr < k; lerr++) { + for (serr = 0; serr < j; serr++) { + // See if it still passes + ret = pq_check(j, k, buffs); + if (ret != 0) { // Should pass + printf + ("\nfail rand var src, len test %d sources, len=%d\n", + j, k); + fail++; + return 1; + } + + tmp = (char *)buffs[serr]; + c = tmp[lerr]; + ((char *)buffs[serr])[lerr] = c ^ 1; // corrupt buffer + + ret = pq_check(j, k, buffs); + if (ret == 0) { // Should fail + printf + ("\nfail rand var src, len test corrupted buffer " + "%d sources, len=%d, ret=%d\n", j, k, + ret); + fail++; + return 1; + } + ((char *)buffs[serr])[lerr] = c; // un-corrupt buffer + } + } + putchar('.'); + fflush(0); + } + k += 16; + } + + // Test at the end of buffer + for (i = 0; i < TEST_LEN; i += 16) { + for (j = 0; j < TEST_SOURCES + 2; j++) { + rand_buffer(buffs[j], TEST_LEN - i); + tmp_buf[j] = (char *)buffs[j] + i; + } + + pq_gen_base(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf); + + // Test good data + ret = pq_check(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf); + if (ret != 0) { + printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i); + fail++; + return 1; + } + // Test bad data + for (serr = 0; serr < TEST_SOURCES + 2; serr++) { + for (lerr = 0; lerr < (TEST_LEN - i); lerr++) { + c = tmp_buf[serr][lerr]; + tmp_buf[serr][lerr] = c ^ 1; + + ret = + pq_check(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf); + if (ret == 0) { + printf("fail end test corrupted buffer - " + "offset: %d, len: %d, ret: %d\n", i, + TEST_LEN - i, ret); + fail++; + return 1; + } + + tmp_buf[serr][lerr] = c; + } + } + + putchar('.'); + fflush(0); + } + + if (fail == 0) + printf("Pass\n"); + + return fail; + +} diff --git a/src/isa-l/raid/pq_gen_avx.asm b/src/isa-l/raid/pq_gen_avx.asm new file mode 100644 index 000000000..db4bcfb1c --- /dev/null +++ b/src/isa-l/raid/pq_gen_avx.asm @@ -0,0 +1,254 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized pq of N source vectors using AVX +;;; int pq_gen_avx(int vects, int len, void **array) + +;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers +;;; (**array). Last two pointers are the P and Q destinations respectively. +;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned. + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp3 arg4 + %define return rax + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define tmp r11 + %define tmp3 r10 + %define return rax + %define stack_size 8*16 + 8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqa [rsp + 0*16], xmm6 + vmovdqa [rsp + 1*16], xmm7 + vmovdqa [rsp + 2*16], xmm8 + vmovdqa [rsp + 3*16], xmm9 + vmovdqa [rsp + 4*16], xmm10 + vmovdqa [rsp + 5*16], xmm11 + vmovdqa [rsp + 6*16], xmm14 + vmovdqa [rsp + 7*16], xmm15 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp + 0*16] + vmovdqa xmm7, [rsp + 1*16] + vmovdqa xmm8, [rsp + 2*16] + vmovdqa xmm9, [rsp + 3*16] + vmovdqa xmm10, [rsp + 4*16] + vmovdqa xmm11, [rsp + 5*16] + vmovdqa xmm14, [rsp + 6*16] + vmovdqa xmm15, [rsp + 7*16] + add rsp, stack_size + %endmacro +%endif + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define pos rax + +%define xp1 xmm0 +%define xq1 xmm1 +%define xtmp1 xmm2 +%define xs1 xmm3 + +%define xp2 xmm4 +%define xq2 xmm5 +%define xtmp2 xmm6 +%define xs2 xmm7 + +%define xp3 xmm8 +%define xq3 xmm9 +%define xtmp3 xmm10 +%define xs3 xmm11 + +%define xzero xmm14 +%define xpoly xmm15 + +;;; Use Non-temporal load/stor +%ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa +%else + %define XLDR vmovntdqa + %define XSTR vmovntdq +%endif + +default rel + +[bits 64] +section .text + +align 16 +mk_global pq_gen_avx, function +func(pq_gen_avx) + FUNC_SAVE + sub vec, 3 ;Keep as offset to last source + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass + test len, (16-1) ;Check alignment of length + jnz return_fail + mov pos, 0 + vmovdqa xpoly, [poly] + vpxor xzero, xzero, xzero + cmp len, 48 + jl loop16 + +len_aligned_32bytes: + sub len, 48 ;Len points to last block + +loop48: + mov ptr, [arg2+vec*8] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + XLDR xs2, [ptr+pos+16] 
;Preload last vector (source) + XLDR xs3, [ptr+pos+32] ;Preload last vector (source) + vpxor xp1, xp1, xp1 ;p1 = 0 + vpxor xp2, xp2, xp2 ;p2 = 0 + vpxor xp3, xp3, xp3 ;p3 = 0 + vpxor xq1, xq1, xq1 ;q1 = 0 + vpxor xq2, xq2, xq2 ;q2 = 0 + vpxor xq3, xq3, xq3 ;q3 = 0 + +next_vect: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*8] ; get pointer to next vect + vpxor xq1, xq1, xs1 ; q1 ^= s1 + vpxor xq2, xq2, xs2 ; q2 ^= s2 + vpxor xq3, xq3, xs3 ; q3 ^= s3 + vpxor xp1, xp1, xs1 ; p1 ^= s1 + vpxor xp2, xp2, xs2 ; p2 ^= s2 + vpxor xp3, xp3, xs3 ; p3 ^= s2 + vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00 + vpblendvb xtmp2, xzero, xpoly, xq2 ; xtmp2 = poly or 0x00 + vpblendvb xtmp3, xzero, xpoly, xq3 ; xtmp3 = poly or 0x00 + XLDR xs1, [ptr+pos] ; Get next vector (source data1) + XLDR xs2, [ptr+pos+16] ; Get next vector (source data2) + XLDR xs3, [ptr+pos+32] ; Get next vector (source data3) + vpaddb xq1, xq1, xq1 ; q1 = q1<<1 + vpaddb xq2, xq2, xq2 ; q2 = q2<<1 + vpaddb xq3, xq3, xq3 ; q3 = q3<<1 + vpxor xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked + vpxor xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked + vpxor xq3, xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked + jg next_vect ; Loop for each vect except 0 + + mov ptr, [arg2+8+vec*8] ;Get address of P parity vector + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + vpxor xp1, xp1, xs1 ;p1 ^= s1[0] - last source is already loaded + vpxor xq1, xq1, xs1 ;q1 ^= 1 * s1[0] + vpxor xp2, xp2, xs2 ;p2 ^= s2[0] + vpxor xq2, xq2, xs2 ;q2 ^= 1 * s2[0] + vpxor xp3, xp3, xs3 ;p3 ^= s3[0] + vpxor xq3, xq3, xs3 ;q3 ^= 1 * s3[0] + XSTR [ptr+pos], xp1 ;Write parity P1 vector + XSTR [ptr+pos+16], xp2 ;Write parity P2 vector + XSTR [ptr+pos+32], xp3 ;Write parity P3 vector + XSTR [tmp+pos], xq1 ;Write parity Q1 vector + XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector + XSTR [tmp+pos+32], xq3 ;Write parity Q3 vector + add pos, 48 + cmp pos, len + jle loop48 + + ;; ------------------------------ + ;; Do last 16 or 32 Bytes remaining + add len, 48 + cmp pos, len + je return_pass + +loop16: + mov ptr, [arg2+vec*8] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + vpxor xp1, xp1, xp1 ;p = 0 + vpxor xq1, xq1, xq1 ;q = 0 + +next_vect16: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*8] ; get pointer to next vect + vpxor xq1, xq1, xs1 ; q1 ^= s1 + vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00 + vpxor xp1, xp1, xs1 ; p ^= s + vpaddb xq1, xq1, xq1 ; q = q<<1 + vpxor xq1, xq1, xtmp1 ; q = q<<1 ^ poly_masked + XLDR xs1, [ptr+pos] ; Get next vector (source data) + jg next_vect16 ; Loop for each vect except 0 + + mov ptr, [arg2+8+vec*8] ;Get address of P parity vector + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + vpxor xp1, xp1, xs1 ;p ^= s[0] - last source is already loaded + vpxor xq1, xq1, xs1 ;q ^= 1 * s[0] + XSTR [ptr+pos], xp1 ;Write parity P vector + XSTR [tmp+pos], xq1 ;Write parity Q vector + add pos, 16 + cmp pos, len + jl loop16 + + +return_pass: + mov return, 0 + FUNC_RESTORE + ret + +return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 16 +poly: +dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d + +;;; func core, ver, snum +slversion pq_gen_avx, 02, 0a, 0039 diff --git a/src/isa-l/raid/pq_gen_avx2.asm b/src/isa-l/raid/pq_gen_avx2.asm new file mode 100644 index 000000000..a0bf0cc40 --- /dev/null +++ b/src/isa-l/raid/pq_gen_avx2.asm @@ -0,0 +1,256 @@ 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized pq of N source vectors using AVX2 +;;; int pq_gen_avx2(int vects, int len, void **array) + +;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers +;;; (**array). Last two pointers are the P and Q destinations respectively. +;;; Vectors must be aligned to 32 bytes. Length must be 32 byte aligned.
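For reference, the calling pattern that this header comment (and the matching comments in the SSE, AVX and AVX-512 variants) describes looks roughly like the following from C. This is only an illustrative sketch modeled on pq_gen_test.c and pq_check_test.c later in this commit, using the pq_gen()/pq_check() dispatchers declared in raid.h; the source count, fill value and length are made-up example values, with the length kept a multiple of 32 as required by this AVX2 path.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "raid.h"                        /* pq_gen()/pq_check() dispatchers */

#define SOURCES 10                       /* number of data vectors (example) */
#define LEN     (16 * 1024)              /* multiple of 32 for the AVX2 path */

int main(void)
{
	void *vecs[SOURCES + 2];         /* sources first, then P, then Q */
	int i;

	for (i = 0; i < SOURCES + 2; i++)
		if (posix_memalign(&vecs[i], 32, LEN))   /* 32-byte aligned */
			return 1;

	for (i = 0; i < SOURCES; i++)
		memset(vecs[i], 0x5a, LEN);  /* stand-in for real data */

	pq_gen(SOURCES + 2, LEN, vecs);      /* fills the P and Q buffers */

	if (pq_check(SOURCES + 2, LEN, vecs) != 0)
		printf("P/Q parity mismatch\n");

	return 0;
}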
+ +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp3 arg4 + %define return rax + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define tmp r11 + %define tmp3 r10 + %define return rax + %define stack_size 8*32 + 8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + ;; Until a sav_ymm256 is defined + vmovdqu [rsp + 0*32], ymm6 + vmovdqu [rsp + 1*32], ymm7 + vmovdqu [rsp + 2*32], ymm8 + vmovdqu [rsp + 3*32], ymm9 + vmovdqu [rsp + 4*32], ymm10 + vmovdqu [rsp + 5*32], ymm11 + vmovdqu [rsp + 6*32], ymm14 + vmovdqu [rsp + 7*32], ymm15 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqu ymm6, [rsp + 0*32] + vmovdqu ymm7, [rsp + 1*32] + vmovdqu ymm8, [rsp + 2*32] + vmovdqu ymm9, [rsp + 3*32] + vmovdqu ymm10, [rsp + 4*32] + vmovdqu ymm11, [rsp + 5*32] + vmovdqu ymm14, [rsp + 6*32] + vmovdqu ymm15, [rsp + 7*32] + add rsp, stack_size + %endmacro +%endif + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define pos rax + +%define xp1 ymm0 +%define xq1 ymm1 +%define xtmp1 ymm2 +%define xs1 ymm3 + +%define xp2 ymm4 +%define xq2 ymm5 +%define xtmp2 ymm6 +%define xs2 ymm7 + +%define xp3 ymm8 +%define xq3 ymm9 +%define xtmp3 ymm10 +%define xs3 ymm11 + +%define xzero ymm14 +%define xpoly ymm15 + +;;; Use Non-temporal load/stor +%ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa +%else + %define XLDR vmovntdqa + %define XSTR vmovntdq +%endif + +default rel + +[bits 64] +section .text + +align 16 +mk_global pq_gen_avx2, function +func(pq_gen_avx2) + FUNC_SAVE + sub vec, 3 ;Keep as offset to last source + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass + test len, (32-1) ;Check alignment of length + jnz return_fail + mov pos, 0 + vmovdqa xpoly, [poly] + vpxor xzero, xzero, xzero + cmp len, 96 + jl loop32 + +len_aligned_32bytes: + sub len, 3*32 ;Len points to last block + +loop96: + mov ptr, [arg2+vec*8] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + XLDR xs2, [ptr+pos+32] ;Preload last vector (source) + XLDR xs3, [ptr+pos+64] ;Preload last vector (source) + vpxor xp1, xp1, xp1 ;p1 = 0 + vpxor xp2, xp2, xp2 ;p2 = 0 + vpxor xp3, xp3, xp3 ;p3 = 0 + vpxor xq1, xq1, xq1 ;q1 = 0 + vpxor xq2, xq2, xq2 ;q2 = 0 + vpxor xq3, xq3, xq3 ;q3 = 0 + +next_vect: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*8] ; get pointer to next vect + vpxor xq1, xq1, xs1 ; q1 ^= s1 + vpxor xq2, xq2, xs2 ; q2 ^= s2 + vpxor xq3, xq3, xs3 ; q3 ^= s3 + vpxor xp1, xp1, xs1 ; p1 ^= s1 + vpxor xp2, xp2, xs2 ; p2 ^= s2 + vpxor xp3, xp3, xs3 ; p3 ^= s2 + vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00 + vpblendvb xtmp2, xzero, xpoly, xq2 ; xtmp2 = poly or 0x00 + vpblendvb xtmp3, xzero, xpoly, xq3 ; xtmp3 = poly or 0x00 + XLDR xs1, [ptr+pos] ; Get next vector (source data1) + XLDR xs2, [ptr+pos+32] ; Get next vector (source data2) + XLDR xs3, [ptr+pos+64] ; Get next vector (source data3) + vpaddb xq1, xq1, xq1 ; q1 = q1<<1 + vpaddb xq2, xq2, xq2 ; q2 = q2<<1 + vpaddb xq3, xq3, xq3 ; q3 = q3<<1 + vpxor xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked + vpxor xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked + vpxor xq3, xq3, xtmp3 ; q3 = 
q3<<1 ^ poly_masked + jg next_vect ; Loop for each vect except 0 + + mov ptr, [arg2+8+vec*8] ;Get address of P parity vector + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + vpxor xp1, xp1, xs1 ;p1 ^= s1[0] - last source is already loaded + vpxor xq1, xq1, xs1 ;q1 ^= 1 * s1[0] + vpxor xp2, xp2, xs2 ;p2 ^= s2[0] + vpxor xq2, xq2, xs2 ;q2 ^= 1 * s2[0] + vpxor xp3, xp3, xs3 ;p3 ^= s3[0] + vpxor xq3, xq3, xs3 ;q3 ^= 1 * s3[0] + XSTR [ptr+pos], xp1 ;Write parity P1 vector + XSTR [ptr+pos+32], xp2 ;Write parity P2 vector + XSTR [ptr+pos+64], xp3 ;Write parity P3 vector + XSTR [tmp+pos], xq1 ;Write parity Q1 vector + XSTR [tmp+pos+32], xq2 ;Write parity Q2 vector + XSTR [tmp+pos+64], xq3 ;Write parity Q3 vector + add pos, 3*32 + cmp pos, len + jle loop96 + + ;; ------------------------------ + ;; Do last 16 or 32 Bytes remaining + add len, 3*32 + cmp pos, len + je return_pass + +loop32: + mov ptr, [arg2+vec*8] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + vpxor xp1, xp1, xp1 ;p = 0 + vpxor xq1, xq1, xq1 ;q = 0 + +next_vect32: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*8] ; get pointer to next vect + vpxor xq1, xq1, xs1 ; q1 ^= s1 + vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00 + vpxor xp1, xp1, xs1 ; p ^= s + vpaddb xq1, xq1, xq1 ; q = q<<1 + vpxor xq1, xq1, xtmp1 ; q = q<<1 ^ poly_masked + XLDR xs1, [ptr+pos] ; Get next vector (source data) + jg next_vect32 ; Loop for each vect except 0 + + mov ptr, [arg2+8+vec*8] ;Get address of P parity vector + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + vpxor xp1, xp1, xs1 ;p ^= s[0] - last source is already loaded + vpxor xq1, xq1, xs1 ;q ^= 1 * s[0] + XSTR [ptr+pos], xp1 ;Write parity P vector + XSTR [tmp+pos], xq1 ;Write parity Q vector + add pos, 32 + cmp pos, len + jl loop32 + + +return_pass: + mov return, 0 + FUNC_RESTORE + ret + +return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 32 +poly: +dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d +dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d + +;;; func core, ver, snum +slversion pq_gen_avx2, 04, 03, 0041 diff --git a/src/isa-l/raid/pq_gen_avx512.asm b/src/isa-l/raid/pq_gen_avx512.asm new file mode 100644 index 000000000..179ad5c28 --- /dev/null +++ b/src/isa-l/raid/pq_gen_avx512.asm @@ -0,0 +1,235 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized pq of N source vectors using AVX512 +;;; int pq_gen_avx512(int vects, int len, void **array) + +;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers +;;; (**array). Last two pointers are the P and Q destinations respectively. +;;; Vectors must be aligned to 64 bytes if NO_NT_LDST is not defined. +;;; Length must be 32 byte multiple. + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp3 arg4 + %define return rax + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define tmp r11 + %define tmp3 r10 + %define return rax + %define stack_size 4*16 + 8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm7 + vmovdqu [rsp + 2*16], xmm8 + vmovdqu [rsp + 3*16], xmm9 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqu xmm6, [rsp + 0*16] + vmovdqu xmm7, [rsp + 1*16] + vmovdqu xmm8, [rsp + 2*16] + vmovdqu xmm9, [rsp + 3*16] + add rsp, stack_size + %endmacro +%endif + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define pos rax + +%define xp1 zmm0 +%define xq1 zmm1 +%define xtmp1 zmm2 +%define xs1 zmm3 + +%define xp2 zmm4 +%define xq2 zmm5 +%define xtmp2 zmm6 +%define xs2 zmm7 + +%define xzero zmm8 +%define xpoly zmm9 + +%define xp1y ymm0 +%define xq1y ymm1 +%define xtmp1y ymm2 +%define xs1y ymm3 +%define xzeroy ymm8 +%define xpolyy ymm9 + +%define NO_NT_LDST +;;; Use Non-temporal load/stor +%ifdef NO_NT_LDST + %define XLDR vmovdqu8 ;u8 + %define XSTR vmovdqu8 +%else + %define XLDR vmovntdqa + %define XSTR vmovntdq +%endif + +default rel + +[bits 64] +section .text + +align 16 +mk_global pq_gen_avx512, function +func(pq_gen_avx512) + FUNC_SAVE + sub vec, 3 ;Keep as offset to last source + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass + test len, (32-1) ;Check alignment of length + jnz return_fail + mov pos, 0 + mov tmp, 0x1d + vpbroadcastb xpoly, tmp + vpxorq xzero, xzero, xzero + cmp len, 128 + jl loop32 + +len_aligned_32bytes: + sub len, 2*64 ;Len points to last block + +loop128: + mov ptr, [arg2+vec*8] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + XLDR xs2, [ptr+pos+64] ;Preload last vector (source) + vpxorq xp1, xp1, xp1 ;p1 = 0 + vpxorq xp2, xp2, xp2 ;p2 = 0 + vpxorq xq1, xq1, xq1 ;q1 = 0 + vpxorq xq2, xq2, xq2 ;q2 = 0 + +next_vect: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*8] ; get pointer to next vect + vpxorq xq1, xq1, xs1 ; q1 ^= s1 + vpxorq xq2, xq2, xs2 ; q2 ^= s2 
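; In this AVX-512 version the per-byte mask is built with opmask registers
; rather than an xmm/ymm mask: vpcmpb with immediate 1 (signed less-than) a
; few lines below sets a k-mask bit for every byte of q whose top bit is set,
; and vpblendmb then selects the polynomial byte 0x1d for those lanes and zero
; for the rest -- the EVEX equivalent of the pcmpgtb/pand pair in the SSE
; version and of vpblendvb in the AVX/AVX2 versions. The byte doubling and
; xor that follow are unchanged.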
+ vpxorq xp1, xp1, xs1 ; p1 ^= s1 + vpxorq xp2, xp2, xs2 ; p2 ^= s2 + vpcmpb k1, xq1, xzero, 1 + vpcmpb k2, xq2, xzero, 1 + vpblendmb xtmp1 {k1}, xzero, xpoly + vpblendmb xtmp2 {k2}, xzero, xpoly + XLDR xs1, [ptr+pos] ; Get next vector (source data1) + XLDR xs2, [ptr+pos+64] ; Get next vector (source data2) + vpaddb xq1, xq1, xq1 ; q1 = q1<<1 + vpaddb xq2, xq2, xq2 ; q2 = q2<<1 + vpxorq xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked + vpxorq xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked + jg next_vect ; Loop for each vect except 0 + + mov ptr, [arg2+8+vec*8] ;Get address of P parity vector + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + vpxorq xp1, xp1, xs1 ;p1 ^= s1[0] - last source is already loaded + vpxorq xq1, xq1, xs1 ;q1 ^= 1 * s1[0] + vpxorq xp2, xp2, xs2 ;p2 ^= s2[0] + vpxorq xq2, xq2, xs2 ;q2 ^= 1 * s2[0] + XSTR [ptr+pos], xp1 ;Write parity P1 vector + XSTR [ptr+pos+64], xp2 ;Write parity P2 vector + XSTR [tmp+pos], xq1 ;Write parity Q1 vector + XSTR [tmp+pos+64], xq2 ;Write parity Q2 vector + add pos, 2*64 + cmp pos, len + jle loop128 + + ;; ------------------------------ + ;; Do last 32 or 64 Bytes remaining + add len, 2*64 + cmp pos, len + je return_pass + +loop32: + mov ptr, [arg2+vec*8] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1y, [ptr+pos] ;Preload last vector (source) + vpxorq xp1y, xp1y, xp1y ;p = 0 + vpxorq xq1y, xq1y, xq1y ;q = 0 + +next_vect32: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*8] ; get pointer to next vect + vpxorq xq1y, xq1y, xs1y ; q1 ^= s1 + vpblendvb xtmp1y, xzeroy, xpolyy, xq1y ; xtmp1 = poly or 0x00 + vpxorq xp1y, xp1y, xs1y ; p ^= s + vpaddb xq1y, xq1y, xq1y ; q = q<<1 + vpxorq xq1y, xq1y, xtmp1y ; q = q<<1 ^ poly_masked + XLDR xs1y, [ptr+pos] ; Get next vector (source data) + jg next_vect32 ; Loop for each vect except 0 + + mov ptr, [arg2+8+vec*8] ;Get address of P parity vector + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + vpxorq xp1y, xp1y, xs1y ;p ^= s[0] - last source is already loaded + vpxorq xq1y, xq1y, xs1y ;q ^= 1 * s[0] + XSTR [ptr+pos], xp1y ;Write parity P vector + XSTR [tmp+pos], xq1y ;Write parity Q vector + add pos, 32 + cmp pos, len + jl loop32 + + +return_pass: + mov return, 0 + FUNC_RESTORE + ret + +return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +%endif ; ifdef HAVE_AS_KNOWS_AVX512 diff --git a/src/isa-l/raid/pq_gen_perf.c b/src/isa-l/raid/pq_gen_perf.c new file mode 100644 index 000000000..7315c82b3 --- /dev/null +++ b/src/isa-l/raid/pq_gen_perf.c @@ -0,0 +1,88 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include<stdio.h> +#include<stdint.h> +#include<string.h> +#include<stdlib.h> +#include<sys/time.h> +#include "raid.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_SOURCES 10 +# define TEST_LEN 8*1024 +# define TEST_TYPE_STR "_warm" +#else +# ifndef TEST_CUSTOM +// Uncached test. Pull from large mem base. +# define TEST_SOURCES 10 +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1)) +# define TEST_TYPE_STR "_cold" +# else +# define TEST_TYPE_STR "_cus" +# endif +#endif + +#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN)) + +int main(int argc, char *argv[]) +{ + int i; + void *buffs[TEST_SOURCES + 2]; + struct perf start; + + printf("Test pq_gen_perf %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN); + + // Allocate the arrays + for (i = 0; i < TEST_SOURCES + 2; i++) { + int ret; + void *buf; + ret = posix_memalign(&buf, 64, TEST_LEN); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + // Setup data + for (i = 0; i < TEST_SOURCES + 2; i++) + memset(buffs[i], 0, TEST_LEN); + + // Warm up + BENCHMARK(&start, BENCHMARK_TIME, pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs)); + printf("pq_gen" TEST_TYPE_STR ": "); + perf_print(start, (long long)TEST_MEM); + + return 0; +} diff --git a/src/isa-l/raid/pq_gen_sse.asm b/src/isa-l/raid/pq_gen_sse.asm new file mode 100644 index 000000000..b6d51481b --- /dev/null +++ b/src/isa-l/raid/pq_gen_sse.asm @@ -0,0 +1,258 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized pq of N source vectors using SSE3 +;;; int pq_gen_sse(int vects, int len, void **array) + +;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers +;;; (**array). Last two pointers are the P and Q destinations respectively. +;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned. + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp3 arg4 + %define return rax + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define tmp r11 + %define tmp3 r10 + %define return rax + %define stack_size 7*16 + 8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm15, 6*16 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm15, [rsp + 6*16] + add rsp, stack_size + %endmacro +%endif + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define pos rax + +%define xp1 xmm0 +%define xq1 xmm1 +%define xtmp1 xmm2 +%define xs1 xmm3 + +%define xp2 xmm4 +%define xq2 xmm5 +%define xtmp2 xmm6 +%define xs2 xmm7 + +%define xp3 xmm8 +%define xq3 xmm9 +%define xtmp3 xmm10 +%define xs3 xmm11 + +%define xpoly xmm15 + +;;; Use Non-temporal load/stor +%ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa +%else + %define XLDR movntdqa + %define XSTR movntdq +%endif + +default rel + +[bits 64] +section .text + +align 16 +mk_global pq_gen_sse, function +func(pq_gen_sse) + FUNC_SAVE + sub vec, 3 ;Keep as offset to last source + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass + test len, (16-1) ;Check alignment of length + jnz return_fail + mov pos, 0 + movdqa xpoly, [poly] + cmp len, 48 + jl loop16 + +len_aligned_32bytes: + sub len, 48 ;Len points to last block + +loop48: + mov ptr, [arg2+vec*8] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + XLDR xs2, [ptr+pos+16] ;Preload last vector (source) + XLDR xs3, [ptr+pos+32] ;Preload last vector (source) + pxor xp1, xp1 ;p1 = 0 + pxor xp2, xp2 ;p2 = 0 + pxor xp3, xp3 ;p3 = 0 + pxor xq1, xq1 ;q1 = 0 + pxor xq2, xq2 ;q2 = 0 + pxor xq3, xq3 ;q3 = 0 + +next_vect: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*8] ; get pointer to next vect + pxor xq1, xs1 ; q1 ^= s1 + pxor xq2, xs2 ; q2 ^= s2 + pxor xq3, xs3 
; q3 ^= s3 + pxor xp1, xs1 ; p1 ^= s1 + pxor xp2, xs2 ; p2 ^= s2 + pxor xp3, xs3 ; p3 ^= s2 + pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0 + pxor xtmp2, xtmp2 ; xtmp2 = 0 + pxor xtmp3, xtmp3 ; xtmp3 = 0 + pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set + pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set + pcmpgtb xtmp3, xq3 ; xtmp3 = mask 0xff or 0x00 if bit7 set + pand xtmp1, xpoly ; xtmp1 = poly or 0x00 + pand xtmp2, xpoly ; xtmp2 = poly or 0x00 + pand xtmp3, xpoly ; xtmp3 = poly or 0x00 + XLDR xs1, [ptr+pos] ; Get next vector (source data1) + XLDR xs2, [ptr+pos+16] ; Get next vector (source data2) + XLDR xs3, [ptr+pos+32] ; Get next vector (source data3) + paddb xq1, xq1 ; q1 = q1<<1 + paddb xq2, xq2 ; q2 = q2<<1 + paddb xq3, xq3 ; q3 = q3<<1 + pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked + pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked + pxor xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked + jg next_vect ; Loop for each vect except 0 + + mov ptr, [arg2+8+vec*8] ;Get address of P parity vector + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded + pxor xq1, xs1 ;q1 ^= 1 * s1[0] + pxor xp2, xs2 ;p2 ^= s2[0] + pxor xq2, xs2 ;q2 ^= 1 * s2[0] + pxor xp3, xs3 ;p3 ^= s3[0] + pxor xq3, xs3 ;q3 ^= 1 * s3[0] + XSTR [ptr+pos], xp1 ;Write parity P1 vector + XSTR [ptr+pos+16], xp2 ;Write parity P2 vector + XSTR [ptr+pos+32], xp3 ;Write parity P3 vector + XSTR [tmp+pos], xq1 ;Write parity Q1 vector + XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector + XSTR [tmp+pos+32], xq3 ;Write parity Q3 vector + add pos, 48 + cmp pos, len + jle loop48 + + ;; ------------------------------ + ;; Do last 16 or 32 Bytes remaining + add len, 48 + cmp pos, len + je return_pass + +loop16: + mov ptr, [arg2+vec*8] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + pxor xp1, xp1 ;p = 0 + pxor xq1, xq1 ;q = 0 + +next_vect16: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*8] ; get pointer to next vect + pxor xq1, xs1 ; q1 ^= s1 + pxor xtmp1, xtmp1 ; xtmp = 0 + pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set + pand xtmp1, xpoly ; xtmp = poly or 0x00 + pxor xp1, xs1 ; p ^= s + paddb xq1, xq1 ; q = q<<1 + pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked + XLDR xs1, [ptr+pos] ; Get next vector (source data) + jg next_vect16 ; Loop for each vect except 0 + + mov ptr, [arg2+8+vec*8] ;Get address of P parity vector + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + pxor xp1, xs1 ;p ^= s[0] - last source is already loaded + pxor xq1, xs1 ;q ^= 1 * s[0] + XSTR [ptr+pos], xp1 ;Write parity P vector + XSTR [tmp+pos], xq1 ;Write parity Q vector + add pos, 16 + cmp pos, len + jl loop16 + + +return_pass: + mov return, 0 + FUNC_RESTORE + ret + +return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 16 +poly: +dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d + +;;; func core, ver, snum +slversion pq_gen_sse, 00, 09, 0032 diff --git a/src/isa-l/raid/pq_gen_sse_i32.asm b/src/isa-l/raid/pq_gen_sse_i32.asm new file mode 100644 index 000000000..8dabb783f --- /dev/null +++ b/src/isa-l/raid/pq_gen_sse_i32.asm @@ -0,0 +1,264 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized pq of N source vectors using SSE3 +;;; int pq_gen_sse(int vects, int len, void **array) + +;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers +;;; (**array). Last two pointers are the P and Q destinations respectively. +;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned. + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define return rax + %define PS 8 + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE + +%elifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define return rax + %define PS 8 + %define tmp r10 + %define stack_size 2*16 + 8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + end_prolog + %endmacro + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + add rsp, stack_size + %endmacro + +%elifidn __OUTPUT_FORMAT__, elf32 + %define arg0 edx + %define arg1 ecx + %define return eax + %define PS 4 + %define func(x) x: endbranch + %define arg(x) [ebp+8+PS*x] + %define arg2 edi ; must sav/restore + %define arg3 esi + %define tmp ebx + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + push esi + push edi + push ebx + mov arg0, arg(0) + mov arg1, arg(1) + mov arg2, arg(2) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + mov esp, ebp ;if has frame pointer? 
+ pop ebp + %endmacro + +%endif ; output formats + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define pos return + +%define xp1 xmm0 +%define xq1 xmm1 +%define xtmp1 xmm2 +%define xs1 xmm3 + +%define xp2 xmm4 +%define xq2 xmm5 +%define xtmp2 xmm6 +%define xs2 xmm7 + +%ifidn PS,8 ; 64-bit code + default rel + [bits 64] + %define xpoly xmm15 +%elifidn PS,4 ; 32-bit code + %define xpoly [poly] +%endif + +;;; Use Non-temporal load/stor +%ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa +%else + %define XLDR movntdqa + %define XSTR movntdq +%endif + +section .text + +align 16 +mk_global pq_gen_sse, function +func(pq_gen_sse) + FUNC_SAVE + sub vec, 3 ;Keep as offset to last source + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass + test len, (16-1) ;Check alignment of length + jnz return_fail + mov pos, 0 +%ifidn PS,8 + movdqa xpoly, [poly] ;For 64-bit, load poly into high xmm reg +%endif + cmp len, 32 + jl loop16 + +len_aligned_32bytes: + sub len, 32 ;Do end of vec first and run backward + +loop32: + mov ptr, [arg2+vec*PS] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + XLDR xs2, [ptr+pos+16] ;Preload last vector (source) + pxor xp1, xp1 ;p1 = 0 + pxor xq1, xq1 ;q1 = 0 + pxor xp2, xp2 ;p2 = 0 + pxor xq2, xq2 ;q2 = 0 + +next_vect: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*PS] ; get pointer to next vect + pxor xq1, xs1 ; q1 ^= s1 + pxor xq2, xs2 ; q2 ^= s2 + pxor xp1, xs1 ; p1 ^= s1 + pxor xp2, xs2 ; p2 ^= s2 + pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0 + pxor xtmp2, xtmp2 ; xtmp2 = 0 + pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set + pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set + pand xtmp1, xpoly ; xtmp1 = poly or 0x00 + pand xtmp2, xpoly ; xtmp2 = poly or 0x00 + XLDR xs1, [ptr+pos] ; Get next vector (source data1) + XLDR xs2, [ptr+pos+16] ; Get next vector (source data2) + paddb xq1, xq1 ; q1 = q1<<1 + paddb xq2, xq2 ; q2 = q2<<1 + pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked + pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked + jg next_vect ; Loop for each vect except 0 + + mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector + mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector + pxor xp1, xs1 ;p1 ^= s1[0] - last source is already loaded + pxor xq1, xs1 ;q1 ^= 1 * s1[0] + pxor xp2, xs2 ;p2 ^= s2[0] + pxor xq2, xs2 ;q2 ^= 1 * s2[0] + XSTR [ptr+pos], xp1 ;Write parity P1 vector + XSTR [ptr+pos+16], xp2 ;Write parity P2 vector + XSTR [tmp+pos], xq1 ;Write parity Q1 vector + XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector + add pos, 32 + cmp pos, len + jle loop32 + + ;; ------------------------------ + ;; Do last 16 Bytes remaining + add len, 32 + cmp pos, len + je return_pass + +loop16: + mov ptr, [arg2+vec*PS] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + pxor xp1, xp1 ;p = 0 + pxor xq1, xq1 ;q = 0 + +next_vect16: + sub tmp, 1 ;Inner loop for each source vector + mov ptr, [arg2+tmp*PS] ; get pointer to next vect + pxor xq1, xs1 ; q1 ^= s1 + pxor xtmp1, xtmp1 ; xtmp = 0 + pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set + pand xtmp1, xpoly ; xtmp = poly or 0x00 + pxor xp1, xs1 ; p ^= s + paddb xq1, xq1 ; q = q<<1 + pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked + XLDR xs1, [ptr+pos] ; Get next vector (source data) + jg next_vect16 ; Loop for each vect except 0 + + mov ptr, [arg2+PS+vec*PS] ;Get address of P 
parity vector + mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector + pxor xp1, xs1 ;p ^= s[0] - last source is already loaded + pxor xq1, xs1 ;q ^= 1 * s[0] + XSTR [ptr+pos], xp1 ;Write parity P vector + XSTR [tmp+pos], xq1 ;Write parity Q vector + add pos, 16 + cmp pos, len + jl loop16 + + +return_pass: + mov return, 0 + FUNC_RESTORE + ret + + +return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 16 +poly: +dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d + +;;; func core, ver, snum +slversion pq_gen_sse, 00, 08, 0032 diff --git a/src/isa-l/raid/pq_gen_test.c b/src/isa-l/raid/pq_gen_test.c new file mode 100644 index 000000000..3469f7e50 --- /dev/null +++ b/src/isa-l/raid/pq_gen_test.c @@ -0,0 +1,194 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include<stdio.h> +#include<stdint.h> +#include<string.h> +#include<stdlib.h> +#include<limits.h> +#include "raid.h" +#include "types.h" + +#define TEST_SOURCES 16 +#define TEST_LEN 1024 +#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN)) +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +// Generates pseudo-random data + +void rand_buffer(unsigned char *buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int dump(unsigned char *buf, int len) +{ + int i; + for (i = 0; i < len;) { + printf(" %2x", buf[i++]); + if (i % 16 == 0) + printf("\n"); + } + printf("\n"); + return 0; +} + +int main(int argc, char *argv[]) +{ + int i, j, k, ret, fail = 0; + void *buffs[TEST_SOURCES + 2]; // Pointers to src and dest + char *tmp_buf[TEST_SOURCES + 2]; + + printf("Test pq_gen_test "); + + srand(TEST_SEED); + + // Allocate the arrays + for (i = 0; i < TEST_SOURCES + 2; i++) { + void *buf; + ret = posix_memalign(&buf, 32, TEST_LEN); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + // Test of all zeros + for (i = 0; i < TEST_SOURCES + 2; i++) + memset(buffs[i], 0, TEST_LEN); + + pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs); + + for (i = 0; i < TEST_LEN; i++) { + if (((char *)buffs[TEST_SOURCES])[i] != 0) + fail++; + } + + for (i = 0; i < TEST_LEN; i++) { + if (((char *)buffs[TEST_SOURCES + 1])[i] != 0) + fail++; + } + + if (fail > 0) { + printf("fail zero test %d\n", fail); + return 1; + } else + putchar('.'); + + // Test rand1 + for (i = 0; i < TEST_SOURCES + 2; i++) + rand_buffer(buffs[i], TEST_LEN); + + ret = pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs); + fail |= pq_check_base(TEST_SOURCES + 2, TEST_LEN, buffs); + + if (fail > 0) { + int t; + printf(" Fail rand test1 fail=%d, ret=%d\n", fail, ret); + for (t = 0; t < TEST_SOURCES + 2; t++) + dump(buffs[t], 15); + + printf(" reference function p,q\n"); + pq_gen_base(TEST_SOURCES + 2, TEST_LEN, buffs); + for (t = TEST_SOURCES; t < TEST_SOURCES + 2; t++) + dump(buffs[t], 15); + + return 1; + } else + putchar('.'); + + // Test various number of sources + for (j = 4; j <= TEST_SOURCES + 2; j++) { + for (i = 0; i < j; i++) + rand_buffer(buffs[i], TEST_LEN); + + pq_gen(j, TEST_LEN, buffs); + fail |= pq_check_base(j, TEST_LEN, buffs); + + if (fail > 0) { + printf("fail rand test %d sources\n", j); + return 1; + } else + putchar('.'); + } + + fflush(0); + + // Test various number of sources and len + k = 0; + while (k <= TEST_LEN) { + for (j = 4; j <= TEST_SOURCES + 2; j++) { + for (i = 0; i < j; i++) + rand_buffer(buffs[i], k); + + ret = pq_gen(j, k, buffs); + fail |= pq_check_base(j, k, buffs); + + if (fail > 0) { + printf("fail rand test %d sources, len=%d, fail=" + "%d, ret=%d\n", j, k, fail, ret); + return 1; + } + } + putchar('.'); + k += 32; + } + + // Test at the end of buffer + k = 0; + while (k <= TEST_LEN) { + for (j = 0; j < (TEST_SOURCES + 2); j++) { + rand_buffer(buffs[j], TEST_LEN - k); + tmp_buf[j] = (char *)buffs[j] + k; + } + + ret = pq_gen(TEST_SOURCES + 2, TEST_LEN - k, (void *)tmp_buf); + fail |= pq_check_base(TEST_SOURCES + 2, TEST_LEN - k, (void *)tmp_buf); + + if (fail > 0) { + printf("fail end test - offset: %d, len: %d, fail: %d, " + "ret: %d\n", k, TEST_LEN - k, fail, ret); + return 1; + } + + putchar('.'); + fflush(0); + k += 32; + } + + if (!fail) + printf(" done: Pass\n"); + + return fail; +} diff --git a/src/isa-l/raid/raid_base.c b/src/isa-l/raid/raid_base.c new file mode 100644 
index 000000000..e066eb851 --- /dev/null +++ b/src/isa-l/raid/raid_base.c @@ -0,0 +1,147 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <limits.h> +#include <stdint.h> + +#if __WORDSIZE == 64 || _WIN64 || __x86_64__ +# define notbit0 0xfefefefefefefefeULL +# define bit7 0x8080808080808080ULL +# define gf8poly 0x1d1d1d1d1d1d1d1dULL +#else +# define notbit0 0xfefefefeUL +# define bit7 0x80808080UL +# define gf8poly 0x1d1d1d1dUL +#endif + +int pq_gen_base(int vects, int len, void **array) +{ + int i, j; + unsigned long p, q, s; + unsigned long **src = (unsigned long **)array; + int blocks = len / sizeof(long); + + for (i = 0; i < blocks; i++) { + q = p = src[vects - 3][i]; + + for (j = vects - 4; j >= 0; j--) { + p ^= s = src[j][i]; + q = s ^ (((q << 1) & notbit0) ^ // shift each byte + ((((q & bit7) << 1) - ((q & bit7) >> 7)) // mask out bytes + & gf8poly)); // apply poly + } + + src[vects - 2][i] = p; // second to last pointer is p + src[vects - 1][i] = q; // last pointer is q + } + return 0; +} + +int pq_check_base(int vects, int len, void **array) +{ + int i, j; + unsigned char p, q, s; + unsigned char **src = (unsigned char **)array; + + for (i = 0; i < len; i++) { + q = p = src[vects - 3][i]; + + for (j = vects - 4; j >= 0; j--) { + s = src[j][i]; + p ^= s; + + // mult by GF{2} + q = s ^ ((q << 1) ^ ((q & 0x80) ? 
0x1d : 0)); + } + + if (src[vects - 2][i] != p) // second to last pointer is p + return i | 1; + if (src[vects - 1][i] != q) // last pointer is q + return i | 2; + } + return 0; +} + +int xor_gen_base(int vects, int len, void **array) +{ + int i, j; + unsigned char parity; + unsigned char **src = (unsigned char **)array; + + for (i = 0; i < len; i++) { + parity = src[0][i]; + for (j = 1; j < vects - 1; j++) + parity ^= src[j][i]; + + src[vects - 1][i] = parity; // last pointer is dest + + } + + return 0; +} + +int xor_check_base(int vects, int len, void **array) +{ + int i, j, fail = 0; + + unsigned char parity; + unsigned char **src = (unsigned char **)array; + + for (i = 0; i < len; i++) { + parity = 0; + for (j = 0; j < vects; j++) + parity ^= src[j][i]; + + if (parity != 0) { + fail = 1; + break; + } + } + if (fail && len > 0) + return len; + return fail; +} + +struct slver { + unsigned short snum; + unsigned char ver; + unsigned char core; +}; + +struct slver pq_gen_base_slver_0001012a; +struct slver pq_gen_base_slver = { 0x012a, 0x01, 0x00 }; + +struct slver xor_gen_base_slver_0001012b; +struct slver xor_gen_base_slver = { 0x012b, 0x01, 0x00 }; + +struct slver pq_check_base_slver_0001012c; +struct slver pq_check_base_slver = { 0x012c, 0x01, 0x00 }; + +struct slver xor_check_base_slver_0001012d; +struct slver xor_check_base_slver = { 0x012d, 0x01, 0x00 }; diff --git a/src/isa-l/raid/raid_base_aliases.c b/src/isa-l/raid/raid_base_aliases.c new file mode 100644 index 000000000..f81792a00 --- /dev/null +++ b/src/isa-l/raid/raid_base_aliases.c @@ -0,0 +1,50 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "raid.h" + +int pq_gen(int vects, int len, void **array) +{ + return pq_gen_base(vects, len, array); +} + +int pq_check(int vects, int len, void **array) +{ + return pq_check_base(vects, len, array); +} + +int xor_gen(int vects, int len, void **array) +{ + return xor_gen_base(vects, len, array); +} + +int xor_check(int vects, int len, void **array) +{ + return xor_check_base(vects, len, array); +} diff --git a/src/isa-l/raid/raid_multibinary.asm b/src/isa-l/raid/raid_multibinary.asm new file mode 100644 index 000000000..47ef1e369 --- /dev/null +++ b/src/isa-l/raid/raid_multibinary.asm @@ -0,0 +1,143 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" +%include "multibinary.asm" + +default rel +[bits 64] + +extern pq_gen_base +extern pq_gen_sse +extern pq_gen_avx +extern pq_gen_avx2 + +extern xor_gen_base +extern xor_gen_sse +extern xor_gen_avx + +extern pq_check_base +extern pq_check_sse + +extern xor_check_base +extern xor_check_sse + +%ifdef HAVE_AS_KNOWS_AVX512 + extern xor_gen_avx512 + extern pq_gen_avx512 +%endif + +mbin_interface xor_gen +mbin_interface pq_gen + + +mbin_dispatch_init6 xor_gen, xor_gen_base, xor_gen_sse, xor_gen_avx, xor_gen_avx, xor_gen_avx512 +mbin_dispatch_init6 pq_gen, pq_gen_base, pq_gen_sse, pq_gen_avx, pq_gen_avx2, pq_gen_avx512 + +section .data + +xor_check_dispatched: + dq xor_check_mbinit +pq_check_dispatched: + dq pq_check_mbinit + +section .text + +;;;; +; pq_check multibinary function +;;;; +mk_global pq_check, function +pq_check_mbinit: + endbranch + call pq_check_dispatch_init +pq_check: + endbranch + jmp qword [pq_check_dispatched] + +pq_check_dispatch_init: + push rax + push rbx + push rcx + push rdx + push rsi + lea rsi, [pq_check_base WRT_OPT] ; Default + + mov eax, 1 + cpuid + test ecx, FLAG_CPUID1_ECX_SSE4_1 + lea rbx, [pq_check_sse WRT_OPT] + cmovne rsi, rbx + + mov [pq_check_dispatched], rsi + pop rsi + pop rdx + pop rcx + pop rbx + pop rax + ret + + +;;;; +; xor_check multibinary function +;;;; +mk_global xor_check, function +xor_check_mbinit: + endbranch + call xor_check_dispatch_init +xor_check: + endbranch + jmp qword [xor_check_dispatched] + +xor_check_dispatch_init: + push rax + push rbx + push rcx + push rdx + push rsi + lea rsi, [xor_check_base WRT_OPT] ; Default + + mov eax, 1 + cpuid + test ecx, FLAG_CPUID1_ECX_SSE4_1 + lea rbx, [xor_check_sse WRT_OPT] + cmovne rsi, rbx + + mov [xor_check_dispatched], rsi + pop rsi + pop rdx + pop rcx + pop rbx + pop rax + ret + +;;; func core, ver, snum +slversion xor_gen, 00, 03, 0126 +slversion xor_check, 00, 03, 0127 +slversion pq_gen, 00, 03, 0128 +slversion pq_check, 00, 03, 0129 diff --git a/src/isa-l/raid/raid_multibinary_i32.asm b/src/isa-l/raid/raid_multibinary_i32.asm new file mode 100644 index 000000000..eee7fd5a1 --- /dev/null +++ b/src/isa-l/raid/raid_multibinary_i32.asm @@ -0,0 +1,52 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" +%include "multibinary.asm" + +[bits 32] + +extern xor_gen_base +extern xor_gen_sse +extern pq_gen_base +extern pq_gen_sse +extern xor_check_base +extern xor_check_sse +extern pq_check_base +extern pq_check_sse + +mbin_interface xor_gen +mbin_interface pq_gen +mbin_interface xor_check +mbin_interface pq_check + +mbin_dispatch_init5 xor_gen, xor_gen_base, xor_gen_sse, xor_gen_sse, xor_gen_sse +mbin_dispatch_init5 pq_gen, pq_gen_base, pq_gen_sse, pq_gen_sse, pq_gen_sse +mbin_dispatch_init5 xor_check, xor_check_base, xor_check_sse, xor_check_sse, xor_check_sse +mbin_dispatch_init5 pq_check, pq_check_base, pq_check_sse, pq_check_sse, pq_check_sse diff --git a/src/isa-l/raid/xor_check_sse.asm b/src/isa-l/raid/xor_check_sse.asm new file mode 100644 index 000000000..a5fe0b2e0 --- /dev/null +++ b/src/isa-l/raid/xor_check_sse.asm @@ -0,0 +1,285 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized xor of N source vectors using SSE +;;; int xor_gen_sse(int vects, int len, void **array) + +;;; Generates xor parity vector from N (vects-1) sources in array of pointers +;;; (**array). Last pointer is the dest. +;;; Vectors must be aligned to 16 bytes. Length can be any value. 
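+;;;
+;;; Note: this file provides int xor_check_sse(int vects, int len, void **array).
+;;; All vects pointers (parity included) are treated as inputs; the function
+;;; returns 0 only if the byte-wise xor across every vector is zero for the
+;;; whole length, and 1 otherwise. At least two vectors are required.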
+ +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 rax + %define tmp2.b al + %define tmp3 arg4 + %define return rax + %define PS 8 + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE + +%elifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define return rax + %define tmp2 rax + %define tmp2.b al + %define PS 8 + %define tmp r11 + %define tmp3 r10 + %define stack_size 2*16 + 8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + end_prolog + %endmacro + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + add rsp, stack_size + %endmacro + + +%elifidn __OUTPUT_FORMAT__, elf32 + %define arg0 arg(0) + %define arg1 ecx + %define tmp2 eax + %define tmp2.b al + %define tmp3 edx + %define return eax + %define PS 4 + %define func(x) x: endbranch + %define arg(x) [ebp+8+PS*x] + %define arg2 edi ; must sav/restore + %define arg3 esi + %define tmp ebx + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + push esi + push edi + push ebx + mov arg1, arg(1) + mov arg2, arg(2) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + mov esp, ebp ;if has frame pointer + pop ebp + %endmacro + +%endif ; output formats + + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define pos tmp3 + +%ifidn PS,8 ; 64-bit code + default rel + [bits 64] +%endif + +;;; Use Non-temporal load/stor +%ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa +%else + %define XLDR movntdqa + %define XSTR movntdq +%endif + +section .text + +align 16 +mk_global xor_check_sse, function +func(xor_check_sse) + FUNC_SAVE +%ifidn PS,8 ;64-bit code + sub vec, 1 ; Keep as offset to last source +%else ;32-bit code + mov tmp, arg(0) ; Update vec length arg to last source + sub tmp, 1 + mov arg(0), tmp +%endif + + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass + test len, (128-1) ;Check alignment of length + jnz len_not_aligned + + +len_aligned_128bytes: + sub len, 128 + mov pos, 0 + mov tmp, vec ;Preset to last vector + +loop128: + mov tmp2, [arg2+tmp*PS] ;Fetch last pointer in array + sub tmp, 1 ;Next vect + XLDR xmm0, [tmp2+pos] ;Start with end of array in last vector + XLDR xmm1, [tmp2+pos+16] ;Keep xor parity in xmm0-7 + XLDR xmm2, [tmp2+pos+(2*16)] + XLDR xmm3, [tmp2+pos+(3*16)] + XLDR xmm4, [tmp2+pos+(4*16)] + XLDR xmm5, [tmp2+pos+(5*16)] + XLDR xmm6, [tmp2+pos+(6*16)] + XLDR xmm7, [tmp2+pos+(7*16)] + +next_vect: + mov ptr, [arg2+tmp*PS] + sub tmp, 1 + xorpd xmm0, [ptr+pos] ;Get next vector (source) + xorpd xmm1, [ptr+pos+16] + xorpd xmm2, [ptr+pos+(2*16)] + xorpd xmm3, [ptr+pos+(3*16)] + xorpd xmm4, [ptr+pos+(4*16)] + xorpd xmm5, [ptr+pos+(5*16)] + xorpd xmm6, [ptr+pos+(6*16)] + xorpd xmm7, [ptr+pos+(7*16)] +;;; prefetch [ptr+pos+(8*16)] + jge next_vect ;Loop for each vect + + ;; End of vects, chech that all parity regs = 0 + mov tmp, vec ;Back to last vector + por xmm0, xmm1 + por xmm0, xmm2 + por xmm0, xmm3 + por xmm0, xmm4 + por xmm0, xmm5 + por xmm0, xmm6 + por xmm0, xmm7 + ptest xmm0, xmm0 + jnz return_fail + + add pos, 128 + cmp pos, len + jle loop128 + +return_pass: + FUNC_RESTORE + mov return, 0 + ret + + + +;;; Do one byte at a time for no alignment case + +xor_gen_byte: + mov tmp, vec ;Preset to last vector + 
+loop_1byte: + mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array + mov tmp2.b, [ptr+len-1] ;Get array n + sub tmp, 1 +nextvect_1byte: + mov ptr, [arg2+tmp*PS] + xor tmp2.b, [ptr+len-1] + sub tmp, 1 + jge nextvect_1byte + + mov tmp, vec ;Back to last vector + cmp tmp2.b, 0 + jne return_fail + sub len, 1 + test len, (8-1) + jnz loop_1byte + + cmp len, 0 + je return_pass + test len, (128-1) ;If not 0 and 128bit aligned + jz len_aligned_128bytes ; then do aligned case. len = y * 128 + + ;; else we are 8-byte aligned so fall through to recheck + + + ;; Unaligned length cases +len_not_aligned: + test len, (PS-1) + jne xor_gen_byte + mov tmp3, len + and tmp3, (128-1) ;Do the unaligned bytes 4-8 at a time + mov tmp, vec ;Preset to last vector + + ;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes +loopN_bytes: + mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array + mov tmp2, [ptr+len-PS] ;Get array n + sub tmp, 1 +nextvect_Nbytes: + mov ptr, [arg2+tmp*PS] ;Get pointer to next vector + xor tmp2, [ptr+len-PS] + sub tmp, 1 + jge nextvect_Nbytes ;Loop for each source + + mov tmp, vec ;Back to last vector + cmp tmp2, 0 + jne return_fail + sub len, PS + sub tmp3, PS + jg loopN_bytes + + cmp len, 128 ;Now len is aligned to 128B + jge len_aligned_128bytes ;We can do the rest aligned + + cmp len, 0 + je return_pass + +return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +;;; func core, ver, snum +slversion xor_check_sse, 00, 03, 0031 + diff --git a/src/isa-l/raid/xor_check_test.c b/src/isa-l/raid/xor_check_test.c new file mode 100644 index 000000000..c7532076f --- /dev/null +++ b/src/isa-l/raid/xor_check_test.c @@ -0,0 +1,280 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include<stdio.h> +#include<stdint.h> +#include<string.h> +#include<stdlib.h> +#include "raid.h" +#include "types.h" + +#define TEST_SOURCES 16 +#define TEST_LEN 1024 +#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN)) +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +// Generates pseudo-random data + +void rand_buffer(unsigned char *buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(int argc, char *argv[]) +{ + int i, j, k, ret, fail = 0; + void *buffs[TEST_SOURCES + 1]; + char c; + int serr, lerr; + char *tmp_buf[TEST_SOURCES + 1]; + + printf("Test xor_check_test %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN); + + srand(TEST_SEED); + + // Allocate the arrays + for (i = 0; i < TEST_SOURCES + 1; i++) { + void *buf; + if (posix_memalign(&buf, 16, TEST_LEN)) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + // Test of all zeros + for (i = 0; i < TEST_SOURCES + 1; i++) + memset(buffs[i], 0, TEST_LEN); + + xor_gen_base(TEST_SOURCES + 1, TEST_LEN, buffs); + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret != 0) { + fail++; + printf("\nfail zero test %d\n", ret); + } + + ((char *)(buffs[0]))[TEST_LEN - 2] = 0x7; // corrupt buffer + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret == 0) { + fail++; + printf("\nfail corrupt buffer test %d\n", ret); + } + ((char *)(buffs[0]))[TEST_LEN - 2] = 0; // un-corrupt buffer + + // Test corrupted buffer any location on all sources + for (j = 0; j < TEST_SOURCES + 1; j++) { + for (i = TEST_LEN - 1; i >= 0; i--) { + ((char *)buffs[j])[i] = 0x5; // corrupt buffer + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret == 0) { + fail++; + printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i); + return 1; + } + ((char *)buffs[j])[i] = 0; // un-corrupt buffer + } + putchar('.'); + } + + // Test rand1 + for (i = 0; i < TEST_SOURCES + 1; i++) + rand_buffer(buffs[i], TEST_LEN); + + xor_gen_base(TEST_SOURCES + 1, TEST_LEN, buffs); + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret != 0) { + fail++; + printf("fail first rand test %d\n", ret); + } + + c = ((char *)(buffs[0]))[TEST_LEN - 2]; + ((char *)(buffs[0]))[TEST_LEN - 2] = c ^ 0x1; + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret == 0) { + fail++; + printf("\nFail corrupt buffer test, passed when should have failed\n"); + } + ((char *)(buffs[0]))[TEST_LEN - 2] = c; // un-corrupt buffer + + // Test corrupted buffer any location on all sources w/ random data + for (j = 0; j < TEST_SOURCES + 1; j++) { + for (i = TEST_LEN - 1; i >= 0; i--) { + // Check it still passes + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret != 0) { // should pass + fail++; + printf + ("\nFail rand test with un-corrupted buffer j=%d, i=%d\n", + j, i); + return 1; + } + c = ((char *)buffs[j])[i]; + ((char *)buffs[j])[i] = c ^ 1; // corrupt buffer + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret == 0) { // Check it now fails + fail++; + printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i); + return 1; + } + ((char *)buffs[j])[i] = c; // un-corrupt buffer + } + putchar('.'); + } + + // Test various number of sources, full length + for (j = 3; j <= TEST_SOURCES + 1; j++) { + // New random data + for (i = 0; i < j; i++) + rand_buffer(buffs[i], TEST_LEN); + + // Generate xor parity for this number of sources + xor_gen_base(j, TEST_LEN, buffs); + + // Set errors up in each source and len 
position + for (i = 0; i < j; i++) { + for (k = 0; k < TEST_LEN; k++) { + // See if it still passes + ret = xor_check(j, TEST_LEN, buffs); + if (ret != 0) { // Should pass + printf("\nfail rand test %d sources\n", j); + fail++; + return 1; + } + + c = ((char *)buffs[i])[k]; + ((char *)buffs[i])[k] = c ^ 1; // corrupt buffer + + ret = xor_check(j, TEST_LEN, buffs); + if (ret == 0) { // Should fail + printf + ("\nfail rand test corrupted buffer %d sources\n", + j); + fail++; + return 1; + } + ((char *)buffs[i])[k] = c; // un-corrupt buffer + } + } + putchar('.'); + } + + fflush(0); + + // Test various number of sources and len + k = 1; + while (k <= TEST_LEN) { + for (j = 3; j <= TEST_SOURCES + 1; j++) { + for (i = 0; i < j; i++) + rand_buffer(buffs[i], k); + + // Generate xor parity for this number of sources + xor_gen_base(j, k, buffs); + + // Inject errors at various source and len positions + for (lerr = 0; lerr < k; lerr += 10) { + for (serr = 0; serr < j; serr++) { + + // See if it still passes + ret = xor_check(j, k, buffs); + if (ret != 0) { // Should pass + printf("\nfail rand test %d sources\n", j); + fail++; + return 1; + } + + c = ((char *)buffs[serr])[lerr]; + ((char *)buffs[serr])[lerr] = c ^ 1; // corrupt buffer + + ret = xor_check(j, k, buffs); + if (ret == 0) { // Should fail + printf("\nfail rand test corrupted buffer " + "%d sources, len=%d, ret=%d\n", j, k, + ret); + fail++; + return 1; + } + ((char *)buffs[serr])[lerr] = c; // un-corrupt buffer + } + } + } + putchar('.'); + fflush(0); + k += 1; + } + + // Test at the end of buffer + for (i = 0; i < TEST_LEN; i += 32) { + for (j = 0; j < TEST_SOURCES + 1; j++) { + rand_buffer(buffs[j], TEST_LEN - i); + tmp_buf[j] = (char *)buffs[j] + i; + } + + xor_gen_base(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf); + + // Test good data + ret = xor_check(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf); + if (ret != 0) { + printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i); + fail++; + return 1; + } + // Test bad data + for (serr = 0; serr < TEST_SOURCES + 1; serr++) { + for (lerr = 0; lerr < (TEST_LEN - i); lerr++) { + c = tmp_buf[serr][lerr]; + tmp_buf[serr][lerr] = c ^ 1; + + ret = + xor_check(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf); + if (ret == 0) { + printf("fail end test corrupted buffer - " + "offset: %d, len: %d, ret: %d\n", i, + TEST_LEN - i, ret); + fail++; + return 1; + } + + tmp_buf[serr][lerr] = c; + } + } + + putchar('.'); + fflush(0); + } + + if (fail == 0) + printf("Pass\n"); + + return fail; + +} diff --git a/src/isa-l/raid/xor_example.c b/src/isa-l/raid/xor_example.c new file mode 100644 index 000000000..48145ac90 --- /dev/null +++ b/src/isa-l/raid/xor_example.c @@ -0,0 +1,70 @@ +/********************************************************************** + Copyright(c) 2011-2013 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <stdio.h> +#include <stdlib.h> +#include "raid.h" +#include "types.h" + +#define TEST_SOURCES 16 +#define TEST_LEN 16*1024 + +int main(int argc, char *argv[]) +{ + int i, j, should_pass, should_fail; + void *buffs[TEST_SOURCES + 1]; + + printf("XOR example\n"); + for (i = 0; i < TEST_SOURCES + 1; i++) { + void *buf; + if (posix_memalign(&buf, 32, TEST_LEN)) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + printf("Make random data\n"); + for (i = 0; i < TEST_SOURCES + 1; i++) + for (j = 0; j < TEST_LEN; j++) + ((char *)buffs[i])[j] = rand(); + + printf("Generate xor parity\n"); + xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs); + + printf("Check parity: "); + should_pass = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + printf("%s\n", should_pass == 0 ? "Pass" : "Fail"); + + printf("Find corruption: "); + ((char *)buffs[TEST_SOURCES / 2])[TEST_LEN / 2] ^= 1; // flip one bit + should_fail = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); //recheck + printf("%s\n", should_fail != 0 ? "Pass" : "Fail"); + + return 0; +} diff --git a/src/isa-l/raid/xor_gen_avx.asm b/src/isa-l/raid/xor_gen_avx.asm new file mode 100644 index 000000000..b5527b204 --- /dev/null +++ b/src/isa-l/raid/xor_gen_avx.asm @@ -0,0 +1,228 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized xor of N source vectors using AVX +;;; int xor_gen_avx(int vects, int len, void **array) + +;;; Generates xor parity vector from N (vects-1) sources in array of pointers +;;; (**array). Last pointer is the dest. +;;; Vectors must be aligned to 32 bytes. Length can be any value. + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp3 arg4 + %define func(x) x: endbranch + %define return rax + %define FUNC_SAVE + %define FUNC_RESTORE + +%elifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define tmp r11 + %define tmp3 r10 + %define func(x) proc_frame x + %define return rax + %define stack_size 2*32 + 8 ;must be an odd multiple of 8 + + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqu [rsp + 0*32], ymm6 + vmovdqu [rsp + 1*32], ymm7 + end_prolog + %endmacro + %macro FUNC_RESTORE 0 + vmovdqu ymm6, [rsp + 0*32] + vmovdqu ymm7, [rsp + 1*32] + add rsp, stack_size + %endmacro + +%endif ;output formats + + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define tmp2 rax +%define tmp2.b al +%define pos tmp3 +%define PS 8 + +;;; Use Non-temporal load/stor +%ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa +%else + %define XLDR vmovdqa + %define XSTR vmovntdq +%endif + + +default rel +[bits 64] + +section .text + +align 16 +mk_global xor_gen_avx, function +func(xor_gen_avx) + + FUNC_SAVE + sub vec, 2 ;Keep as offset to last source + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass + test len, (128-1) ;Check alignment of length + jnz len_not_aligned + + +len_aligned_128bytes: + sub len, 128 + mov pos, 0 + +loop128: + mov tmp, vec ;Back to last vector + mov tmp2, [arg2+vec*PS] ;Fetch last pointer in array + sub tmp, 1 ;Next vect + XLDR ymm0, [tmp2+pos] ;Start with end of array in last vector + XLDR ymm1, [tmp2+pos+32] ;Keep xor parity in xmm0-7 + XLDR ymm2, [tmp2+pos+(2*32)] + XLDR ymm3, [tmp2+pos+(3*32)] + +next_vect: + mov ptr, [arg2+tmp*PS] + sub tmp, 1 + XLDR ymm4, [ptr+pos] ;Get next vector (source) + XLDR ymm5, [ptr+pos+32] + XLDR ymm6, [ptr+pos+(2*32)] + XLDR ymm7, [ptr+pos+(3*32)] + vxorpd ymm0, ymm0, ymm4 ;Add to xor parity + vxorpd ymm1, ymm1, ymm5 + vxorpd ymm2, ymm2, ymm6 + vxorpd ymm3, ymm3, ymm7 + jge next_vect ;Loop for each source + + mov ptr, [arg2+PS+vec*PS] ;Address of parity vector + XSTR [ptr+pos], ymm0 ;Write parity xor vector + XSTR [ptr+pos+(1*32)], ymm1 + XSTR [ptr+pos+(2*32)], ymm2 + XSTR [ptr+pos+(3*32)], ymm3 + add pos, 128 + cmp pos, len + jle loop128 + +return_pass: + FUNC_RESTORE + mov return, 0 + ret + + +;;; Do one byte at a time for no alignment case +loop_1byte: + mov tmp, vec ;Back to last vector + mov ptr, [arg2+vec*PS] ;Fetch last pointer in array + mov tmp2.b, [ptr+len-1] ;Get array n + sub tmp, 
1 +nextvect_1byte: + mov ptr, [arg2+tmp*PS] + xor tmp2.b, [ptr+len-1] + sub tmp, 1 + jge nextvect_1byte + + mov tmp, vec + add tmp, 1 ;Add back to point to last vec + mov ptr, [arg2+tmp*PS] + mov [ptr+len-1], tmp2.b ;Write parity + sub len, 1 + test len, (PS-1) + jnz loop_1byte + + cmp len, 0 + je return_pass + test len, (128-1) ;If not 0 and 128bit aligned + jz len_aligned_128bytes ; then do aligned case. len = y * 128 + + ;; else we are 8-byte aligned so fall through to recheck + + + ;; Unaligned length cases +len_not_aligned: + test len, (PS-1) + jne loop_1byte + mov tmp3, len + and tmp3, (128-1) ;Do the unaligned bytes 8 at a time + + ;; Run backwards 8 bytes at a time for (tmp3) bytes +loop8_bytes: + mov tmp, vec ;Back to last vector + mov ptr, [arg2+vec*PS] ;Fetch last pointer in array + mov tmp2, [ptr+len-PS] ;Get array n + sub tmp, 1 +nextvect_8bytes: + mov ptr, [arg2+tmp*PS] ;Get pointer to next vector + xor tmp2, [ptr+len-PS] + sub tmp, 1 + jge nextvect_8bytes ;Loop for each source + + mov tmp, vec + add tmp, 1 ;Add back to point to last vec + mov ptr, [arg2+tmp*PS] + mov [ptr+len-PS], tmp2 ;Write parity + sub len, PS + sub tmp3, PS + jg loop8_bytes + + cmp len, 128 ;Now len is aligned to 128B + jge len_aligned_128bytes ;We can do the rest aligned + + cmp len, 0 + je return_pass + +return_fail: + FUNC_RESTORE + mov return, 1 + ret + +endproc_frame + +section .data + +;;; func core, ver, snum +slversion xor_gen_avx, 02, 05, 0037 + diff --git a/src/isa-l/raid/xor_gen_avx512.asm b/src/isa-l/raid/xor_gen_avx512.asm new file mode 100644 index 000000000..5b078682a --- /dev/null +++ b/src/isa-l/raid/xor_gen_avx512.asm @@ -0,0 +1,217 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized xor of N source vectors using AVX512 +;;; int xor_gen_avx512(int vects, int len, void **array) + +;;; Generates xor parity vector from N (vects-1) sources in array of pointers +;;; (**array). Last pointer is the dest. +;;; Vectors must be aligned to 32 bytes. Length can be any value. + +%include "reg_sizes.asm" + +%ifdef HAVE_AS_KNOWS_AVX512 + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp3 arg4 + %define func(x) x: endbranch + %define return rax + %define FUNC_SAVE + %define FUNC_RESTORE + +%elifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define tmp r11 + %define tmp3 r10 + %define func(x) proc_frame x + %define return rax + %define stack_size 2*16 + 8 ;must be an odd multiple of 8 + + %macro FUNC_SAVE 0 + alloc_stack stack_size + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm7 + end_prolog + %endmacro + %macro FUNC_RESTORE 0 + vmovdqu xmm6, [rsp + 0*16] + vmovdqu xmm7, [rsp + 1*316] + add rsp, stack_size + %endmacro + +%endif ;output formats + + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define tmp2 rax +%define tmp2.b al +%define pos tmp3 +%define PS 8 + +%define NO_NT_LDST +;;; Use Non-temporal load/stor +%ifdef NO_NT_LDST + %define XLDR vmovdqu8 + %define XSTR vmovdqu8 +%else + %define XLDR vmovntdqa + %define XSTR vmovntdq +%endif + + +default rel +[bits 64] + +section .text + +align 16 +mk_global xor_gen_avx512, function +func(xor_gen_avx512) + FUNC_SAVE + sub vec, 2 ;Keep as offset to last source + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass + test len, (128-1) ;Check alignment of length + jnz len_not_aligned + +len_aligned_128bytes: + sub len, 128 + mov pos, 0 + +loop128: + mov tmp, vec ;Back to last vector + mov tmp2, [arg2+vec*PS] ;Fetch last pointer in array + sub tmp, 1 ;Next vect + XLDR zmm0, [tmp2+pos] ;Start with end of array in last vector + XLDR zmm1, [tmp2+pos+64] ;Keep xor parity in xmm0-7 + +next_vect: + mov ptr, [arg2+tmp*PS] + sub tmp, 1 + XLDR zmm4, [ptr+pos] ;Get next vector (source) + XLDR zmm5, [ptr+pos+64] + vpxorq zmm0, zmm0, zmm4 ;Add to xor parity + vpxorq zmm1, zmm1, zmm5 + jge next_vect ;Loop for each source + + mov ptr, [arg2+PS+vec*PS] ;Address of parity vector + XSTR [ptr+pos], zmm0 ;Write parity xor vector + XSTR [ptr+pos+64], zmm1 + add pos, 128 + cmp pos, len + jle loop128 + +return_pass: + FUNC_RESTORE + mov return, 0 + ret + + +;;; Do one byte at a time for no alignment case +loop_1byte: + mov tmp, vec ;Back to last vector + mov ptr, [arg2+vec*PS] ;Fetch last pointer in array + mov tmp2.b, [ptr+len-1] ;Get array n + sub tmp, 1 +nextvect_1byte: + mov ptr, [arg2+tmp*PS] + xor tmp2.b, [ptr+len-1] + sub tmp, 1 + jge nextvect_1byte + + mov tmp, vec + add tmp, 1 ;Add back to point to last vec + mov ptr, [arg2+tmp*PS] + mov [ptr+len-1], tmp2.b ;Write parity + sub len, 1 + test len, (PS-1) + jnz loop_1byte + + cmp len, 0 + je return_pass + test len, (128-1) ;If not 0 and 128bit aligned + jz len_aligned_128bytes ; then do aligned case. 
len = y * 128 + + ;; else we are 8-byte aligned so fall through to recheck + + + ;; Unaligned length cases +len_not_aligned: + test len, (PS-1) + jne loop_1byte + mov tmp3, len + and tmp3, (128-1) ;Do the unaligned bytes 8 at a time + + ;; Run backwards 8 bytes at a time for (tmp3) bytes +loop8_bytes: + mov tmp, vec ;Back to last vector + mov ptr, [arg2+vec*PS] ;Fetch last pointer in array + mov tmp2, [ptr+len-PS] ;Get array n + sub tmp, 1 +nextvect_8bytes: + mov ptr, [arg2+tmp*PS] ;Get pointer to next vector + xor tmp2, [ptr+len-PS] + sub tmp, 1 + jge nextvect_8bytes ;Loop for each source + + mov tmp, vec + add tmp, 1 ;Add back to point to last vec + mov ptr, [arg2+tmp*PS] + mov [ptr+len-PS], tmp2 ;Write parity + sub len, PS + sub tmp3, PS + jg loop8_bytes + + cmp len, 128 ;Now len is aligned to 128B + jge len_aligned_128bytes ;We can do the rest aligned + + cmp len, 0 + je return_pass + +return_fail: + FUNC_RESTORE + mov return, 1 + ret + +endproc_frame + +%endif ; ifdef HAVE_AS_KNOWS_AVX512 diff --git a/src/isa-l/raid/xor_gen_perf.c b/src/isa-l/raid/xor_gen_perf.c new file mode 100644 index 000000000..717e0ada7 --- /dev/null +++ b/src/isa-l/raid/xor_gen_perf.c @@ -0,0 +1,90 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include<stdio.h> +#include<stdint.h> +#include<string.h> +#include<stdlib.h> +#include<sys/time.h> +#include "raid.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Loop many times over same +# define TEST_SOURCES 10 +# define TEST_LEN 8*1024 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. 
+# define TEST_SOURCES 10 +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN GT_L3_CACHE / TEST_SOURCES +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN)) + +int main(int argc, char *argv[]) +{ + int i, ret, fail = 0; + void **buffs; + void *buff; + struct perf start; + + printf("Test xor_gen_perf\n"); + + ret = posix_memalign((void **)&buff, 8, sizeof(int *) * (TEST_SOURCES + 6)); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + buffs = buff; + + // Allocate the arrays + for (i = 0; i < TEST_SOURCES + 1; i++) { + void *buf; + ret = posix_memalign(&buf, 64, TEST_LEN); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + // Setup data + for (i = 0; i < TEST_SOURCES + 1; i++) + memset(buffs[i], 0, TEST_LEN); + + BENCHMARK(&start, BENCHMARK_TIME, xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs)); + printf("xor_gen" TEST_TYPE_STR ": "); + perf_print(start, (long long)TEST_MEM); + + return fail; +} diff --git a/src/isa-l/raid/xor_gen_sse.asm b/src/isa-l/raid/xor_gen_sse.asm new file mode 100644 index 000000000..f31ae63e4 --- /dev/null +++ b/src/isa-l/raid/xor_gen_sse.asm @@ -0,0 +1,284 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized xor of N source vectors using SSE +;;; int xor_gen_sse(int vects, int len, void **array) + +;;; Generates xor parity vector from N (vects-1) sources in array of pointers +;;; (**array). Last pointer is the dest. +;;; Vectors must be aligned to 16 bytes. Length can be any value. 
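+;;;
+;;; Returns 0 on success and 1 if fewer than three vectors are supplied
+;;; (at least two sources plus the destination are required). Lengths that
+;;; are a multiple of 128 run through the 8x16-byte SSE loop; any remainder
+;;; is folded in first with pointer-sized and byte-wide scalar xors working
+;;; backward from the end of the buffers.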
+ +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 rax + %define tmp2.b al + %define tmp3 arg4 + %define return rax + %define PS 8 + %define func(x) x: endbranch + %define FUNC_SAVE + %define FUNC_RESTORE + +%elifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define return rax + %define tmp2 rax + %define tmp2.b al + %define PS 8 + %define tmp r11 + %define tmp3 r10 + %define stack_size 2*16 + 8 ; must be an odd multiple of 8 + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + end_prolog + %endmacro + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + add rsp, stack_size + %endmacro + + +%elifidn __OUTPUT_FORMAT__, elf32 + %define arg0 arg(0) + %define arg1 ecx + %define tmp2 eax + %define tmp2.b al + %define tmp3 edx + %define return eax + %define PS 4 + %define func(x) x: endbranch + %define arg(x) [ebp+8+PS*x] + %define arg2 edi ; must sav/restore + %define arg3 esi + %define tmp ebx + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + push esi + push edi + push ebx + mov arg1, arg(1) + mov arg2, arg(2) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + mov esp, ebp ;if has frame pointer + pop ebp + %endmacro + +%endif ; output formats + + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define pos tmp3 + +%ifidn PS,8 ; 64-bit code + default rel + [bits 64] +%endif + +;;; Use Non-temporal load/stor +%ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa +%else + %define XLDR movntdqa + %define XSTR movntdq +%endif + +section .text + +align 16 +mk_global xor_gen_sse, function +func(xor_gen_sse) + FUNC_SAVE +%ifidn PS,8 ;64-bit code + sub vec, 2 ; Keep as offset to last source +%else ;32-bit code + mov tmp, arg(0) ; Update vec length arg to last source + sub tmp, 2 + mov arg(0), tmp +%endif + + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass + test len, (128-1) ;Check alignment of length + jnz len_not_aligned + + +len_aligned_128bytes: + sub len, 128 + mov pos, 0 + mov tmp, vec ;Preset to last vector + +loop128: + mov tmp2, [arg2+tmp*PS] ;Fetch last pointer in array + sub tmp, 1 ;Next vect + XLDR xmm0, [tmp2+pos] ;Start with end of array in last vector + XLDR xmm1, [tmp2+pos+16] ;Keep xor parity in xmm0-7 + XLDR xmm2, [tmp2+pos+(2*16)] + XLDR xmm3, [tmp2+pos+(3*16)] + XLDR xmm4, [tmp2+pos+(4*16)] + XLDR xmm5, [tmp2+pos+(5*16)] + XLDR xmm6, [tmp2+pos+(6*16)] + XLDR xmm7, [tmp2+pos+(7*16)] + +next_vect: + mov ptr, [arg2+tmp*PS] + sub tmp, 1 + xorpd xmm0, [ptr+pos] ;Get next vector (source) + xorpd xmm1, [ptr+pos+16] + xorpd xmm2, [ptr+pos+(2*16)] + xorpd xmm3, [ptr+pos+(3*16)] + xorpd xmm4, [ptr+pos+(4*16)] + xorpd xmm5, [ptr+pos+(5*16)] + xorpd xmm6, [ptr+pos+(6*16)] + xorpd xmm7, [ptr+pos+(7*16)] +;;; prefetch [ptr+pos+(8*16)] + jge next_vect ;Loop for each vect + + + mov tmp, vec ;Back to last vector + mov ptr, [arg2+PS+tmp*PS] ;Address of parity vector + XSTR [ptr+pos], xmm0 ;Write parity xor vector + XSTR [ptr+pos+(1*16)], xmm1 + XSTR [ptr+pos+(2*16)], xmm2 + XSTR [ptr+pos+(3*16)], xmm3 + XSTR [ptr+pos+(4*16)], xmm4 + XSTR [ptr+pos+(5*16)], xmm5 + XSTR [ptr+pos+(6*16)], xmm6 + XSTR [ptr+pos+(7*16)], xmm7 + add pos, 128 + cmp pos, len + jle loop128 + +return_pass: + mov return, 0 + FUNC_RESTORE + ret + + + +;;; 
Do one byte at a time for no alignment case + +xor_gen_byte: + mov tmp, vec ;Preset to last vector + +loop_1byte: + mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array + mov tmp2.b, [ptr+len-1] ;Get array n + sub tmp, 1 +nextvect_1byte: + mov ptr, [arg2+tmp*PS] + xor tmp2.b, [ptr+len-1] + sub tmp, 1 + jge nextvect_1byte + + mov tmp, vec ;Back to last vector + mov ptr, [arg2+PS+tmp*PS] ;Get last vec + mov [ptr+len-1], tmp2.b ;Write parity + sub len, 1 + test len, (8-1) + jnz loop_1byte + + cmp len, 0 + je return_pass + test len, (128-1) ;If not 0 and 128bit aligned + jz len_aligned_128bytes ; then do aligned case. len = y * 128 + + ;; else we are 8-byte aligned so fall through to recheck + + + ;; Unaligned length cases +len_not_aligned: + test len, (PS-1) + jne xor_gen_byte + mov tmp3, len + and tmp3, (128-1) ;Do the unaligned bytes 4-8 at a time + mov tmp, vec ;Preset to last vector + + ;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes +loopN_bytes: + mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array + mov tmp2, [ptr+len-PS] ;Get array n + sub tmp, 1 +nextvect_Nbytes: + mov ptr, [arg2+tmp*PS] ;Get pointer to next vector + xor tmp2, [ptr+len-PS] + sub tmp, 1 + jge nextvect_Nbytes ;Loop for each source + + mov tmp, vec ;Back to last vector + mov ptr, [arg2+PS+tmp*PS] ;Get last vec + mov [ptr+len-PS], tmp2 ;Write parity + sub len, PS + sub tmp3, PS + jg loopN_bytes + + cmp len, 128 ;Now len is aligned to 128B + jge len_aligned_128bytes ;We can do the rest aligned + + cmp len, 0 + je return_pass + +return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +;;; func core, ver, snum +slversion xor_gen_sse, 00, 0c, 0030 + diff --git a/src/isa-l/raid/xor_gen_test.c b/src/isa-l/raid/xor_gen_test.c new file mode 100644 index 000000000..ee922bfaf --- /dev/null +++ b/src/isa-l/raid/xor_gen_test.c @@ -0,0 +1,165 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include<stdio.h> +#include<stdint.h> +#include<string.h> +#include<stdlib.h> +#include "raid.h" +#include "types.h" + +#define TEST_SOURCES 16 +#define TEST_LEN 1024 +#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN)) +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +// Generates pseudo-random data + +void rand_buffer(unsigned char *buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(int argc, char *argv[]) +{ + int i, j, k, ret, fail = 0; + void *buffs[TEST_SOURCES + 1]; + char *tmp_buf[TEST_SOURCES + 1]; + + printf("Test xor_gen_test "); + + srand(TEST_SEED); + + // Allocate the arrays + for (i = 0; i < TEST_SOURCES + 1; i++) { + void *buf; + ret = posix_memalign(&buf, 32, TEST_LEN); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + // Test of all zeros + for (i = 0; i < TEST_SOURCES + 1; i++) + memset(buffs[i], 0, TEST_LEN); + + xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs); + + for (i = 0; i < TEST_LEN; i++) { + if (((char *)buffs[TEST_SOURCES])[i] != 0) + fail++; + } + + if (fail > 0) { + printf("fail zero test"); + return 1; + } else + putchar('.'); + + // Test rand1 + for (i = 0; i < TEST_SOURCES + 1; i++) + rand_buffer(buffs[i], TEST_LEN); + + xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs); + + fail |= xor_check_base(TEST_SOURCES + 1, TEST_LEN, buffs); + + if (fail > 0) { + printf("fail rand test %d\n", fail); + return 1; + } else + putchar('.'); + + // Test various number of sources + for (j = 3; j <= TEST_SOURCES + 1; j++) { + for (i = 0; i < j; i++) + rand_buffer(buffs[i], TEST_LEN); + + xor_gen(j, TEST_LEN, buffs); + fail |= xor_check_base(j, TEST_LEN, buffs); + + if (fail > 0) { + printf("fail rand test %d sources\n", j); + return 1; + } else + putchar('.'); + } + + fflush(0); + + // Test various number of sources and len + k = 0; + while (k <= TEST_LEN) { + for (j = 3; j <= TEST_SOURCES + 1; j++) { + for (i = 0; i < j; i++) + rand_buffer(buffs[i], k); + + xor_gen(j, k, buffs); + fail |= xor_check_base(j, k, buffs); + + if (fail > 0) { + printf("fail rand test %d sources, len=%d, ret=%d\n", j, k, + fail); + return 1; + } + } + putchar('.'); + k += 1; + } + + // Test at the end of buffer + for (i = 0; i < TEST_LEN; i += 32) { + for (j = 0; j < TEST_SOURCES + 1; j++) { + rand_buffer((unsigned char *)buffs[j] + i, TEST_LEN - i); + tmp_buf[j] = (char *)buffs[j] + i; + } + + xor_gen(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf); + fail |= xor_check_base(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf); + + if (fail > 0) { + printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i); + return 1; + } + + putchar('.'); + fflush(0); + } + + if (!fail) + printf(" done: Pass\n"); + + return fail; +} |
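For reference, a minimal usage sketch (not part of the upstream diff) of how the pq_gen/pq_check pair declared in raid.h is typically driven, mirroring xor_example.c above. The buffer count, 32-byte alignment, and a length that is a multiple of 128 follow the conventions of pq_gen_test.c; the SOURCES and LEN names are illustrative only.

#include <stdio.h>
#include <stdlib.h>
#include "raid.h"

#define SOURCES 16              /* data vectors */
#define LEN     (16*1024)       /* multiple of 128, as in the tests above */

int main(void)
{
	void *buffs[SOURCES + 2];   /* SOURCES data vectors + P + Q */
	int i, j;

	for (i = 0; i < SOURCES + 2; i++)
		if (posix_memalign(&buffs[i], 32, LEN))
			return 1;

	for (i = 0; i < SOURCES; i++)       /* random data in the sources only */
		for (j = 0; j < LEN; j++)
			((char *)buffs[i])[j] = rand();

	pq_gen(SOURCES + 2, LEN, buffs);    /* P and Q written to the last two */

	printf("pq_check: %s\n",
	       pq_check(SOURCES + 2, LEN, buffs) == 0 ? "Pass" : "Fail");

	return 0;
}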