summaryrefslogtreecommitdiffstats
path: root/src/isa-l/raid/aarch64
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
commit19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/isa-l/raid/aarch64
parentInitial commit. (diff)
downloadceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/isa-l/raid/aarch64')
-rw-r--r--src/isa-l/raid/aarch64/Makefile.am36
-rw-r--r--src/isa-l/raid/aarch64/pq_check_neon.S341
-rw-r--r--src/isa-l/raid/aarch64/pq_gen_neon.S282
-rw-r--r--src/isa-l/raid/aarch64/raid_aarch64_dispatcher.c61
-rw-r--r--src/isa-l/raid/aarch64/raid_multibinary_arm.S36
-rw-r--r--src/isa-l/raid/aarch64/xor_check_neon.S271
-rw-r--r--src/isa-l/raid/aarch64/xor_gen_neon.S264
7 files changed, 1291 insertions, 0 deletions
diff --git a/src/isa-l/raid/aarch64/Makefile.am b/src/isa-l/raid/aarch64/Makefile.am
new file mode 100644
index 000000000..d08c8d67a
--- /dev/null
+++ b/src/isa-l/raid/aarch64/Makefile.am
@@ -0,0 +1,36 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+lsrc_aarch64 += \
+ raid/aarch64/xor_gen_neon.S \
+ raid/aarch64/pq_gen_neon.S \
+ raid/aarch64/xor_check_neon.S \
+ raid/aarch64/pq_check_neon.S \
+ raid/aarch64/raid_multibinary_arm.S \
+ raid/aarch64/raid_aarch64_dispatcher.c
diff --git a/src/isa-l/raid/aarch64/pq_check_neon.S b/src/isa-l/raid/aarch64/pq_check_neon.S
new file mode 100644
index 000000000..55ad79829
--- /dev/null
+++ b/src/isa-l/raid/aarch64/pq_check_neon.S
@@ -0,0 +1,341 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global pq_check_neon
+.type pq_check_neon, %function
+
+/* int pq_check_neon(int vects, int len, void **src) */
+
+/* arguments */
+w_vects .req w0 /* MUST >= 3 */
+x_vects .req x0
+w_len .req w1 /* MUST be 16x bytes */
+x_len .req x1
+x_src .req x2
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_dst_p .req x3
+x_dst_q .req x4
+x_dst_q_end .req x5
+w_col .req w6
+x_col .req x6
+x_src_ptr .req x7
+x_src_ptr_end .req x9
+x_src_last .req x10
+x_srcn .req x11
+w_min .req w12
+/* vectors */
+/* v0 ~ v7 : temporary p */
+/* v8 ~ v15: temporary q */
+/* v16 ~ v23: next 128 bytes */
+v_mask0 .req v24
+v_mask1 .req v25
+v_mask2 .req v26
+v_mask3 .req v27
+v_gf8poly .req v28
+v_0x80 .req v29
+
+/*
+ * src_ptr_end -->
+ * -------+----------+
+ * . | src[0] |
+ * . +----------+ +------------------+
+ * src_ptr --> | src[1] | - srcn -> | buffer |
+ * . +----------+ +------------------+
+ * . | ...... |
+ * . +----------+
+ * . | src[v-4] |
+ * -------+----------+ src_last +------------------+
+ * src --> | src[v-3] | ---------> | buffer |
+ * +----------+ +------------------+
+ * | src[v-2] | - dst_p -> | buffer |
+ * +----------+ +------------------+
+ * | src[v-1] | - dst_q -> | buffer | dst_q_end
+ * +----------+ +------------------+
+ */
+
+pq_check_neon:
+ sub x_src_ptr_end, x_src, #8
+
+ sub w_vects, w_vects, #3
+ add x_src, x_src, x_vects, lsl #3
+
+ ldr x_src_last, [x_src]
+ ldp x_dst_p, x_dst_q, [x_src, #8]
+
+ add x_dst_q_end, x_dst_q, x_len
+
+ mov w_min, #-1
+ mov w_col, #0
+ movi v_gf8poly.16b, #0x1D
+ movi v_0x80.16b, #0x80
+
+.Lloop128_init:
+ /* less than 128 byts? */
+ cmp w_len, #128
+ blo .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_dst_q_end, x_dst_q_end, #128
+
+ /* batch process (vects-2)*128 bytes */
+ /* v0~v7: p; v8~v15: q; v16~v23: in */
+.Lloop128:
+ ldr q0, [x_src_last, #16*0]
+ ldr q1, [x_src_last, #16*1]
+ ldr q2, [x_src_last, #16*2]
+ ldr q3, [x_src_last, #16*3]
+ ldr q4, [x_src_last, #16*4]
+ ldr q5, [x_src_last, #16*5]
+ ldr q6, [x_src_last, #16*6]
+ ldr q7, [x_src_last, #16*7]
+ add x_src_last, x_src_last, #128
+
+ mov v8.16b, v0.16b
+ mov v9.16b, v1.16b
+ mov v10.16b, v2.16b
+ mov v11.16b, v3.16b
+ mov v12.16b, v4.16b
+ mov v13.16b, v5.16b
+ mov v14.16b, v6.16b
+ mov v15.16b, v7.16b
+
+ cbz w_vects, .Lloop128_vects_end
+
+ sub x_src_ptr, x_src, #8
+.Lloop128_vects:
+ ldr x_srcn, [x_src_ptr], #-8
+ add x_srcn, x_srcn, x_col
+ cmp x_src_ptr, x_src_ptr_end
+
+ ldr q16, [x_srcn, #16*0]
+ ldr q17, [x_srcn, #16*1]
+ ldr q18, [x_srcn, #16*2]
+ ldr q19, [x_srcn, #16*3]
+ ldr q20, [x_srcn, #16*4]
+ ldr q21, [x_srcn, #16*5]
+ ldr q22, [x_srcn, #16*6]
+ ldr q23, [x_srcn, #16*7]
+
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v6.16b, v6.16b, v22.16b
+ eor v7.16b, v7.16b, v23.16b
+
+ cmhs v_mask0.16b, v8.16b, v_0x80.16b
+ cmhs v_mask1.16b, v9.16b, v_0x80.16b
+ cmhs v_mask2.16b, v10.16b, v_0x80.16b
+ cmhs v_mask3.16b, v11.16b, v_0x80.16b
+ and v_mask0.16b, v_mask0.16b, v_gf8poly.16b
+ and v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+ and v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+ and v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+ shl v8.16b, v8.16b, #1
+ shl v9.16b, v9.16b, #1
+ shl v10.16b, v10.16b, #1
+ shl v11.16b, v11.16b, #1
+ eor v8.16b, v8.16b, v_mask0.16b
+ eor v9.16b, v9.16b, v_mask1.16b
+ eor v10.16b, v10.16b, v_mask2.16b
+ eor v11.16b, v11.16b, v_mask3.16b
+ eor v8.16b, v8.16b, v16.16b
+ eor v9.16b, v9.16b, v17.16b
+ eor v10.16b, v10.16b, v18.16b
+ eor v11.16b, v11.16b, v19.16b
+
+ cmhs v_mask0.16b, v12.16b, v_0x80.16b
+ cmhs v_mask1.16b, v13.16b, v_0x80.16b
+ cmhs v_mask2.16b, v14.16b, v_0x80.16b
+ cmhs v_mask3.16b, v15.16b, v_0x80.16b
+ and v_mask0.16b, v_mask0.16b, v_gf8poly.16b
+ and v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+ and v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+ and v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+ shl v12.16b, v12.16b, #1
+ shl v13.16b, v13.16b, #1
+ shl v14.16b, v14.16b, #1
+ shl v15.16b, v15.16b, #1
+ eor v12.16b, v12.16b, v_mask0.16b
+ eor v13.16b, v13.16b, v_mask1.16b
+ eor v14.16b, v14.16b, v_mask2.16b
+ eor v15.16b, v15.16b, v_mask3.16b
+ eor v12.16b, v12.16b, v20.16b
+ eor v13.16b, v13.16b, v21.16b
+ eor v14.16b, v14.16b, v22.16b
+ eor v15.16b, v15.16b, v23.16b
+
+ bne .Lloop128_vects
+
+.Lloop128_vects_end:
+ /* v16~v23: true p, q */
+ ldr q16, [x_dst_p, #16*0]
+ ldr q17, [x_dst_p, #16*1]
+ ldr q18, [x_dst_p, #16*2]
+ ldr q19, [x_dst_p, #16*3]
+ ldr q20, [x_dst_p, #16*4]
+ ldr q21, [x_dst_p, #16*5]
+ ldr q22, [x_dst_p, #16*6]
+ ldr q23, [x_dst_p, #16*7]
+
+ cmeq v0.16b, v0.16b, v16.16b
+ cmeq v1.16b, v1.16b, v17.16b
+ cmeq v2.16b, v2.16b, v18.16b
+ cmeq v3.16b, v3.16b, v19.16b
+ cmeq v4.16b, v4.16b, v20.16b
+ cmeq v5.16b, v5.16b, v21.16b
+ cmeq v6.16b, v6.16b, v22.16b
+ cmeq v7.16b, v7.16b, v23.16b
+
+ ldr q16, [x_dst_q, #16*0]
+ ldr q17, [x_dst_q, #16*1]
+ ldr q18, [x_dst_q, #16*2]
+ ldr q19, [x_dst_q, #16*3]
+ ldr q20, [x_dst_q, #16*4]
+ ldr q21, [x_dst_q, #16*5]
+ ldr q22, [x_dst_q, #16*6]
+ ldr q23, [x_dst_q, #16*7]
+
+ and v0.16b, v0.16b, v1.16b
+ and v2.16b, v2.16b, v3.16b
+ and v4.16b, v4.16b, v5.16b
+ and v6.16b, v6.16b, v7.16b
+ and v0.16b, v0.16b, v2.16b
+ and v4.16b, v4.16b, v6.16b
+ and v0.16b, v0.16b, v4.16b
+
+ cmeq v8.16b, v8.16b, v16.16b
+ cmeq v9.16b, v9.16b, v17.16b
+ cmeq v10.16b, v10.16b, v18.16b
+ cmeq v11.16b, v11.16b, v19.16b
+ cmeq v12.16b, v12.16b, v20.16b
+ cmeq v13.16b, v13.16b, v21.16b
+ cmeq v14.16b, v14.16b, v22.16b
+ cmeq v15.16b, v15.16b, v23.16b
+
+ and v8.16b, v8.16b, v9.16b
+ and v10.16b, v10.16b, v11.16b
+ and v12.16b, v12.16b, v13.16b
+ and v14.16b, v14.16b, v15.16b
+ and v8.16b, v8.16b, v10.16b
+ and v12.16b, v12.16b, v14.16b
+ and v8.16b, v8.16b, v12.16b
+
+ and v0.16b, v0.16b, v8.16b
+
+ uminv b0, v0.16b
+ umov w_min, v0.b[0]
+ cbz w_min, .Lloop128_end
+
+ add x_dst_p, x_dst_p, #128
+ add x_dst_q, x_dst_q, #128
+ cmp x_dst_q, x_dst_q_end
+ add w_col, w_col, #128
+ bls .Lloop128
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ cbz w_min, .Lerror
+
+ add x_dst_q_end, x_dst_q_end, #128
+
+.Lloop16_init:
+ tst w_len, #0x7F
+ beq .Lloop16_end
+ sub x_dst_q_end, x_dst_q_end, #16
+
+ /* batch process (vects-2)*16 bytes */
+ /* v0: p; v1: q; v2: in; v3: mask */
+.Lloop16:
+ ldr q0, [x_src_last], #16
+ mov v1.16b, v0.16b
+
+ cbz w_vects, .Lloop16_vects_end
+
+ sub x_src_ptr, x_src, #8
+.Lloop16_vects:
+ ldr x_srcn, [x_src_ptr], #-8
+ ldr q2, [x_srcn, x_col]
+ cmp x_src_ptr, x_src_ptr_end
+
+ eor v0.16b, v0.16b, v2.16b
+
+ cmhs v3.16b, v1.16b, v_0x80.16b
+ and v3.16b, v3.16b, v_gf8poly.16b
+
+ shl v1.16b, v1.16b, #1
+ eor v1.16b, v1.16b, v2.16b
+ eor v1.16b, v1.16b, v3.16b
+
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ /* v4: true p; v5: true q */
+ ldr q4, [x_dst_p], #16
+ ldr q5, [x_dst_q], #16
+ cmp x_dst_q, x_dst_q_end
+
+ cmeq v0.16b, v0.16b, v4.16b
+ cmeq v1.16b, v1.16b, v5.16b
+ and v0.16b, v0.16b, v1.16b
+
+ uminv b0, v0.16b
+ umov w_min, v0.b[0]
+ cbz w_min, .Lerror
+
+ add w_col, w_col, #16
+ bls .Lloop16
+
+.Lloop16_end:
+ mov w_ret, #0
+ ret
+
+.Lerror:
+ mov w_ret, #1
+ ret
diff --git a/src/isa-l/raid/aarch64/pq_gen_neon.S b/src/isa-l/raid/aarch64/pq_gen_neon.S
new file mode 100644
index 000000000..f60ad1211
--- /dev/null
+++ b/src/isa-l/raid/aarch64/pq_gen_neon.S
@@ -0,0 +1,282 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global pq_gen_neon
+.type pq_gen_neon, %function
+
+/* int pq_gen_neon(int vects, int len, void **src) */
+
+/* arguments */
+w_vects .req w0 /* MUST >= 3 */
+x_vects .req x0
+w_len .req w1 /* MUST be 16x bytes */
+x_len .req x1
+x_src .req x2
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_dst_p .req x3
+x_dst_q .req x4
+x_dst_q_end .req x5
+w_col .req w6
+x_col .req x6
+x_src_ptr .req x7
+x_src_ptr_end .req x9
+x_src_last .req x10
+x_srcn .req x11
+/* vectors */
+/* v0 ~ v7 : temporary p */
+/* v8 ~ v15: temporary q */
+/* v16 ~ v23: next 128 bytes */
+v_mask0 .req v24
+v_mask1 .req v25
+v_mask2 .req v26
+v_mask3 .req v27
+v_gf8poly .req v28
+v_0x80 .req v29
+
+/*
+ * src_ptr_end -->
+ * -------+----------+
+ * . | src[0] |
+ * . +----------+ +------------------+
+ * src_ptr --> | src[1] | - srcn -> | buffer |
+ * . +----------+ +------------------+
+ * . | ...... |
+ * . +----------+
+ * . | src[v-4] |
+ * -------+----------+ src_last +------------------+
+ * src --> | src[v-3] | ---------> | buffer |
+ * +----------+ +------------------+
+ * | src[v-2] | - dst_p -> | buffer |
+ * +----------+ +------------------+
+ * | src[v-1] | - dst_q -> | buffer | dst_q_end
+ * +----------+ +------------------+
+ */
+
+pq_gen_neon:
+ sub x_src_ptr_end, x_src, #8
+
+ sub w_vects, w_vects, #3
+ add x_src, x_src, x_vects, lsl #3
+
+ ldr x_src_last, [x_src]
+ ldp x_dst_p, x_dst_q, [x_src, #8]
+
+ add x_dst_q_end, x_dst_q, x_len
+
+ mov w_col, #0
+ movi v_gf8poly.16b, #0x1D
+ movi v_0x80.16b, #0x80
+
+.Lloop128_init:
+ /* less than 128 byts? */
+ cmp w_len, #128
+ blo .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_dst_q_end, x_dst_q_end, #128
+
+ /* batch process (vects-2)*128 bytes */
+ /* v0~v7: p; v8~v15: q; v16~v23: in */
+.Lloop128:
+ ldr q0, [x_src_last, #16*0]
+ ldr q1, [x_src_last, #16*1]
+ ldr q2, [x_src_last, #16*2]
+ ldr q3, [x_src_last, #16*3]
+ ldr q4, [x_src_last, #16*4]
+ ldr q5, [x_src_last, #16*5]
+ ldr q6, [x_src_last, #16*6]
+ ldr q7, [x_src_last, #16*7]
+ add x_src_last, x_src_last, #128
+
+ mov v8.16b, v0.16b
+ mov v9.16b, v1.16b
+ mov v10.16b, v2.16b
+ mov v11.16b, v3.16b
+ mov v12.16b, v4.16b
+ mov v13.16b, v5.16b
+ mov v14.16b, v6.16b
+ mov v15.16b, v7.16b
+
+ cbz w_vects, .Lloop128_vects_end
+
+ sub x_src_ptr, x_src, #8
+.Lloop128_vects:
+ ldr x_srcn, [x_src_ptr], #-8
+ add x_srcn, x_srcn, x_col
+ cmp x_src_ptr, x_src_ptr_end
+
+ ldr q16, [x_srcn, #16*0]
+ ldr q17, [x_srcn, #16*1]
+ ldr q18, [x_srcn, #16*2]
+ ldr q19, [x_srcn, #16*3]
+ ldr q20, [x_srcn, #16*4]
+ ldr q21, [x_srcn, #16*5]
+ ldr q22, [x_srcn, #16*6]
+ ldr q23, [x_srcn, #16*7]
+
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v6.16b, v6.16b, v22.16b
+ eor v7.16b, v7.16b, v23.16b
+
+ cmhs v_mask0.16b, v8.16b, v_0x80.16b
+ cmhs v_mask1.16b, v9.16b, v_0x80.16b
+ cmhs v_mask2.16b, v10.16b, v_0x80.16b
+ cmhs v_mask3.16b, v11.16b, v_0x80.16b
+ and v_mask0.16b, v_mask0.16b, v_gf8poly.16b
+ and v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+ and v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+ and v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+ shl v8.16b, v8.16b, #1
+ shl v9.16b, v9.16b, #1
+ shl v10.16b, v10.16b, #1
+ shl v11.16b, v11.16b, #1
+ eor v8.16b, v8.16b, v_mask0.16b
+ eor v9.16b, v9.16b, v_mask1.16b
+ eor v10.16b, v10.16b, v_mask2.16b
+ eor v11.16b, v11.16b, v_mask3.16b
+ eor v8.16b, v8.16b, v16.16b
+ eor v9.16b, v9.16b, v17.16b
+ eor v10.16b, v10.16b, v18.16b
+ eor v11.16b, v11.16b, v19.16b
+
+ cmhs v_mask0.16b, v12.16b, v_0x80.16b
+ cmhs v_mask1.16b, v13.16b, v_0x80.16b
+ cmhs v_mask2.16b, v14.16b, v_0x80.16b
+ cmhs v_mask3.16b, v15.16b, v_0x80.16b
+ and v_mask0.16b, v_mask0.16b, v_gf8poly.16b
+ and v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+ and v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+ and v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+ shl v12.16b, v12.16b, #1
+ shl v13.16b, v13.16b, #1
+ shl v14.16b, v14.16b, #1
+ shl v15.16b, v15.16b, #1
+ eor v12.16b, v12.16b, v_mask0.16b
+ eor v13.16b, v13.16b, v_mask1.16b
+ eor v14.16b, v14.16b, v_mask2.16b
+ eor v15.16b, v15.16b, v_mask3.16b
+ eor v12.16b, v12.16b, v20.16b
+ eor v13.16b, v13.16b, v21.16b
+ eor v14.16b, v14.16b, v22.16b
+ eor v15.16b, v15.16b, v23.16b
+
+ bne .Lloop128_vects
+
+.Lloop128_vects_end:
+ str q0, [x_dst_p, #16*0]
+ str q1, [x_dst_p, #16*1]
+ str q2, [x_dst_p, #16*2]
+ str q3, [x_dst_p, #16*3]
+ str q4, [x_dst_p, #16*4]
+ str q5, [x_dst_p, #16*5]
+ str q6, [x_dst_p, #16*6]
+ str q7, [x_dst_p, #16*7]
+
+ str q8, [x_dst_q, #16*0]
+ str q9, [x_dst_q, #16*1]
+ str q10, [x_dst_q, #16*2]
+ str q11, [x_dst_q, #16*3]
+ str q12, [x_dst_q, #16*4]
+ str q13, [x_dst_q, #16*5]
+ str q14, [x_dst_q, #16*6]
+ str q15, [x_dst_q, #16*7]
+
+ add x_dst_p, x_dst_p, #128
+ add x_dst_q, x_dst_q, #128
+ cmp x_dst_q, x_dst_q_end
+ add w_col, w_col, #128
+ bls .Lloop128
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ add x_dst_q_end, x_dst_q_end, #128
+
+.Lloop16_init:
+ tst w_len, #0x7F
+ beq .Lloop16_end
+ sub x_dst_q_end, x_dst_q_end, #16
+
+ /* batch process (vects-2)*16 bytes */
+ /* v0: p; v1: q; v2: in; v3: mask */
+.Lloop16:
+ ldr q0, [x_src_last], #16
+ mov v1.16b, v0.16b
+
+ cbz w_vects, .Lloop16_vects_end
+
+ sub x_src_ptr, x_src, #8
+.Lloop16_vects:
+ ldr x_srcn, [x_src_ptr], #-8
+ ldr q2, [x_srcn, x_col]
+ cmp x_src_ptr, x_src_ptr_end
+
+ eor v0.16b, v0.16b, v2.16b
+
+ cmhs v3.16b, v1.16b, v_0x80.16b
+ and v3.16b, v3.16b, v_gf8poly.16b
+
+ shl v1.16b, v1.16b, #1
+ eor v1.16b, v1.16b, v2.16b
+ eor v1.16b, v1.16b, v3.16b
+
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ str q0, [x_dst_p], #16
+ str q1, [x_dst_q], #16
+ cmp x_dst_q, x_dst_q_end
+ add w_col, w_col, #16
+ bls .Lloop16
+
+.Lloop16_end:
+ mov w_ret, #0
+ ret
diff --git a/src/isa-l/raid/aarch64/raid_aarch64_dispatcher.c b/src/isa-l/raid/aarch64/raid_aarch64_dispatcher.c
new file mode 100644
index 000000000..c81bd8c98
--- /dev/null
+++ b/src/isa-l/raid/aarch64/raid_aarch64_dispatcher.c
@@ -0,0 +1,61 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(xor_gen)
+{
+ if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+ return PROVIDER_INFO(xor_gen_neon);
+ return PROVIDER_BASIC(xor_gen);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(xor_check)
+{
+ if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+ return PROVIDER_INFO(xor_check_neon);
+ return PROVIDER_BASIC(xor_check);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(pq_gen)
+{
+ if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+ return PROVIDER_INFO(pq_gen_neon);
+ return PROVIDER_BASIC(pq_gen);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(pq_check)
+{
+ if (getauxval(AT_HWCAP) & HWCAP_ASIMD)
+ return PROVIDER_INFO(pq_check_neon);
+ return PROVIDER_BASIC(pq_check);
+
+}
diff --git a/src/isa-l/raid/aarch64/raid_multibinary_arm.S b/src/isa-l/raid/aarch64/raid_multibinary_arm.S
new file mode 100644
index 000000000..0316239ec
--- /dev/null
+++ b/src/isa-l/raid/aarch64/raid_multibinary_arm.S
@@ -0,0 +1,36 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface xor_gen
+mbin_interface xor_check
+mbin_interface pq_gen
+mbin_interface pq_check
diff --git a/src/isa-l/raid/aarch64/xor_check_neon.S b/src/isa-l/raid/aarch64/xor_check_neon.S
new file mode 100644
index 000000000..95cb7d1d1
--- /dev/null
+++ b/src/isa-l/raid/aarch64/xor_check_neon.S
@@ -0,0 +1,271 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global xor_check_neon
+.type xor_check_neon, %function
+
+/* int xor_check_neon(int vects, int len, void **src) */
+
+/* arguments */
+w_vects .req w0 /* MUST >= 2 */
+x_vects .req x0
+w_len .req w1
+x_len .req x1
+x_src .req x2
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+w_in .req w1 /* share w_len */
+x_src0 .req x3
+x_src0_end .req x4
+w_len256 .req w5 /* share w_len16 */
+x_len256 .req x5
+w_len16 .req w5
+x_len16 .req x5
+w_col .req w6
+x_col .req x6
+x_src_ptr .req x7
+x_srcn .req x9
+x_src_ptr_end .req x10
+w_xor .req w11
+/* v0 ~ v15: temporary results */
+/* v16 ~ v31: next 256 bytes */
+
+/*
+ * +----------+ +------------------+
+ * src --> | src[0] | - src0 -> | buffer | src0_end
+ * --------+----------+ +------------------+
+ * . | ...... |
+ * . +----------+ +------------------+
+ * src_ptr ~~> | src[n] | - srcn ~> | buffer |
+ * . +----------+ +------------------+
+ * . | ...... |
+ * . +----------+
+ * . | src[v-1] |
+ * --------+----------+
+ * src_ptr_end -->
+ */
+
+xor_check_neon:
+ add x_src_ptr_end, x_src, x_vects, lsl #3
+ ldr x_src0, [x_src]
+ add x_src0_end, x_src0, x_len
+
+ sub w_vects, w_vects, #1
+ mov w_col, #0
+ mov w_xor, #0
+
+.Lloop256_init:
+ /* len256 = len - len%256; len %= 256 */
+ mov w_len256, w_len
+ and w_len, w_len, #0xFF
+ sub w_len256, w_len256, w_len
+
+ /* less than 256 byts? */
+ cbz w_len256, .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src0_end, x_src0_end, #256
+
+ /* batch process vects*256 bytes */
+.Lloop256:
+ ldr q0, [x_src0, #16*0]
+ ldr q1, [x_src0, #16*1]
+ ldr q2, [x_src0, #16*2]
+ ldr q3, [x_src0, #16*3]
+ ldr q4, [x_src0, #16*4]
+ ldr q5, [x_src0, #16*5]
+ ldr q6, [x_src0, #16*6]
+ ldr q7, [x_src0, #16*7]
+ ldr q8, [x_src0, #16*8]
+ ldr q9, [x_src0, #16*9]
+ ldr q10, [x_src0, #16*10]
+ ldr q11, [x_src0, #16*11]
+ ldr q12, [x_src0, #16*12]
+ ldr q13, [x_src0, #16*13]
+ ldr q14, [x_src0, #16*14]
+ ldr q15, [x_src0, #16*15]
+ add x_src0, x_src0, #256
+
+ cbz w_vects, .Lloop256_vects_end
+
+ add x_src_ptr, x_src, #8
+.Lloop256_vects:
+ ldr x_srcn, [x_src_ptr], #8
+ add x_srcn, x_srcn, x_col
+ cmp x_src_ptr, x_src_ptr_end
+
+ ldr q16, [x_srcn, #16*0]
+ ldr q17, [x_srcn, #16*1]
+ ldr q18, [x_srcn, #16*2]
+ ldr q19, [x_srcn, #16*3]
+ ldr q20, [x_srcn, #16*4]
+ ldr q21, [x_srcn, #16*5]
+ ldr q22, [x_srcn, #16*6]
+ ldr q23, [x_srcn, #16*7]
+ ldr q24, [x_srcn, #16*8]
+ ldr q25, [x_srcn, #16*9]
+ ldr q26, [x_srcn, #16*10]
+ ldr q27, [x_srcn, #16*11]
+ ldr q28, [x_srcn, #16*12]
+ ldr q29, [x_srcn, #16*13]
+ ldr q30, [x_srcn, #16*14]
+ ldr q31, [x_srcn, #16*15]
+
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v6.16b, v6.16b, v22.16b
+ eor v7.16b, v7.16b, v23.16b
+ eor v8.16b, v8.16b, v24.16b
+ eor v9.16b, v9.16b, v25.16b
+ eor v10.16b, v10.16b, v26.16b
+ eor v11.16b, v11.16b, v27.16b
+ eor v12.16b, v12.16b, v28.16b
+ eor v13.16b, v13.16b, v29.16b
+ eor v14.16b, v14.16b, v30.16b
+ eor v15.16b, v15.16b, v31.16b
+
+ bne .Lloop256_vects
+
+.Lloop256_vects_end:
+ orr v0.16b, v0.16b, v1.16b
+ orr v2.16b, v2.16b, v3.16b
+ orr v4.16b, v4.16b, v5.16b
+ orr v6.16b, v6.16b, v7.16b
+ orr v8.16b, v8.16b, v9.16b
+ orr v10.16b, v10.16b, v11.16b
+ orr v12.16b, v12.16b, v13.16b
+ orr v14.16b, v14.16b, v15.16b
+ orr v0.16b, v0.16b, v2.16b
+ orr v4.16b, v4.16b, v6.16b
+ orr v8.16b, v8.16b, v10.16b
+ orr v12.16b, v12.16b, v14.16b
+ orr v0.16b, v0.16b, v4.16b
+ orr v8.16b, v8.16b, v12.16b
+ orr v0.16b, v0.16b, v8.16b
+ umaxv b0, v0.16b
+ umov w_xor, v0.b[0]
+ cbnz w_xor, .Lloop256_end
+
+ cmp x_src0, x_src0_end
+ add w_col, w_col, #256
+ bls .Lloop256
+
+.Lloop256_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ cbnz w_xor, .Lerror
+
+ add x_src0_end, x_src0_end, #256
+
+.Lloop16_init:
+ /* len16 = len - len%16; len %= 16 */
+ mov w_len16, w_len
+ and w_len, w_len, #0xF
+ sub w_len16, w_len16, w_len
+
+ /* less than 16 bytes? */
+ cbz w_len16, .Lloop1_init
+
+ sub x_src0_end, x_src0_end, #16
+
+ /* batch process vects*16 bytes */
+.Lloop16:
+ ldr q0, [x_src0], #16
+ cbz w_vects, .Lloop16_vects_end
+
+ add x_src_ptr, x_src, #8
+.Lloop16_vects:
+ ldr x_srcn, [x_src_ptr], #8
+ cmp x_src_ptr, x_src_ptr_end
+ ldr q1, [x_srcn, x_col]
+ eor v0.16b, v0.16b, v1.16b
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ umaxv b0, v0.16b
+ umov w_xor, v0.b[0]
+ cbnz w_xor, .Lerror
+ cmp x_src0, x_src0_end
+ add w_col, w_col, #16
+ bls .Lloop16
+
+.Lloop16_end:
+ add x_src0_end, x_src0_end, #16
+
+.Lloop1_init:
+ cbnz w_len, .Lloop1
+ mov w_ret, #0
+ ret
+
+ /* batch process vects*1 bytes */
+.Lloop1:
+ ldrb w_xor, [x_src0], #1
+ cbz w_vects, .Lloop1_vects_end
+
+ add x_src_ptr, x_src, #8
+.Lloop1_vects:
+ ldr x_srcn, [x_src_ptr], #8
+ cmp x_src_ptr, x_src_ptr_end
+ ldrb w_in, [x_srcn, x_col]
+ eor w_xor, w_xor, w_in
+ bne .Lloop1_vects
+
+.Lloop1_vects_end:
+ cbnz w_xor, .Lerror
+ cmp x_src0, x_src0_end
+ add w_col, w_col, #1
+ bne .Lloop1
+
+.Lloop1_end:
+ mov w_ret, #0
+ ret
+
+.Lerror:
+ mov w_ret, #1
+ ret
diff --git a/src/isa-l/raid/aarch64/xor_gen_neon.S b/src/isa-l/raid/aarch64/xor_gen_neon.S
new file mode 100644
index 000000000..00f65a2ef
--- /dev/null
+++ b/src/isa-l/raid/aarch64/xor_gen_neon.S
@@ -0,0 +1,264 @@
+########################################################################
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global xor_gen_neon
+.type xor_gen_neon, %function
+
+/* int xor_gen_neon(int vects, int len, void **src) */
+
+/* arguments */
+w_vects .req w0 /* MUST >= 2 */
+x_vects .req x0
+w_len .req w1
+x_len .req x1
+x_src .req x2
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+w_in .req w1 /* share w_len */
+x_src0 .req x3
+x_src0_end .req x4
+w_len256 .req w5 /* share w_len16, w_xor */
+x_len256 .req x5
+w_len16 .req w5
+x_len16 .req x5
+w_xor .req w5
+w_col .req w6
+x_col .req x6
+x_src_ptr .req x7
+x_srcn .req x9
+x_dst .req x10
+x_dst_ptr .req x11
+/* v0 ~ v15: temporary results */
+/* v16 ~ v31: next 256 bytes */
+
+/*
+ * +----------+ +------------------+
+ * src --> | src[0] | - src0 -> | buffer | src0_end
+ * --------+----------+ +------------------+
+ * . | ...... |
+ * . +----------+ +------------------+
+ * src_ptr ~~> | src[n] | - srcn ~> | buffer |
+ * . +----------+ +------------------+
+ * . | ...... |
+ * . +----------+
+ * . | src[v-2] |
+ * --------+----------+ +------------------+
+ * dst_ptr --> | src[v-1] | -- dst --> | buffer |
+ * +----------+ +------------------+
+ */
+
+xor_gen_neon:
+ add x_dst_ptr, x_src, x_vects, lsl #3
+ ldr x_dst, [x_dst_ptr, #-8]!
+ ldr x_src0, [x_src]
+ add x_src0_end, x_src0, x_len
+
+ sub w_vects, w_vects, #2
+ mov w_col, #0
+
+.Loop256_init:
+ /* len256 = len - len%256; len %= 256 */
+ mov w_len256, w_len
+ and w_len, w_len, #0xFF
+ sub w_len256, w_len256, w_len
+
+ /* less than 256 byts? */
+ cbz w_len256, .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src0_end, x_src0_end, #256
+
+ /* batch process (vects-1)*256 bytes */
+.Lloop256:
+ ldr q0, [x_src0, #16*0]
+ ldr q1, [x_src0, #16*1]
+ ldr q2, [x_src0, #16*2]
+ ldr q3, [x_src0, #16*3]
+ ldr q4, [x_src0, #16*4]
+ ldr q5, [x_src0, #16*5]
+ ldr q6, [x_src0, #16*6]
+ ldr q7, [x_src0, #16*7]
+ ldr q8, [x_src0, #16*8]
+ ldr q9, [x_src0, #16*9]
+ ldr q10, [x_src0, #16*10]
+ ldr q11, [x_src0, #16*11]
+ ldr q12, [x_src0, #16*12]
+ ldr q13, [x_src0, #16*13]
+ ldr q14, [x_src0, #16*14]
+ ldr q15, [x_src0, #16*15]
+ add x_src0, x_src0, #256
+
+ cbz w_vects, .Lloop256_vects_end
+
+ add x_src_ptr, x_src, #8
+.Lloop256_vects:
+ ldr x_srcn, [x_src_ptr], #8
+ add x_srcn, x_srcn, x_col
+ cmp x_src_ptr, x_dst_ptr
+
+ ldr q16, [x_srcn, #16*0]
+ ldr q17, [x_srcn, #16*1]
+ ldr q18, [x_srcn, #16*2]
+ ldr q19, [x_srcn, #16*3]
+ ldr q20, [x_srcn, #16*4]
+ ldr q21, [x_srcn, #16*5]
+ ldr q22, [x_srcn, #16*6]
+ ldr q23, [x_srcn, #16*7]
+ ldr q24, [x_srcn, #16*8]
+ ldr q25, [x_srcn, #16*9]
+ ldr q26, [x_srcn, #16*10]
+ ldr q27, [x_srcn, #16*11]
+ ldr q28, [x_srcn, #16*12]
+ ldr q29, [x_srcn, #16*13]
+ ldr q30, [x_srcn, #16*14]
+ ldr q31, [x_srcn, #16*15]
+
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v6.16b, v6.16b, v22.16b
+ eor v7.16b, v7.16b, v23.16b
+ eor v8.16b, v8.16b, v24.16b
+ eor v9.16b, v9.16b, v25.16b
+ eor v10.16b, v10.16b, v26.16b
+ eor v11.16b, v11.16b, v27.16b
+ eor v12.16b, v12.16b, v28.16b
+ eor v13.16b, v13.16b, v29.16b
+ eor v14.16b, v14.16b, v30.16b
+ eor v15.16b, v15.16b, v31.16b
+
+ bne .Lloop256_vects
+
+.Lloop256_vects_end:
+ str q0, [x_dst, #16*0]
+ str q1, [x_dst, #16*1]
+ str q2, [x_dst, #16*2]
+ str q3, [x_dst, #16*3]
+ str q4, [x_dst, #16*4]
+ str q5, [x_dst, #16*5]
+ str q6, [x_dst, #16*6]
+ str q7, [x_dst, #16*7]
+ str q8, [x_dst, #16*8]
+ str q9, [x_dst, #16*9]
+ str q10, [x_dst, #16*10]
+ str q11, [x_dst, #16*11]
+ str q12, [x_dst, #16*12]
+ str q13, [x_dst, #16*13]
+ str q14, [x_dst, #16*14]
+ str q15, [x_dst, #16*15]
+
+ cmp x_src0, x_src0_end
+ add x_dst, x_dst, #256
+ add w_col, w_col, #256
+ bls .Lloop256
+
+.Lloop256_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ add x_src0_end, x_src0_end, #256
+
+.Lloop16_init:
+ /* len16 = len - len%16; len %= 16 */
+ mov w_len16, w_len
+ and w_len, w_len, #0xF
+ sub w_len16, w_len16, w_len
+
+ /* less than 16 bytes? */
+ cbz w_len16, .Lloop1_init
+
+ sub x_src0_end, x_src0_end, #16
+
+ /* batch process (vects-1)*16 bytes */
+.Lloop16:
+ ldr q0, [x_src0], #16
+ cbz w_vects, .Lloop16_vects_end
+
+ add x_src_ptr, x_src, #8
+.Lloop16_vects:
+ ldr x_srcn, [x_src_ptr], #8
+ cmp x_src_ptr, x_dst_ptr
+ ldr q1, [x_srcn, x_col]
+ eor v0.16b, v0.16b, v1.16b
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ cmp x_src0, x_src0_end
+ str q0, [x_dst], #16
+ add w_col, w_col, #16
+ bls .Lloop16
+
+.Loop16_end:
+ add x_src0_end, x_src0_end, #16
+
+.Lloop1_init:
+ cbnz w_len, .Lloop1
+ mov w_ret, #0
+ ret
+
+ /* batch process (vects-1)*1 bytes */
+.Lloop1:
+ ldrb w_xor, [x_src0], #1
+ cbz w_vects, .Lloop1_vects_end
+
+ add x_src_ptr, x_src, #8
+.Lloop1_vects:
+ ldr x_srcn, [x_src_ptr], #8
+ cmp x_src_ptr, x_dst_ptr
+ ldrb w_in, [x_srcn, x_col]
+ eor w_xor, w_xor, w_in
+ bne .Lloop1_vects
+
+.Lloop1_vects_end:
+ cmp x_src0, x_src0_end
+ strb w_xor, [x_dst], #1
+ add w_col, w_col, #1
+ bne .Lloop1
+
+.Loop1_end:
+ mov w_ret, #0
+ ret