7 files changed, 1291 insertions, 0 deletions
diff --git a/src/spdk/isa-l/raid/aarch64/Makefile.am b/src/spdk/isa-l/raid/aarch64/Makefile.am
new file mode 100644
index 000000000..d08c8d67a
--- /dev/null
+++ b/src/spdk/isa-l/raid/aarch64/Makefile.am
@@ -0,0 +1,36 @@
+########################################################################
+#  Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Arm Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+# RAID sources for aarch64, appended to lsrc_aarch64 (variable defined outside this fragment).
+lsrc_aarch64 += \
+		raid/aarch64/xor_gen_neon.S	\
+		raid/aarch64/pq_gen_neon.S	\
+		raid/aarch64/xor_check_neon.S	\
+		raid/aarch64/pq_check_neon.S	\
+		raid/aarch64/raid_multibinary_arm.S	\
+		raid/aarch64/raid_aarch64_dispatcher.c
diff --git a/src/spdk/isa-l/raid/aarch64/pq_check_neon.S b/src/spdk/isa-l/raid/aarch64/pq_check_neon.S
new file mode 100644
index 000000000..55ad79829
--- /dev/null
+++ b/src/spdk/isa-l/raid/aarch64/pq_check_neon.S
@@ -0,0 +1,341 @@
+########################################################################
+#  Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Arm Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global pq_check_neon
+.type pq_check_neon, %function
+/* int pq_check_neon(int vects, int len, void **src)
+ * Recomputes P and Q over src[0..vects-3] and compares them with src[vects-2]
+ * and src[vects-1]; returns 0 if both match, 1 otherwise. Arguments unchecked. */
+/* arguments */
+w_vects		.req	w0	/* MUST >= 3 */
+x_vects		.req	x0	/* only read after w_vects is written (upper half then zero) */
+w_len		.req	w1	/* MUST be 16x bytes */
+x_len		.req	x1	/* upper 32 bits unspecified on entry: never read directly */
+x_src		.req	x2
+
+/* returns */
+w_ret		.req	w0
+
+/* local variables */
+x_dst_p		.req	x3	/* cursor into the stored P buffer, src[vects-2] */
+x_dst_q		.req	x4	/* cursor into the stored Q buffer, src[vects-1] */
+x_dst_q_end	.req	x5	/* end of Q, biased by the batch size inside each loop */
+w_col		.req	w6	/* byte offset of the current batch in the data buffers */
+x_col		.req	x6
+x_src_ptr	.req	x7	/* walks the pointer table from src[vects-4] downwards */
+x_src_ptr_end	.req	x9	/* &src[-1]: where the downward walk stops */
+x_src_last	.req	x10	/* cursor into the last data buffer, src[vects-3] */
+x_srcn		.req	x11	/* data buffer currently being folded in */
+w_min		.req	w12	/* 0 once a batch mismatched, non-zero otherwise */
+/* vectors */
+/* v0  ~ v7 : temporary p */
+/* v8  ~ v15: temporary q */
+/* v16 ~ v23: next 128 bytes */
+v_mask0		.req	v24
+v_mask1		.req	v25
+v_mask2		.req	v26
+v_mask3		.req	v27
+v_gf8poly	.req	v28
+v_0x80		.req	v29
+
+/*
+ * src_ptr_end -->
+ *          -------+----------+
+ *           .     |  src[0]  |
+ *           .     +----------+            +------------------+
+ *     src_ptr --> |  src[1]  | - srcn ->  |     buffer       |
+ *           .     +----------+            +------------------+
+ *           .     |  ......  |
+ *           .     +----------+
+ *           .     | src[v-4] |
+ *          -------+----------+  src_last  +------------------+
+ *        src  --> | src[v-3] | ---------> |      buffer      |
+ *                 +----------+            +------------------+
+ *                 | src[v-2] | - dst_p -> |      buffer      |
+ *                 +----------+            +------------------+
+ *                 | src[v-1] | - dst_q -> |      buffer      | dst_q_end
+ *                 +----------+            +------------------+
+ */
+
+pq_check_neon:
+	sub	x_src_ptr_end, x_src, #8	/* &src[-1]: sentinel for the downward walk */
+
+	sub	w_vects, w_vects, #3		/* index of last data buffer; w-write zero-extends x_vects */
+	add	x_src, x_src, x_vects, lsl #3	/* x_src = &src[vects-3] */
+
+	ldr	x_src_last, [x_src]
+	ldp	x_dst_p, x_dst_q, [x_src, #8]
+	/* len is an int: AAPCS64 leaves x1[63:32] unspecified, so extend w_len explicitly */
+	add	x_dst_q_end, x_dst_q, w_len, uxtw
+
+	mov	w_min, #-1			/* non-zero: no mismatch seen yet */
+	mov	w_col, #0
+	movi	v_gf8poly.16b, #0x1D
+	movi	v_0x80.16b, #0x80
+
+.Lloop128_init:
+	/* less than 128 bytes? */
+	cmp	w_len, #128
+	blo	.Lloop16_init
+
+	/* save d8 ~ d15 to stack */
+	sub	sp, sp, #64
+	stp	d8,  d9,  [sp]
+	stp	d10, d11, [sp, #16]
+	stp	d12, d13, [sp, #32]
+	stp	d14, d15, [sp, #48]
+
+	sub	x_dst_q_end, x_dst_q_end, #128	/* last Q address at which a full 128B batch starts */
+
+	/* batch process (vects-2)*128 bytes */
+	/* v0~v7: p;  v8~v15: q;  v16~v23: in */
+.Lloop128:
+	ldr	q0, [x_src_last, #16*0]
+	ldr	q1, [x_src_last, #16*1]
+	ldr	q2, [x_src_last, #16*2]
+	ldr	q3, [x_src_last, #16*3]
+	ldr	q4, [x_src_last, #16*4]
+	ldr	q5, [x_src_last, #16*5]
+	ldr	q6, [x_src_last, #16*6]
+	ldr	q7, [x_src_last, #16*7]
+	add	x_src_last, x_src_last, #128
+	/* q starts as a copy of p: both begin with the last data buffer */
+	mov	v8.16b,  v0.16b
+	mov	v9.16b,  v1.16b
+	mov	v10.16b, v2.16b
+	mov	v11.16b, v3.16b
+	mov	v12.16b, v4.16b
+	mov	v13.16b, v5.16b
+	mov	v14.16b, v6.16b
+	mov	v15.16b, v7.16b
+
+	cbz	w_vects, .Lloop128_vects_end	/* single data buffer: nothing more to fold in */
+
+	sub	x_src_ptr, x_src, #8		/* start at &src[vects-4] */
+.Lloop128_vects:
+	ldr	x_srcn, [x_src_ptr], #-8
+	add	x_srcn, x_srcn, x_col
+	cmp	x_src_ptr, x_src_ptr_end	/* flags stay live until the bne below (NEON ops leave them alone) */
+
+	ldr	q16, [x_srcn, #16*0]
+	ldr	q17, [x_srcn, #16*1]
+	ldr	q18, [x_srcn, #16*2]
+	ldr	q19, [x_srcn, #16*3]
+	ldr	q20, [x_srcn, #16*4]
+	ldr	q21, [x_srcn, #16*5]
+	ldr	q22, [x_srcn, #16*6]
+	ldr	q23, [x_srcn, #16*7]
+	/* p ^= in */
+	eor	v0.16b, v0.16b, v16.16b
+	eor	v1.16b, v1.16b, v17.16b
+	eor	v2.16b, v2.16b, v18.16b
+	eor	v3.16b, v3.16b, v19.16b
+	eor	v4.16b, v4.16b, v20.16b
+	eor	v5.16b, v5.16b, v21.16b
+	eor	v6.16b, v6.16b, v22.16b
+	eor	v7.16b, v7.16b, v23.16b
+	/* q[0..3] = (q << 1) ^ (0x1D where the byte was >= 0x80) ^ in */
+	cmhs	v_mask0.16b, v8.16b,  v_0x80.16b
+	cmhs	v_mask1.16b, v9.16b,  v_0x80.16b
+	cmhs	v_mask2.16b, v10.16b, v_0x80.16b
+	cmhs	v_mask3.16b, v11.16b, v_0x80.16b
+	and	v_mask0.16b, v_mask0.16b, v_gf8poly.16b
+	and	v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+	and	v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+	and	v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+	shl	v8.16b,  v8.16b,  #1
+	shl	v9.16b,  v9.16b,  #1
+	shl	v10.16b, v10.16b, #1
+	shl	v11.16b, v11.16b, #1
+	eor	v8.16b,  v8.16b,  v_mask0.16b
+	eor	v9.16b,  v9.16b,  v_mask1.16b
+	eor	v10.16b, v10.16b, v_mask2.16b
+	eor	v11.16b, v11.16b, v_mask3.16b
+	eor	v8.16b,  v8.16b,  v16.16b
+	eor	v9.16b,  v9.16b,  v17.16b
+	eor	v10.16b, v10.16b, v18.16b
+	eor	v11.16b, v11.16b, v19.16b
+	/* same update for q[4..7], reusing the four mask registers */
+	cmhs	v_mask0.16b, v12.16b, v_0x80.16b
+	cmhs	v_mask1.16b, v13.16b, v_0x80.16b
+	cmhs	v_mask2.16b, v14.16b, v_0x80.16b
+	cmhs	v_mask3.16b, v15.16b, v_0x80.16b
+	and	v_mask0.16b, v_mask0.16b, v_gf8poly.16b
+	and	v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+	and	v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+	and	v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+	shl	v12.16b, v12.16b, #1
+	shl	v13.16b, v13.16b, #1
+	shl	v14.16b, v14.16b, #1
+	shl	v15.16b, v15.16b, #1
+	eor	v12.16b, v12.16b, v_mask0.16b
+	eor	v13.16b, v13.16b, v_mask1.16b
+	eor	v14.16b, v14.16b, v_mask2.16b
+	eor	v15.16b, v15.16b, v_mask3.16b
+	eor	v12.16b, v12.16b, v20.16b
+	eor	v13.16b, v13.16b, v21.16b
+	eor	v14.16b, v14.16b, v22.16b
+	eor	v15.16b, v15.16b, v23.16b
+
+	bne	.Lloop128_vects			/* until src[0] has been consumed */
+
+.Lloop128_vects_end:
+	/* v16~v23: true p, q */
+	ldr	q16, [x_dst_p, #16*0]
+	ldr	q17, [x_dst_p, #16*1]
+	ldr	q18, [x_dst_p, #16*2]
+	ldr	q19, [x_dst_p, #16*3]
+	ldr	q20, [x_dst_p, #16*4]
+	ldr	q21, [x_dst_p, #16*5]
+	ldr	q22, [x_dst_p, #16*6]
+	ldr	q23, [x_dst_p, #16*7]
+	/* cmeq leaves 0xFF in every byte lane that matches, 0x00 otherwise */
+	cmeq	v0.16b, v0.16b, v16.16b
+	cmeq	v1.16b, v1.16b, v17.16b
+	cmeq	v2.16b, v2.16b, v18.16b
+	cmeq	v3.16b, v3.16b, v19.16b
+	cmeq	v4.16b, v4.16b, v20.16b
+	cmeq	v5.16b, v5.16b, v21.16b
+	cmeq	v6.16b, v6.16b, v22.16b
+	cmeq	v7.16b, v7.16b, v23.16b
+
+	ldr	q16, [x_dst_q, #16*0]
+	ldr	q17, [x_dst_q, #16*1]
+	ldr	q18, [x_dst_q, #16*2]
+	ldr	q19, [x_dst_q, #16*3]
+	ldr	q20, [x_dst_q, #16*4]
+	ldr	q21, [x_dst_q, #16*5]
+	ldr	q22, [x_dst_q, #16*6]
+	ldr	q23, [x_dst_q, #16*7]
+	/* fold the eight P compare results into v0 */
+	and	v0.16b, v0.16b, v1.16b
+	and	v2.16b, v2.16b, v3.16b
+	and	v4.16b, v4.16b, v5.16b
+	and	v6.16b, v6.16b, v7.16b
+	and	v0.16b, v0.16b, v2.16b
+	and	v4.16b, v4.16b, v6.16b
+	and	v0.16b, v0.16b, v4.16b
+
+	cmeq	v8.16b,  v8.16b,  v16.16b
+	cmeq	v9.16b,  v9.16b,  v17.16b
+	cmeq	v10.16b, v10.16b, v18.16b
+	cmeq	v11.16b, v11.16b, v19.16b
+	cmeq	v12.16b, v12.16b, v20.16b
+	cmeq	v13.16b, v13.16b, v21.16b
+	cmeq	v14.16b, v14.16b, v22.16b
+	cmeq	v15.16b, v15.16b, v23.16b
+	/* fold the eight Q compare results into v8 */
+	and	v8.16b,  v8.16b,  v9.16b
+	and	v10.16b, v10.16b, v11.16b
+	and	v12.16b, v12.16b, v13.16b
+	and	v14.16b, v14.16b, v15.16b
+	and	v8.16b,  v8.16b,  v10.16b
+	and	v12.16b, v12.16b, v14.16b
+	and	v8.16b,  v8.16b,  v12.16b
+
+	and	v0.16b, v0.16b, v8.16b
+
+	uminv	b0, v0.16b			/* 0 if any byte lane of P or Q differed */
+	umov	w_min, v0.b[0]
+	cbz	w_min, .Lloop128_end		/* mismatch: leave through the register restore */
+
+	add	x_dst_p, x_dst_p, #128
+	add	x_dst_q, x_dst_q, #128
+	cmp	x_dst_q, x_dst_q_end
+	add	w_col, w_col, #128		/* plain add: keeps the flags for bls */
+	bls	.Lloop128
+
+.Lloop128_end:
+	/* restore d8 ~ d15 */
+	ldp	d8,  d9,  [sp]
+	ldp	d10, d11, [sp, #16]
+	ldp	d12, d13, [sp, #32]
+	ldp	d14, d15, [sp, #48]
+	add	sp, sp, #64
+
+	cbz	w_min, .Lerror
+
+	add	x_dst_q_end, x_dst_q_end, #128	/* undo the 128B bias */
+
+.Lloop16_init:
+	tst	w_len, #0x7F
+	beq	.Lloop16_end			/* no remainder below 128 bytes */
+	sub	x_dst_q_end, x_dst_q_end, #16
+
+	/* batch process (vects-2)*16 bytes */
+	/* v0: p;  v1: q;  v2: in;  v3: mask */
+.Lloop16:
+	ldr	q0, [x_src_last], #16
+	mov	v1.16b, v0.16b
+
+	cbz	w_vects, .Lloop16_vects_end
+
+	sub	x_src_ptr, x_src, #8
+.Lloop16_vects:
+	ldr	x_srcn, [x_src_ptr], #-8
+	ldr	q2, [x_srcn, x_col]
+	cmp	x_src_ptr, x_src_ptr_end	/* consumed by the bne at the end of this loop */
+
+	eor	v0.16b, v0.16b, v2.16b
+
+	cmhs	v3.16b, v1.16b, v_0x80.16b
+	and	v3.16b, v3.16b, v_gf8poly.16b
+
+	shl	v1.16b, v1.16b, #1
+	eor	v1.16b, v1.16b, v2.16b
+	eor	v1.16b, v1.16b, v3.16b
+
+	bne	.Lloop16_vects
+
+.Lloop16_vects_end:
+	/* v4: true p;  v5: true q */
+	ldr	q4, [x_dst_p], #16
+	ldr	q5, [x_dst_q], #16
+	cmp	x_dst_q, x_dst_q_end		/* consumed by the bls below */
+
+	cmeq	v0.16b, v0.16b, v4.16b
+	cmeq	v1.16b, v1.16b, v5.16b
+	and	v0.16b, v0.16b, v1.16b
+
+	uminv	b0, v0.16b
+	umov	w_min, v0.b[0]
+	cbz	w_min, .Lerror
+
+	add	w_col, w_col, #16
+	bls	.Lloop16
+
+.Lloop16_end:
+	mov	w_ret, #0
+	ret
+
+.Lerror:
+	mov	w_ret, #1
+	ret
diff --git a/src/spdk/isa-l/raid/aarch64/pq_gen_neon.S b/src/spdk/isa-l/raid/aarch64/pq_gen_neon.S
new file mode 100644
index 000000000..f60ad1211
--- /dev/null
+++ b/src/spdk/isa-l/raid/aarch64/pq_gen_neon.S
@@ -0,0 +1,282 @@
+########################################################################
+#  Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Arm Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global pq_gen_neon
+.type pq_gen_neon, %function
+/* int pq_gen_neon(int vects, int len, void **src)
+ * Computes P and Q over src[0..vects-3] and stores them to src[vects-2] (P)
+ * and src[vects-1] (Q); always returns 0. Arguments are not checked. */
+/* arguments */
+w_vects		.req	w0	/* MUST >= 3 */
+x_vects		.req	x0	/* only read after w_vects is written (upper half then zero) */
+w_len		.req	w1	/* MUST be 16x bytes */
+x_len		.req	x1	/* upper 32 bits unspecified on entry: never read directly */
+x_src		.req	x2
+
+/* returns */
+w_ret		.req	w0
+
+/* local variables */
+x_dst_p		.req	x3	/* write cursor into the P buffer, src[vects-2] */
+x_dst_q		.req	x4	/* write cursor into the Q buffer, src[vects-1] */
+x_dst_q_end	.req	x5	/* end of Q, biased by the batch size inside each loop */
+w_col		.req	w6	/* byte offset of the current batch in the data buffers */
+x_col		.req	x6
+x_src_ptr	.req	x7	/* walks the pointer table from src[vects-4] downwards */
+x_src_ptr_end	.req	x9	/* &src[-1]: where the downward walk stops */
+x_src_last	.req	x10	/* cursor into the last data buffer, src[vects-3] */
+x_srcn		.req	x11	/* data buffer currently being folded in */
+/* vectors */
+/* v0  ~ v7 : temporary p */
+/* v8  ~ v15: temporary q */
+/* v16 ~ v23: next 128 bytes */
+v_mask0		.req	v24
+v_mask1		.req	v25
+v_mask2		.req	v26
+v_mask3		.req	v27
+v_gf8poly	.req	v28
+v_0x80		.req	v29
+
+/*
+ * src_ptr_end -->
+ *          -------+----------+
+ *           .     |  src[0]  |
+ *           .     +----------+            +------------------+
+ *     src_ptr --> |  src[1]  | - srcn ->  |     buffer       |
+ *           .     +----------+            +------------------+
+ *           .     |  ......  |
+ *           .     +----------+
+ *           .     | src[v-4] |
+ *          -------+----------+  src_last  +------------------+
+ *        src  --> | src[v-3] | ---------> |      buffer      |
+ *                 +----------+            +------------------+
+ *                 | src[v-2] | - dst_p -> |      buffer      |
+ *                 +----------+            +------------------+
+ *                 | src[v-1] | - dst_q -> |      buffer      | dst_q_end
+ *                 +----------+            +------------------+
+ */
+
+pq_gen_neon:
+	sub	x_src_ptr_end, x_src, #8	/* &src[-1]: sentinel for the downward walk */
+
+	sub	w_vects, w_vects, #3		/* index of last data buffer; w-write zero-extends x_vects */
+	add	x_src, x_src, x_vects, lsl #3	/* x_src = &src[vects-3] */
+
+	ldr	x_src_last, [x_src]
+	ldp	x_dst_p, x_dst_q, [x_src, #8]
+	/* len is an int: AAPCS64 leaves x1[63:32] unspecified, so extend w_len explicitly */
+	add	x_dst_q_end, x_dst_q, w_len, uxtw
+
+	mov	w_col, #0
+	movi	v_gf8poly.16b, #0x1D
+	movi	v_0x80.16b, #0x80
+
+.Lloop128_init:
+	/* less than 128 bytes? */
+	cmp	w_len, #128
+	blo	.Lloop16_init
+
+	/* save d8 ~ d15 to stack */
+	sub	sp, sp, #64
+	stp	d8,  d9,  [sp]
+	stp	d10, d11, [sp, #16]
+	stp	d12, d13, [sp, #32]
+	stp	d14, d15, [sp, #48]
+
+	sub	x_dst_q_end, x_dst_q_end, #128	/* last Q address at which a full 128B batch starts */
+
+	/* batch process (vects-2)*128 bytes */
+	/* v0~v7: p;  v8~v15: q;  v16~v23: in */
+.Lloop128:
+	ldr	q0, [x_src_last, #16*0]
+	ldr	q1, [x_src_last, #16*1]
+	ldr	q2, [x_src_last, #16*2]
+	ldr	q3, [x_src_last, #16*3]
+	ldr	q4, [x_src_last, #16*4]
+	ldr	q5, [x_src_last, #16*5]
+	ldr	q6, [x_src_last, #16*6]
+	ldr	q7, [x_src_last, #16*7]
+	add	x_src_last, x_src_last, #128
+	/* q starts as a copy of p: both begin with the last data buffer */
+	mov	v8.16b,  v0.16b
+	mov	v9.16b,  v1.16b
+	mov	v10.16b, v2.16b
+	mov	v11.16b, v3.16b
+	mov	v12.16b, v4.16b
+	mov	v13.16b, v5.16b
+	mov	v14.16b, v6.16b
+	mov	v15.16b, v7.16b
+
+	cbz	w_vects, .Lloop128_vects_end	/* single data buffer: nothing more to fold in */
+
+	sub	x_src_ptr, x_src, #8		/* start at &src[vects-4] */
+.Lloop128_vects:
+	ldr	x_srcn, [x_src_ptr], #-8
+	add	x_srcn, x_srcn, x_col
+	cmp	x_src_ptr, x_src_ptr_end	/* flags stay live until the bne below (NEON ops leave them alone) */
+
+	ldr	q16, [x_srcn, #16*0]
+	ldr	q17, [x_srcn, #16*1]
+	ldr	q18, [x_srcn, #16*2]
+	ldr	q19, [x_srcn, #16*3]
+	ldr	q20, [x_srcn, #16*4]
+	ldr	q21, [x_srcn, #16*5]
+	ldr	q22, [x_srcn, #16*6]
+	ldr	q23, [x_srcn, #16*7]
+	/* p ^= in */
+	eor	v0.16b, v0.16b, v16.16b
+	eor	v1.16b, v1.16b, v17.16b
+	eor	v2.16b, v2.16b, v18.16b
+	eor	v3.16b, v3.16b, v19.16b
+	eor	v4.16b, v4.16b, v20.16b
+	eor	v5.16b, v5.16b, v21.16b
+	eor	v6.16b, v6.16b, v22.16b
+	eor	v7.16b, v7.16b, v23.16b
+	/* q[0..3] = (q << 1) ^ (0x1D where the byte was >= 0x80) ^ in */
+	cmhs	v_mask0.16b, v8.16b,  v_0x80.16b
+	cmhs	v_mask1.16b, v9.16b,  v_0x80.16b
+	cmhs	v_mask2.16b, v10.16b, v_0x80.16b
+	cmhs	v_mask3.16b, v11.16b, v_0x80.16b
+	and	v_mask0.16b, v_mask0.16b, v_gf8poly.16b
+	and	v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+	and	v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+	and	v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+	shl	v8.16b,  v8.16b,  #1
+	shl	v9.16b,  v9.16b,  #1
+	shl	v10.16b, v10.16b, #1
+	shl	v11.16b, v11.16b, #1
+	eor	v8.16b,  v8.16b,  v_mask0.16b
+	eor	v9.16b,  v9.16b,  v_mask1.16b
+	eor	v10.16b, v10.16b, v_mask2.16b
+	eor	v11.16b, v11.16b, v_mask3.16b
+	eor	v8.16b,  v8.16b,  v16.16b
+	eor	v9.16b,  v9.16b,  v17.16b
+	eor	v10.16b, v10.16b, v18.16b
+	eor	v11.16b, v11.16b, v19.16b
+	/* same update for q[4..7], reusing the four mask registers */
+	cmhs	v_mask0.16b, v12.16b, v_0x80.16b
+	cmhs	v_mask1.16b, v13.16b, v_0x80.16b
+	cmhs	v_mask2.16b, v14.16b, v_0x80.16b
+	cmhs	v_mask3.16b, v15.16b, v_0x80.16b
+	and	v_mask0.16b, v_mask0.16b, v_gf8poly.16b
+	and	v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+	and	v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+	and	v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+	shl	v12.16b, v12.16b, #1
+	shl	v13.16b, v13.16b, #1
+	shl	v14.16b, v14.16b, #1
+	shl	v15.16b, v15.16b, #1
+	eor	v12.16b, v12.16b, v_mask0.16b
+	eor	v13.16b, v13.16b, v_mask1.16b
+	eor	v14.16b, v14.16b, v_mask2.16b
+	eor	v15.16b, v15.16b, v_mask3.16b
+	eor	v12.16b, v12.16b, v20.16b
+	eor	v13.16b, v13.16b, v21.16b
+	eor	v14.16b, v14.16b, v22.16b
+	eor	v15.16b, v15.16b, v23.16b
+
+	bne	.Lloop128_vects			/* until src[0] has been consumed */
+
+.Lloop128_vects_end:
+	str	q0, [x_dst_p, #16*0]
+	str	q1, [x_dst_p, #16*1]
+	str	q2, [x_dst_p, #16*2]
+	str	q3, [x_dst_p, #16*3]
+	str	q4, [x_dst_p, #16*4]
+	str	q5, [x_dst_p, #16*5]
+	str	q6, [x_dst_p, #16*6]
+	str	q7, [x_dst_p, #16*7]
+
+	str	q8,  [x_dst_q, #16*0]
+	str	q9,  [x_dst_q, #16*1]
+	str	q10, [x_dst_q, #16*2]
+	str	q11, [x_dst_q, #16*3]
+	str	q12, [x_dst_q, #16*4]
+	str	q13, [x_dst_q, #16*5]
+	str	q14, [x_dst_q, #16*6]
+	str	q15, [x_dst_q, #16*7]
+
+	add	x_dst_p, x_dst_p, #128
+	add	x_dst_q, x_dst_q, #128
+	cmp	x_dst_q, x_dst_q_end
+	add	w_col, w_col, #128		/* plain add: keeps the flags for bls */
+	bls	.Lloop128
+
+.Lloop128_end:
+	/* restore d8 ~ d15 */
+	ldp	d8,  d9,  [sp]
+	ldp	d10, d11, [sp, #16]
+	ldp	d12, d13, [sp, #32]
+	ldp	d14, d15, [sp, #48]
+	add	sp, sp, #64
+
+	add	x_dst_q_end, x_dst_q_end, #128	/* undo the 128B bias */
+
+.Lloop16_init:
+	tst	w_len, #0x7F
+	beq	.Lloop16_end			/* no remainder below 128 bytes */
+	sub	x_dst_q_end, x_dst_q_end, #16
+
+	/* batch process (vects-2)*16 bytes */
+	/* v0: p;  v1: q;  v2: in;  v3: mask */
+.Lloop16:
+	ldr	q0, [x_src_last], #16
+	mov	v1.16b, v0.16b
+
+	cbz	w_vects, .Lloop16_vects_end
+
+	sub	x_src_ptr, x_src, #8
+.Lloop16_vects:
+	ldr	x_srcn, [x_src_ptr], #-8
+	ldr	q2, [x_srcn, x_col]
+	cmp	x_src_ptr, x_src_ptr_end	/* consumed by the bne at the end of this loop */
+
+	eor	v0.16b, v0.16b, v2.16b
+
+	cmhs	v3.16b, v1.16b, v_0x80.16b
+	and	v3.16b, v3.16b, v_gf8poly.16b
+
+	shl	v1.16b, v1.16b, #1
+	eor	v1.16b, v1.16b, v2.16b
+	eor	v1.16b, v1.16b, v3.16b
+
+	bne	.Lloop16_vects
+
+.Lloop16_vects_end:
+	str	q0, [x_dst_p], #16
+	str	q1, [x_dst_q], #16
+	cmp	x_dst_q, x_dst_q_end
+	add	w_col, w_col, #16		/* plain add: keeps the flags for bls */
+	bls	.Lloop16
+
+.Lloop16_end:
+	mov	w_ret, #0
+	ret
diff --git a/src/spdk/isa-l/raid/aarch64/raid_aarch64_dispatcher.c b/src/spdk/isa-l/raid/aarch64/raid_aarch64_dispatcher.c
new file mode 100644
index 000000000..c81bd8c98
--- /dev/null
+++ b/src/spdk/isa-l/raid/aarch64/raid_aarch64_dispatcher.c
@@ -0,0 +1,61 @@
+/**********************************************************************
+  Copyright(c) 2019 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(xor_gen)
+{
+	/* Use the basic provider unless AT_HWCAP reports the ASIMD bit. */
+	if (!(getauxval(AT_HWCAP) & HWCAP_ASIMD))
+		return PROVIDER_BASIC(xor_gen);
+	return PROVIDER_INFO(xor_gen_neon);
+}
+
+DEFINE_INTERFACE_DISPATCHER(xor_check)
+{
+	/* Use the basic provider unless AT_HWCAP reports the ASIMD bit. */
+	if (!(getauxval(AT_HWCAP) & HWCAP_ASIMD))
+		return PROVIDER_BASIC(xor_check);
+	return PROVIDER_INFO(xor_check_neon);
+}
+
+DEFINE_INTERFACE_DISPATCHER(pq_gen)
+{
+	/* Use the basic provider unless AT_HWCAP reports the ASIMD bit. */
+	if (!(getauxval(AT_HWCAP) & HWCAP_ASIMD))
+		return PROVIDER_BASIC(pq_gen);
+	return PROVIDER_INFO(pq_gen_neon);
+}
+
+DEFINE_INTERFACE_DISPATCHER(pq_check)
+{
+	/* Use the basic provider unless AT_HWCAP reports the ASIMD bit. */
+	if (!(getauxval(AT_HWCAP) & HWCAP_ASIMD))
+		return PROVIDER_BASIC(pq_check);
+	return PROVIDER_INFO(pq_check_neon);
+}
diff --git a/src/spdk/isa-l/raid/aarch64/raid_multibinary_arm.S b/src/spdk/isa-l/raid/aarch64/raid_multibinary_arm.S
new file mode 100644
index 000000000..0316239ec
--- /dev/null
+++ b/src/spdk/isa-l/raid/aarch64/raid_multibinary_arm.S
@@ -0,0 +1,36 @@
+########################################################################
+#  Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Arm Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+#include "aarch64_multibinary.h"
+/* One mbin_interface line per routine that has a dispatcher in raid_aarch64_dispatcher.c. */
+/* NOTE(review): mbin_interface is expected to come from aarch64_multibinary.h - not visible here. */
+mbin_interface xor_gen
+mbin_interface xor_check
+mbin_interface pq_gen
+mbin_interface pq_check
diff --git a/src/spdk/isa-l/raid/aarch64/xor_check_neon.S b/src/spdk/isa-l/raid/aarch64/xor_check_neon.S
new file mode 100644
index 000000000..95cb7d1d1
--- /dev/null
+++ b/src/spdk/isa-l/raid/aarch64/xor_check_neon.S
@@ -0,0 +1,271 @@
+########################################################################
+#  Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Arm Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global xor_check_neon
+.type xor_check_neon, %function
+/* int xor_check_neon(int vects, int len, void **src)
+ * XORs the first len bytes of src[0..vects-1] together; returns 0 when every
+ * resulting byte is zero and 1 as soon as a non-zero byte is found. */
+/* arguments */
+w_vects		.req	w0	/* MUST >= 2 */
+x_vects		.req	x0	/* upper 32 bits unspecified on entry: never read directly */
+w_len		.req	w1
+x_len		.req	x1	/* upper 32 bits unspecified on entry: never read directly */
+x_src		.req	x2
+
+/* returns */
+w_ret		.req	w0
+
+/* local variables */
+w_in		.req	w1	/* share w_len */
+x_src0		.req	x3	/* read cursor into src[0] */
+x_src0_end	.req	x4	/* end of src[0], biased by the batch size inside each loop */
+w_len256	.req	w5	/* share w_len16 */
+x_len256	.req	x5
+w_len16		.req	w5
+x_len16		.req	x5
+w_col		.req	w6	/* byte offset of the current batch in src[1..] */
+x_col		.req	x6
+x_src_ptr	.req	x7	/* walks the pointer table from src[1] upwards */
+x_srcn		.req	x9	/* buffer currently being folded in */
+x_src_ptr_end	.req	x10	/* &src[vects]: where the upward walk stops */
+w_xor		.req	w11	/* non-zero once a batch did not XOR to zero */
+/* v0  ~ v15: temporary results */
+/* v16 ~ v31: next 256 bytes */
+
+/*
+ *                 +----------+            +------------------+
+ *         src --> |  src[0]  | - src0 ->  |      buffer      | src0_end
+ *         --------+----------+            +------------------+
+ *           .     |  ......  |
+ *           .     +----------+            +------------------+
+ *     src_ptr ~~> |  src[n]  | - srcn ~>  |      buffer      |
+ *           .     +----------+            +------------------+
+ *           .     |  ......  |
+ *           .     +----------+
+ *           .     | src[v-1] |
+ *         --------+----------+
+ *  src_ptr_end -->
+ */
+
+xor_check_neon:
+	add	x_src_ptr_end, x_src, w_vects, uxtw #3	/* &src[vects]; int arg extended explicitly */
+	ldr	x_src0, [x_src]
+	add	x_src0_end, x_src0, w_len, uxtw		/* AAPCS64: x1[63:32] is unspecified for an int */
+
+	sub	w_vects, w_vects, #1			/* number of buffers besides src[0] */
+	mov	w_col, #0
+	mov	w_xor, #0
+
+.Lloop256_init:
+	/* len256 = len - len%256; len %= 256 */
+	mov	w_len256, w_len
+	and	w_len, w_len, #0xFF
+	sub	w_len256, w_len256, w_len
+
+	/* less than 256 bytes? */
+	cbz	w_len256, .Lloop16_init
+
+	/* save d8 ~ d15 to stack */
+	sub	sp, sp, #64
+	stp	d8, d9, [sp]
+	stp	d10, d11, [sp, #16]
+	stp	d12, d13, [sp, #32]
+	stp	d14, d15, [sp, #48]
+
+	sub	x_src0_end, x_src0_end, #256	/* last address at which a full 256B batch starts */
+
+	/* batch process vects*256 bytes */
+.Lloop256:
+	ldr	q0,  [x_src0, #16*0]
+	ldr	q1,  [x_src0, #16*1]
+	ldr	q2,  [x_src0, #16*2]
+	ldr	q3,  [x_src0, #16*3]
+	ldr	q4,  [x_src0, #16*4]
+	ldr	q5,  [x_src0, #16*5]
+	ldr	q6,  [x_src0, #16*6]
+	ldr	q7,  [x_src0, #16*7]
+	ldr	q8,  [x_src0, #16*8]
+	ldr	q9,  [x_src0, #16*9]
+	ldr	q10, [x_src0, #16*10]
+	ldr	q11, [x_src0, #16*11]
+	ldr	q12, [x_src0, #16*12]
+	ldr	q13, [x_src0, #16*13]
+	ldr	q14, [x_src0, #16*14]
+	ldr	q15, [x_src0, #16*15]
+	add	x_src0, x_src0, #256
+
+	cbz	w_vects, .Lloop256_vects_end	/* only src[0]: nothing more to fold in */
+
+	add	x_src_ptr, x_src, #8		/* start at &src[1] */
+.Lloop256_vects:
+	ldr	x_srcn, [x_src_ptr], #8
+	add	x_srcn, x_srcn, x_col
+	cmp	x_src_ptr, x_src_ptr_end	/* flags stay live until the bne below (NEON ops leave them alone) */
+
+	ldr	q16, [x_srcn, #16*0]
+	ldr	q17, [x_srcn, #16*1]
+	ldr	q18, [x_srcn, #16*2]
+	ldr	q19, [x_srcn, #16*3]
+	ldr	q20, [x_srcn, #16*4]
+	ldr	q21, [x_srcn, #16*5]
+	ldr	q22, [x_srcn, #16*6]
+	ldr	q23, [x_srcn, #16*7]
+	ldr	q24, [x_srcn, #16*8]
+	ldr	q25, [x_srcn, #16*9]
+	ldr	q26, [x_srcn, #16*10]
+	ldr	q27, [x_srcn, #16*11]
+	ldr	q28, [x_srcn, #16*12]
+	ldr	q29, [x_srcn, #16*13]
+	ldr	q30, [x_srcn, #16*14]
+	ldr	q31, [x_srcn, #16*15]
+
+	eor	v0.16b,  v0.16b,  v16.16b
+	eor	v1.16b,  v1.16b,  v17.16b
+	eor	v2.16b,  v2.16b,  v18.16b
+	eor	v3.16b,  v3.16b,  v19.16b
+	eor	v4.16b,  v4.16b,  v20.16b
+	eor	v5.16b,  v5.16b,  v21.16b
+	eor	v6.16b,  v6.16b,  v22.16b
+	eor	v7.16b,  v7.16b,  v23.16b
+	eor	v8.16b,  v8.16b,  v24.16b
+	eor	v9.16b,  v9.16b,  v25.16b
+	eor	v10.16b, v10.16b, v26.16b
+	eor	v11.16b, v11.16b, v27.16b
+	eor	v12.16b, v12.16b, v28.16b
+	eor	v13.16b, v13.16b, v29.16b
+	eor	v14.16b, v14.16b, v30.16b
+	eor	v15.16b, v15.16b, v31.16b
+
+	bne	.Lloop256_vects			/* until src[vects-1] has been consumed */
+
+.Lloop256_vects_end:
+	orr	v0.16b,  v0.16b,  v1.16b
+	orr	v2.16b,  v2.16b,  v3.16b
+	orr	v4.16b,  v4.16b,  v5.16b
+	orr	v6.16b,  v6.16b,  v7.16b
+	orr	v8.16b,  v8.16b,  v9.16b
+	orr	v10.16b, v10.16b, v11.16b
+	orr	v12.16b, v12.16b, v13.16b
+	orr	v14.16b, v14.16b, v15.16b
+	orr	v0.16b,  v0.16b,  v2.16b
+	orr	v4.16b,  v4.16b,  v6.16b
+	orr	v8.16b,  v8.16b,  v10.16b
+	orr	v12.16b, v12.16b, v14.16b
+	orr	v0.16b,  v0.16b,  v4.16b
+	orr	v8.16b,  v8.16b,  v12.16b
+	orr	v0.16b,  v0.16b,  v8.16b
+	umaxv	b0, v0.16b			/* non-zero if any of the 256 XOR bytes is non-zero */
+	umov	w_xor, v0.b[0]
+	cbnz	w_xor, .Lloop256_end		/* mismatch: leave through the register restore */
+
+	cmp	x_src0, x_src0_end
+	add	w_col, w_col, #256		/* plain add: keeps the flags for bls */
+	bls	.Lloop256
+
+.Lloop256_end:
+	/* restore d8 ~ d15 */
+	ldp	d8, d9, [sp]
+	ldp	d10, d11, [sp, #16]
+	ldp	d12, d13, [sp, #32]
+	ldp	d14, d15, [sp, #48]
+	add	sp, sp, #64
+
+	cbnz	w_xor, .Lerror
+
+	add	x_src0_end, x_src0_end, #256	/* undo the 256B bias */
+
+.Lloop16_init:
+	/* len16 = len - len%16; len %= 16 */
+	mov	w_len16, w_len
+	and	w_len, w_len, #0xF
+	sub	w_len16, w_len16, w_len
+
+	/* less than 16 bytes? */
+	cbz	w_len16, .Lloop1_init
+
+	sub	x_src0_end, x_src0_end, #16
+
+	/* batch process vects*16 bytes */
+.Lloop16:
+	ldr	q0, [x_src0], #16
+	cbz	w_vects, .Lloop16_vects_end
+
+	add	x_src_ptr, x_src, #8
+.Lloop16_vects:
+	ldr	x_srcn, [x_src_ptr], #8
+	cmp	x_src_ptr, x_src_ptr_end	/* consumed by the bne below */
+	ldr	q1, [x_srcn, x_col]
+	eor	v0.16b, v0.16b, v1.16b
+	bne	.Lloop16_vects
+
+.Lloop16_vects_end:
+	umaxv	b0, v0.16b
+	umov	w_xor, v0.b[0]
+	cbnz	w_xor, .Lerror
+	cmp	x_src0, x_src0_end
+	add	w_col, w_col, #16		/* plain add: keeps the flags for bls */
+	bls	.Lloop16
+
+.Lloop16_end:
+	add	x_src0_end, x_src0_end, #16	/* undo the 16B bias */
+
+.Lloop1_init:
+	cbnz	w_len, .Lloop1			/* w_len now holds len % 16 */
+	mov	w_ret, #0
+	ret
+
+	/* batch process vects*1 bytes */
+.Lloop1:
+	ldrb	w_xor, [x_src0], #1
+	cbz	w_vects, .Lloop1_vects_end
+
+	add	x_src_ptr, x_src, #8
+.Lloop1_vects:
+	ldr	x_srcn, [x_src_ptr], #8
+	cmp	x_src_ptr, x_src_ptr_end
+	ldrb	w_in, [x_srcn, x_col]		/* w_in reuses w1; the loop ends on a pointer compare */
+	eor	w_xor, w_xor, w_in
+	bne	.Lloop1_vects
+
+.Lloop1_vects_end:
+	cbnz	w_xor, .Lerror
+	cmp	x_src0, x_src0_end
+	add	w_col, w_col, #1
+	bne	.Lloop1
+
+.Lloop1_end:
+	mov	w_ret, #0
+	ret
+
+.Lerror:
+	mov	w_ret, #1
+	ret
diff --git a/src/spdk/isa-l/raid/aarch64/xor_gen_neon.S b/src/spdk/isa-l/raid/aarch64/xor_gen_neon.S
new file mode 100644
index 000000000..00f65a2ef
--- /dev/null
+++ b/src/spdk/isa-l/raid/aarch64/xor_gen_neon.S
@@ -0,0 +1,264 @@
+########################################################################
+#  Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Arm Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global xor_gen_neon
+.type xor_gen_neon, %function
+
+/* int xor_gen_neon(int vects, int len, void **src)
+ * src[vects-1] = src[0] ^ ... ^ src[vects-2] over len bytes; returns 0. */
+/* arguments (int args: AAPCS64 leaves bits 63:32 of x0/x1 unspecified) */
+w_vects		.req	w0	/* MUST >= 2; holds vects-2 after setup */
+x_vects		.req	x0	/* do not use unextended, see above */
+w_len		.req	w1
+x_len		.req	x1	/* do not use unextended, see above */
+x_src		.req	x2
+
+/* returns */
+w_ret		.req	w0
+
+/* local variables */
+w_in		.req	w1	/* share w_len (w_len is dead once loop1 runs) */
+x_src0		.req	x3	/* read cursor inside src[0] */
+x_src0_end	.req	x4	/* src[0] + len, biased by the block size inside a loop */
+w_len256	.req	w5	/* share w_len16, w_xor */
+x_len256	.req	x5
+w_len16		.req	w5
+x_len16		.req	x5
+w_xor		.req	w5
+w_col		.req	w6	/* byte offset done so far; only written as w_col ... */
+x_col		.req	x6	/* ... so x_col is always its zero-extension */
+x_src_ptr	.req	x7
+x_srcn		.req	x9
+x_dst		.req	x10
+x_dst_ptr	.req	x11
+/* v0  ~ v15: temporary results */
+/* v16 ~ v31: next 256 bytes */
+
+/*
+ *                 +----------+            +------------------+
+ *         src --> |  src[0]  | - src0 ->  |      buffer      | src0_end
+ *         --------+----------+            +------------------+
+ *           .     |  ......  |
+ *           .     +----------+            +------------------+
+ *     src_ptr ~~> |  src[n]  | - srcn ~>  |      buffer      |
+ *           .     +----------+            +------------------+
+ *           .     |  ......  |
+ *           .     +----------+
+ *           .     | src[v-2] |
+ *         --------+----------+            +------------------+
+ *     dst_ptr --> | src[v-1] | -- dst --> |      buffer      |
+ *                 +----------+            +------------------+
+ */
+
+xor_gen_neon:
+	add	x_dst_ptr, x_src, w_vects, uxtw #3	/* &src[vects]; widen the int explicitly */
+	ldr	x_dst, [x_dst_ptr, #-8]!		/* dst_ptr = &src[vects-1]; dst = src[vects-1] */
+	ldr	x_src0, [x_src]
+	add	x_src0_end, x_src0, w_len, uxtw		/* src0 + len; widen the int explicitly */
+
+	sub	w_vects, w_vects, #2			/* number of sources besides src[0] */
+	mov	w_col, #0
+
+.Lloop256_init:
+	/* len256 = len - len%256; len %= 256 */
+	mov	w_len256, w_len
+	and	w_len, w_len, #0xFF
+	sub	w_len256, w_len256, w_len
+
+	/* less than 256 bytes? */
+	cbz	w_len256, .Lloop16_init
+
+	/* save d8 ~ d15 to stack: v8 ~ v15 are overwritten by the 256-byte loop */
+	sub	sp, sp, #64
+	stp	d8, d9, [sp]
+	stp	d10, d11, [sp, #16]
+	stp	d12, d13, [sp, #32]
+	stp	d14, d15, [sp, #48]
+
+	sub	x_src0_end, x_src0_end, #256		/* bias: loop runs while src0 <= end - 256 */
+
+	/* batch process (vects-1)*256 bytes */
+.Lloop256:
+	ldr	q0,  [x_src0, #16*0]
+	ldr	q1,  [x_src0, #16*1]
+	ldr	q2,  [x_src0, #16*2]
+	ldr	q3,  [x_src0, #16*3]
+	ldr	q4,  [x_src0, #16*4]
+	ldr	q5,  [x_src0, #16*5]
+	ldr	q6,  [x_src0, #16*6]
+	ldr	q7,  [x_src0, #16*7]
+	ldr	q8,  [x_src0, #16*8]
+	ldr	q9,  [x_src0, #16*9]
+	ldr	q10, [x_src0, #16*10]
+	ldr	q11, [x_src0, #16*11]
+	ldr	q12, [x_src0, #16*12]
+	ldr	q13, [x_src0, #16*13]
+	ldr	q14, [x_src0, #16*14]
+	ldr	q15, [x_src0, #16*15]
+	add	x_src0, x_src0, #256
+
+	cbz	w_vects, .Lloop256_vects_end		/* vects == 2: plain copy of src[0] */
+
+	add	x_src_ptr, x_src, #8			/* start at &src[1] */
+.Lloop256_vects:
+	ldr	x_srcn, [x_src_ptr], #8
+	add	x_srcn, x_srcn, x_col
+	cmp	x_src_ptr, x_dst_ptr			/* flags are consumed by the bne below */
+
+	ldr	q16, [x_srcn, #16*0]
+	ldr	q17, [x_srcn, #16*1]
+	ldr	q18, [x_srcn, #16*2]
+	ldr	q19, [x_srcn, #16*3]
+	ldr	q20, [x_srcn, #16*4]
+	ldr	q21, [x_srcn, #16*5]
+	ldr	q22, [x_srcn, #16*6]
+	ldr	q23, [x_srcn, #16*7]
+	ldr	q24, [x_srcn, #16*8]
+	ldr	q25, [x_srcn, #16*9]
+	ldr	q26, [x_srcn, #16*10]
+	ldr	q27, [x_srcn, #16*11]
+	ldr	q28, [x_srcn, #16*12]
+	ldr	q29, [x_srcn, #16*13]
+	ldr	q30, [x_srcn, #16*14]
+	ldr	q31, [x_srcn, #16*15]
+
+	eor	v0.16b,  v0.16b,  v16.16b
+	eor	v1.16b,  v1.16b,  v17.16b
+	eor	v2.16b,  v2.16b,  v18.16b
+	eor	v3.16b,  v3.16b,  v19.16b
+	eor	v4.16b,  v4.16b,  v20.16b
+	eor	v5.16b,  v5.16b,  v21.16b
+	eor	v6.16b,  v6.16b,  v22.16b
+	eor	v7.16b,  v7.16b,  v23.16b
+	eor	v8.16b,  v8.16b,  v24.16b
+	eor	v9.16b,  v9.16b,  v25.16b
+	eor	v10.16b, v10.16b, v26.16b
+	eor	v11.16b, v11.16b, v27.16b
+	eor	v12.16b, v12.16b, v28.16b
+	eor	v13.16b, v13.16b, v29.16b
+	eor	v14.16b, v14.16b, v30.16b
+	eor	v15.16b, v15.16b, v31.16b
+
+	bne	.Lloop256_vects				/* until src_ptr reaches dst_ptr */
+
+.Lloop256_vects_end:
+	str	q0,  [x_dst, #16*0]
+	str	q1,  [x_dst, #16*1]
+	str	q2,  [x_dst, #16*2]
+	str	q3,  [x_dst, #16*3]
+	str	q4,  [x_dst, #16*4]
+	str	q5,  [x_dst, #16*5]
+	str	q6,  [x_dst, #16*6]
+	str	q7,  [x_dst, #16*7]
+	str	q8,  [x_dst, #16*8]
+	str	q9,  [x_dst, #16*9]
+	str	q10, [x_dst, #16*10]
+	str	q11, [x_dst, #16*11]
+	str	q12, [x_dst, #16*12]
+	str	q13, [x_dst, #16*13]
+	str	q14, [x_dst, #16*14]
+	str	q15, [x_dst, #16*15]
+
+	cmp	x_src0, x_src0_end
+	add	x_dst, x_dst, #256			/* add (no 's') keeps the cmp flags */
+	add	w_col, w_col, #256
+	bls	.Lloop256				/* another full 256-byte block left */
+
+.Lloop256_end:
+	/* restore d8 ~ d15 */
+	ldp	d8, d9, [sp]
+	ldp	d10, d11, [sp, #16]
+	ldp	d12, d13, [sp, #32]
+	ldp	d14, d15, [sp, #48]
+	add	sp, sp, #64
+
+	add	x_src0_end, x_src0_end, #256		/* undo the 256-byte bias */
+
+.Lloop16_init:
+	/* len16 = len - len%16; len %= 16 */
+	mov	w_len16, w_len
+	and	w_len, w_len, #0xF
+	sub	w_len16, w_len16, w_len
+
+	/* less than 16 bytes? */
+	cbz	w_len16, .Lloop1_init
+
+	sub	x_src0_end, x_src0_end, #16		/* bias: loop runs while src0 <= end - 16 */
+
+	/* batch process (vects-1)*16 bytes */
+.Lloop16:
+	ldr	q0, [x_src0], #16
+	cbz	w_vects, .Lloop16_vects_end
+
+	add	x_src_ptr, x_src, #8			/* start at &src[1] */
+.Lloop16_vects:
+	ldr	x_srcn, [x_src_ptr], #8
+	cmp	x_src_ptr, x_dst_ptr
+	ldr	q1, [x_srcn, x_col]
+	eor	v0.16b, v0.16b, v1.16b
+	bne	.Lloop16_vects
+
+.Lloop16_vects_end:
+	cmp	x_src0, x_src0_end
+	str	q0, [x_dst], #16
+	add	w_col, w_col, #16
+	bls	.Lloop16
+
+.Lloop16_end:
+	add	x_src0_end, x_src0_end, #16		/* undo the 16-byte bias */
+
+.Lloop1_init:
+	cbnz	w_len, .Lloop1				/* 0..15 tail bytes remain in w_len */
+	mov	w_ret, #0
+	ret
+
+	/* batch process (vects-1)*1 bytes */
+.Lloop1:
+	ldrb	w_xor, [x_src0], #1
+	cbz	w_vects, .Lloop1_vects_end
+
+	add	x_src_ptr, x_src, #8			/* start at &src[1] */
+.Lloop1_vects:
+	ldr	x_srcn, [x_src_ptr], #8
+	cmp	x_src_ptr, x_dst_ptr
+	ldrb	w_in, [x_srcn, x_col]			/* overwrites w_len; loop ends on src0 == end */
+	eor	w_xor, w_xor, w_in
+	bne	.Lloop1_vects
+
+.Lloop1_vects_end:
+	cmp	x_src0, x_src0_end
+	strb	w_xor, [x_dst], #1
+	add	w_col, w_col, #1
+	bne	.Lloop1
+
+.Lloop1_end:
+	mov	w_ret, #0
+	ret