From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 7 Apr 2024 20:45:59 +0200
Subject: Adding upstream version 16.2.11+ds.

Signed-off-by: Daniel Baumann
---
 src/isa-l/raid/aarch64/pq_check_neon.S | 341 +++++++++++++++++++++++++++++++++
 1 file changed, 341 insertions(+)
 create mode 100644 src/isa-l/raid/aarch64/pq_check_neon.S

diff --git a/src/isa-l/raid/aarch64/pq_check_neon.S b/src/isa-l/raid/aarch64/pq_check_neon.S
new file mode 100644
index 000000000..55ad79829
--- /dev/null
+++ b/src/isa-l/raid/aarch64/pq_check_neon.S
@@ -0,0 +1,341 @@
+########################################################################
+#  Copyright(c) 2019 Arm Corporation All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Arm Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#########################################################################
+
+.text
+
+.global pq_check_neon
+.type pq_check_neon, %function
+
+/* int pq_check_neon(int vects, int len, void **src) */
+
+/* arguments */
+w_vects        .req    w0    /* MUST >= 3 */
+x_vects        .req    x0
+w_len          .req    w1    /* MUST be 16x bytes */
+x_len          .req    x1
+x_src          .req    x2
+
+/* returns */
+w_ret          .req    w0
+
+/* local variables */
+x_dst_p        .req    x3
+x_dst_q        .req    x4
+x_dst_q_end    .req    x5
+w_col          .req    w6
+x_col          .req    x6
+x_src_ptr      .req    x7
+x_src_ptr_end  .req    x9
+x_src_last     .req    x10
+x_srcn         .req    x11
+w_min          .req    w12
+/* vectors */
+/* v0 ~ v7 : temporary p */
+/* v8 ~ v15: temporary q */
+/* v16 ~ v23: next 128 bytes */
+v_mask0        .req    v24
+v_mask1        .req    v25
+v_mask2        .req    v26
+v_mask3        .req    v27
+v_gf8poly      .req    v28
+v_0x80         .req    v29
+
+/*
+ * src_ptr_end -->
+ *                -------+----------+
+ *               .       | src[0]   |
+ *               .       +----------+            +------------------+
+ *     src_ptr -->       | src[1]   | -- srcn -> |      buffer      |
+ *               .       +----------+            +------------------+
+ *               .       |  ......  |
+ *               .       +----------+
+ *               .       | src[v-4] |
+ *                -------+----------+  src_last  +------------------+
+ *         src -->       | src[v-3] | ---------> |      buffer      |
+ *                       +----------+            +------------------+
+ *                       | src[v-2] | - dst_p -> |      buffer      |
+ *                       +----------+            +------------------+
+ *                       | src[v-1] | - dst_q -> |      buffer      | dst_q_end
+ *                       +----------+            +------------------+
+ */
+
+pq_check_neon:
+        sub     x_src_ptr_end, x_src, #8
+
+        sub     w_vects, w_vects, #3
+        add     x_src, x_src, x_vects, lsl #3
+
+        ldr     x_src_last, [x_src]
+        ldp     x_dst_p, x_dst_q, [x_src, #8]
+
+        add     x_dst_q_end, x_dst_q, x_len
+
+        mov     w_min, #-1
+        mov     w_col, #0
+        movi    v_gf8poly.16b, #0x1D
+        movi    v_0x80.16b, #0x80
+
+.Lloop128_init:
+        /* less than 128 bytes? */
+        cmp     w_len, #128
+        blo     .Lloop16_init
+
+        /* save d8 ~ d15 to stack */
+        sub     sp, sp, #64
+        stp     d8, d9, [sp]
+        stp     d10, d11, [sp, #16]
+        stp     d12, d13, [sp, #32]
+        stp     d14, d15, [sp, #48]
+
+        sub     x_dst_q_end, x_dst_q_end, #128
+
+        /* batch process (vects-2)*128 bytes */
+        /* v0~v7: p; v8~v15: q; v16~v23: in */
+.Lloop128:
+        ldr     q0, [x_src_last, #16*0]
+        ldr     q1, [x_src_last, #16*1]
+        ldr     q2, [x_src_last, #16*2]
+        ldr     q3, [x_src_last, #16*3]
+        ldr     q4, [x_src_last, #16*4]
+        ldr     q5, [x_src_last, #16*5]
+        ldr     q6, [x_src_last, #16*6]
+        ldr     q7, [x_src_last, #16*7]
+        add     x_src_last, x_src_last, #128
+
+        mov     v8.16b, v0.16b
+        mov     v9.16b, v1.16b
+        mov     v10.16b, v2.16b
+        mov     v11.16b, v3.16b
+        mov     v12.16b, v4.16b
+        mov     v13.16b, v5.16b
+        mov     v14.16b, v6.16b
+        mov     v15.16b, v7.16b
+
+        cbz     w_vects, .Lloop128_vects_end
+
+        sub     x_src_ptr, x_src, #8
+.Lloop128_vects:
+        ldr     x_srcn, [x_src_ptr], #-8
+        add     x_srcn, x_srcn, x_col
+        cmp     x_src_ptr, x_src_ptr_end
+
+        ldr     q16, [x_srcn, #16*0]
+        ldr     q17, [x_srcn, #16*1]
+        ldr     q18, [x_srcn, #16*2]
+        ldr     q19, [x_srcn, #16*3]
+        ldr     q20, [x_srcn, #16*4]
+        ldr     q21, [x_srcn, #16*5]
+        ldr     q22, [x_srcn, #16*6]
+        ldr     q23, [x_srcn, #16*7]
+
+        eor     v0.16b, v0.16b, v16.16b
+        eor     v1.16b, v1.16b, v17.16b
+        eor     v2.16b, v2.16b, v18.16b
+        eor     v3.16b, v3.16b, v19.16b
+        eor     v4.16b, v4.16b, v20.16b
+        eor     v5.16b, v5.16b, v21.16b
+        eor     v6.16b, v6.16b, v22.16b
+        eor     v7.16b, v7.16b, v23.16b
+
+        cmhs    v_mask0.16b, v8.16b, v_0x80.16b
+        cmhs    v_mask1.16b, v9.16b, v_0x80.16b
+        cmhs    v_mask2.16b, v10.16b, v_0x80.16b
+        cmhs    v_mask3.16b, v11.16b, v_0x80.16b
+        and     v_mask0.16b, v_mask0.16b, v_gf8poly.16b
+        and     v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+        and     v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+        and     v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+        shl     v8.16b, v8.16b, #1
+        shl     v9.16b, v9.16b, #1
+        shl     v10.16b, v10.16b, #1
+        shl     v11.16b, v11.16b, #1
+        eor     v8.16b, v8.16b, v_mask0.16b
+        eor     v9.16b, v9.16b, v_mask1.16b
+        eor     v10.16b, v10.16b, v_mask2.16b
+        eor     v11.16b, v11.16b, v_mask3.16b
+        eor     v8.16b, v8.16b, v16.16b
+        eor     v9.16b, v9.16b, v17.16b
+        eor     v10.16b, v10.16b, v18.16b
+        eor     v11.16b, v11.16b, v19.16b
+
+        cmhs    v_mask0.16b, v12.16b, v_0x80.16b
+        cmhs    v_mask1.16b, v13.16b, v_0x80.16b
+        cmhs    v_mask2.16b, v14.16b, v_0x80.16b
+        cmhs    v_mask3.16b, v15.16b, v_0x80.16b
+        and     v_mask0.16b, v_mask0.16b, v_gf8poly.16b
+        and     v_mask1.16b, v_mask1.16b, v_gf8poly.16b
+        and     v_mask2.16b, v_mask2.16b, v_gf8poly.16b
+        and     v_mask3.16b, v_mask3.16b, v_gf8poly.16b
+        shl     v12.16b, v12.16b, #1
+        shl     v13.16b, v13.16b, #1
+        shl     v14.16b, v14.16b, #1
+        shl     v15.16b, v15.16b, #1
+        eor     v12.16b, v12.16b, v_mask0.16b
+        eor     v13.16b, v13.16b, v_mask1.16b
+        eor     v14.16b, v14.16b, v_mask2.16b
+        eor     v15.16b, v15.16b, v_mask3.16b
+        eor     v12.16b, v12.16b, v20.16b
+        eor     v13.16b, v13.16b, v21.16b
+        eor     v14.16b, v14.16b, v22.16b
+        eor     v15.16b, v15.16b, v23.16b
+
+        bne     .Lloop128_vects
+
+.Lloop128_vects_end:
+        /* v16~v23: true p, q */
+        ldr     q16, [x_dst_p, #16*0]
+        ldr     q17, [x_dst_p, #16*1]
+        ldr     q18, [x_dst_p, #16*2]
+        ldr     q19, [x_dst_p, #16*3]
+        ldr     q20, [x_dst_p, #16*4]
+        ldr     q21, [x_dst_p, #16*5]
+        ldr     q22, [x_dst_p, #16*6]
+        ldr     q23, [x_dst_p, #16*7]
+
+        cmeq    v0.16b, v0.16b, v16.16b
+        cmeq    v1.16b, v1.16b, v17.16b
+        cmeq    v2.16b, v2.16b, v18.16b
+        cmeq    v3.16b, v3.16b, v19.16b
+        cmeq    v4.16b, v4.16b, v20.16b
+        cmeq    v5.16b, v5.16b, v21.16b
+        cmeq    v6.16b, v6.16b, v22.16b
+        cmeq    v7.16b, v7.16b, v23.16b
+
+        ldr     q16, [x_dst_q, #16*0]
+        ldr     q17, [x_dst_q, #16*1]
+        ldr     q18, [x_dst_q, #16*2]
+        ldr     q19, [x_dst_q, #16*3]
+        ldr     q20, [x_dst_q, #16*4]
+        ldr     q21, [x_dst_q, #16*5]
+        ldr     q22, [x_dst_q, #16*6]
+        ldr     q23, [x_dst_q, #16*7]
+
+        and     v0.16b, v0.16b, v1.16b
+        and     v2.16b, v2.16b, v3.16b
+        and     v4.16b, v4.16b, v5.16b
+        and     v6.16b, v6.16b, v7.16b
+        and     v0.16b, v0.16b, v2.16b
+        and     v4.16b, v4.16b, v6.16b
+        and     v0.16b, v0.16b, v4.16b
+
+        cmeq    v8.16b, v8.16b, v16.16b
+        cmeq    v9.16b, v9.16b, v17.16b
+        cmeq    v10.16b, v10.16b, v18.16b
+        cmeq    v11.16b, v11.16b, v19.16b
+        cmeq    v12.16b, v12.16b, v20.16b
+        cmeq    v13.16b, v13.16b, v21.16b
+        cmeq    v14.16b, v14.16b, v22.16b
+        cmeq    v15.16b, v15.16b, v23.16b
+
+        and     v8.16b, v8.16b, v9.16b
+        and     v10.16b, v10.16b, v11.16b
+        and     v12.16b, v12.16b, v13.16b
+        and     v14.16b, v14.16b, v15.16b
+        and     v8.16b, v8.16b, v10.16b
+        and     v12.16b, v12.16b, v14.16b
+        and     v8.16b, v8.16b, v12.16b
+
+        and     v0.16b, v0.16b, v8.16b
+
+        uminv   b0, v0.16b
+        umov    w_min, v0.b[0]
+        cbz     w_min, .Lloop128_end
+
+        add     x_dst_p, x_dst_p, #128
+        add     x_dst_q, x_dst_q, #128
+        cmp     x_dst_q, x_dst_q_end
+        add     w_col, w_col, #128
+        bls     .Lloop128
+
+.Lloop128_end:
+        /* restore d8 ~ d15 */
+        ldp     d8, d9, [sp]
+        ldp     d10, d11, [sp, #16]
+        ldp     d12, d13, [sp, #32]
+        ldp     d14, d15, [sp, #48]
+        add     sp, sp, #64
+
+        cbz     w_min, .Lerror
+
+        add     x_dst_q_end, x_dst_q_end, #128
+
+.Lloop16_init:
+        tst     w_len, #0x7F
+        beq     .Lloop16_end
+        sub     x_dst_q_end, x_dst_q_end, #16
+
+        /* batch process (vects-2)*16 bytes */
+        /* v0: p; v1: q; v2: in; v3: mask */
+.Lloop16:
+        ldr     q0, [x_src_last], #16
+        mov     v1.16b, v0.16b
+
+        cbz     w_vects, .Lloop16_vects_end
+
+        sub     x_src_ptr, x_src, #8
+.Lloop16_vects:
+        ldr     x_srcn, [x_src_ptr], #-8
+        ldr     q2, [x_srcn, x_col]
+        cmp     x_src_ptr, x_src_ptr_end
+
+        eor     v0.16b, v0.16b, v2.16b
+
+        cmhs    v3.16b, v1.16b, v_0x80.16b
+        and     v3.16b, v3.16b, v_gf8poly.16b
+
+        shl     v1.16b, v1.16b, #1
+        eor     v1.16b, v1.16b, v2.16b
+        eor     v1.16b, v1.16b, v3.16b
+
+        bne     .Lloop16_vects
+
+.Lloop16_vects_end:
+        /* v4: true p; v5: true q */
+        ldr     q4, [x_dst_p], #16
+        ldr     q5, [x_dst_q], #16
+        cmp     x_dst_q, x_dst_q_end
+
+        cmeq    v0.16b, v0.16b, v4.16b
+        cmeq    v1.16b, v1.16b, v5.16b
+        and     v0.16b, v0.16b, v1.16b
+
+        uminv   b0, v0.16b
+        umov    w_min, v0.b[0]
+        cbz     w_min, .Lerror
+
+        add     w_col, w_col, #16
+        bls     .Lloop16
+
+.Lloop16_end:
+        mov     w_ret, #0
+        ret
+
+.Lerror:
+        mov     w_ret, #1
+        ret
-- 
cgit v1.2.3
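Reading aid (not part of the patch or of ISA-L): the scalar C sketch below spells out what pq_check_neon verifies, inferred from the register comments and instruction flow above. It assumes the layout the diagram describes, with src[0] .. src[vects-3] as data buffers, src[vects-2] as the expected P (XOR) parity and src[vects-1] as the expected Q parity over GF(2^8); pq_check_scalar and gf_mul2 are hypothetical helper names, not ISA-L API. The cmhs/and/shl/eor sequence in the assembly is a branch-free, 16-bytes-per-register form of the gf_mul2 step below, using the same 0x1D reduction constant loaded into v_gf8poly; as in the assembly, the return value is 0 when both parities match and 1 on the first mismatch.

#include <stdint.h>

/* GF(2^8) multiply by 2, polynomial 0x11D: shift left, then fold the
 * carried-out top bit back in as 0x1D (what cmhs+and+shl+eor do per byte). */
static uint8_t gf_mul2(uint8_t a)
{
        return (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1D : 0));
}

/* Hypothetical scalar equivalent of pq_check_neon (sketch, not ISA-L code). */
static int pq_check_scalar(int vects, int len, void **src)
{
        const uint8_t *p_exp = src[vects - 2];  /* expected P parity */
        const uint8_t *q_exp = src[vects - 1];  /* expected Q parity */

        for (int i = 0; i < len; i++) {
                /* Start from the last data vector and fold the others in
                 * (Horner's rule), matching the assembly's inner loop that
                 * walks src_ptr backwards from src[vects-4] to src[0]. */
                uint8_t p = ((const uint8_t *)src[vects - 3])[i];
                uint8_t q = p;

                for (int j = vects - 4; j >= 0; j--) {
                        uint8_t d = ((const uint8_t *)src[j])[i];
                        p ^= d;              /* P: plain XOR parity        */
                        q = gf_mul2(q) ^ d;  /* Q: multiply by 2, then XOR */
                }
                if (p != p_exp[i] || q != q_exp[i])
                        return 1;            /* mismatch, like .Lerror     */
        }
        return 0;                            /* both parities verified     */
}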