Diffstat (limited to 'third_party/dav1d/src/arm/64/refmvs.S')
-rw-r--r--   third_party/dav1d/src/arm/64/refmvs.S   292
1 file changed, 292 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/64/refmvs.S b/third_party/dav1d/src/arm/64/refmvs.S
new file mode 100644
index 0000000000..e905682f47
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/refmvs.S
@@ -0,0 +1,292 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
+//                          int bx4, int bw4, int bh4)
+
+function splat_mv_neon, export=1
+        ld1             {v3.16b}, [x1]
+        clz             w3,  w3
+        adr             x5,  L(splat_tbl)
+        sub             w3,  w3,  #26
+        ext             v2.16b, v3.16b, v3.16b, #12
+        ldrh            w3,  [x5, w3, uxtw #1]
+        add             w2,  w2,  w2,  lsl #1
+        ext             v0.16b, v2.16b, v3.16b, #4
+        sub             x3,  x5,  w3,  uxtw
+        ext             v1.16b, v2.16b, v3.16b, #8
+        lsl             w2,  w2,  #2
+        ext             v2.16b, v2.16b, v3.16b, #12
+1:
+        ldr             x1,  [x0], #8
+        subs            w4,  w4,  #1
+        add             x1,  x1,  x2
+        br              x3
+
+10:
+        AARCH64_VALID_JUMP_TARGET
+        st1             {v0.8b}, [x1]
+        str             s2,  [x1, #8]
+        b.gt            1b
+        ret
+20:
+        AARCH64_VALID_JUMP_TARGET
+        st1             {v0.16b}, [x1]
+        str             d1,  [x1, #16]
+        b.gt            1b
+        ret
+320:
+        AARCH64_VALID_JUMP_TARGET
+        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
+        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
+        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
+        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
+160:
+        AARCH64_VALID_JUMP_TARGET
+        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
+        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
+80:
+        AARCH64_VALID_JUMP_TARGET
+        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
+40:
+        AARCH64_VALID_JUMP_TARGET
+        st1             {v0.16b, v1.16b, v2.16b}, [x1]
+        b.gt            1b
+        ret
+
+L(splat_tbl):
+        .hword L(splat_tbl) - 320b
+        .hword L(splat_tbl) - 160b
+        .hword L(splat_tbl) -  80b
+        .hword L(splat_tbl) -  40b
+        .hword L(splat_tbl) -  20b
+        .hword L(splat_tbl) -  10b
+endfunc
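splat_mv_neon fills a rectangle of 12-byte refmvs_block entries with a single value: bx4, bw4 and bh4 are in 4-pixel units, clz(bw4) - 26 selects one of the six store paths through L(splat_tbl), and the three ext instructions expand the 12-byte pattern into 48 bytes across v0-v2 (the least common multiple of the 12-byte entry and the 16-byte registers). As a rough guide to the semantics, here is a scalar C sketch; the refmvs_block layout is an assumption modeled on dav1d's refmvs.h, not verbatim library code.

#include <stdint.h>

/* Assumed 12-byte block layout (sketch only): two mvs, two refs, bs, mf. */
typedef struct refmvs_block {
    int16_t mv[2][2];   /* mv[i] = { y, x } */
    int8_t  ref[2];
    uint8_t bs, mf;
} refmvs_block;

/* Scalar equivalent of splat_mv_neon: copy *rmv into bw4 consecutive
 * slots on each of bh4 rows, starting bx4 blocks into each row. */
static void splat_mv_sketch(refmvs_block **rr, const refmvs_block *rmv,
                            const int bx4, const int bw4, int bh4)
{
    do {
        refmvs_block *r = *rr++ + bx4;
        for (int x = 0; x < bw4; x++)
            r[x] = *rmv;            /* one 12-byte store per 4x4 block */
    } while (--bh4);
}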
+
+const mv_tbls, align=4
+        .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+        .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
+        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+endconst
+
+const mask_mult, align=4
+        .byte           1, 2, 1, 2, 0, 0, 0, 0
+endconst
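These two constants drive the per-candidate case selection in save_tmvs_neon below. For each of the two mv slots a validity flag is computed (the ref byte indexes the prepended [0, ref_sign] vector, and both |mv.x| and |mv.y| must be under 4096); mask_mult's {1, 2} weights combine the two flags into a case index 0-3, which picks a 16-byte row of mv_tbls as a TBL permutation that gathers mv and ref into the packed 5-byte output pattern. A scalar sketch of that selection, relying on TBL's out-of-range-yields-zero behavior (the names below are illustrative):

#include <stdint.h>

/* Rows of mv_tbls: byte indices into the 16 bytes loaded from the
 * candidate block (mv[0] at bytes 0-3, mv[1] at 4-7, ref[0]/ref[1] at
 * bytes 8/9); 255 is out of range, which TBL maps to 0. */
static const uint8_t mv_tbls_c[4][16] = {
    { 255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255 },            /* neither valid */
    { 0, 1, 2, 3, 8,  0, 1, 2, 3, 8,  0, 1, 2, 3, 8,  0 }, /* mv[0]/ref[0]  */
    { 4, 5, 6, 7, 9,  4, 5, 6, 7, 9,  4, 5, 6, 7, 9,  4 }, /* mv[1]/ref[1]  */
    { 4, 5, 6, 7, 9,  4, 5, 6, 7, 9,  4, 5, 6, 7, 9,  4 }, /* both: mv[1] wins */
};

/* Emulate case selection + TBL: out receives the repeated 5-byte
 * {mv, ref} pattern selected by the two validity flags. */
static void select_and_permute(uint8_t out[16], const uint8_t cand[16],
                               const int valid0, const int valid1)
{
    const int c = 1 * !!valid0 + 2 * !!valid1;   /* mask_mult = {1, 2} */
    for (int i = 0; i < 16; i++)
        out[i] = mv_tbls_c[c][i] < 16 ? cand[mv_tbls_c[c][i]] : 0;
}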
+
+// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
+//                           refmvs_block **rr, const uint8_t *ref_sign,
+//                           int col_end8, int row_end8,
+//                           int col_start8, int row_start8)
+function save_tmvs_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        stp             x29, x30, [sp, #-16]!
+        mov             x29, sp
+
+        movi            v30.8b,  #0
+        ld1             {v31.8b}, [x3]
+        adr             x8,  L(save_tmvs_tbl)
+        movrel          x16, mask_mult
+        movrel          x13, mv_tbls
+        ld1             {v29.8b}, [x16]
+        ext             v31.8b,  v30.8b,  v31.8b,  #7 // [0, ref_sign]
+        mov             w15, #5
+        mov             w14, #12*2
+        sxtw            x4,  w4
+        sxtw            x6,  w6
+        mul             w1,  w1,  w15      // stride *= 5
+        sub             w5,  w5,  w7       // h = row_end8 - row_start8
+        lsl             w7,  w7,  #1       // row_start8 <<= 1
+1:
+        mov             w15, #5
+        and             w9,  w7,  #30      // (y & 15) * 2
+        ldr             x9,  [x2, w9, uxtw #3] // b = rr[(y & 15) * 2]
+        add             x9,  x9,  #12      // &b[... + 1]
+        madd            x10, x4,  x14, x9  // end_cand_b = &b[col_end8*2 + 1]
+        madd            x9,  x6,  x14, x9  // cand_b = &b[x*2 + 1]
+
+        madd            x3,  x6,  x15, x0  // &rp[x]
+
+2:
+        ldrb            w11, [x9, #10]     // cand_b->bs
+        ld1             {v0.16b}, [x9]     // cand_b->mv
+        add             x11, x8,  w11, uxtw #2
+        ldr             h1,  [x9, #8]      // cand_b->ref
+        ldrh            w12, [x11]         // bw8
+        mov             x15, x8
+        add             x9,  x9,  w12, uxtw #1 // cand_b += bw8*2
+        cmp             x9,  x10
+        mov             v2.8b,  v0.8b
+        b.ge            3f
+
+        ldrb            w15, [x9, #10]     // cand_b->bs
+        add             x16, x9,  #8
+        ld1             {v4.16b}, [x9]     // cand_b->mv
+        add             x15, x8,  w15, uxtw #2
+        ld1             {v1.h}[1], [x16]   // cand_b->ref
+        ldrh            w12, [x15]         // bw8
+        add             x9,  x9,  w12, uxtw #1 // cand_b += bw8*2
+        trn1            v2.2d,  v0.2d,  v4.2d
+
+3:
+        abs             v2.8h,  v2.8h      // abs(mv[].xy)
+        tbl             v1.8b, {v31.16b}, v1.8b // ref_sign[ref]
+        ushr            v2.8h,  v2.8h,  #12 // abs(mv[].xy) >> 12
+        umull           v1.8h,  v1.8b,  v29.8b // ref_sign[ref] * {1, 2}
+        cmeq            v2.4s,  v2.4s,  #0 // abs(mv[].xy) < 4096
+        xtn             v2.4h,  v2.4s      // abs() condition to 16 bit
+        and             v1.8b,  v1.8b,  v2.8b // h[0-3] contains conditions for mv[0-1]
+        addp            v1.4h,  v1.4h,  v1.4h // Combine conditions for [1] and [0]
+        umov            w16, v1.h[0]       // Extract case for first block
+        umov            w17, v1.h[1]
+        ldrh            w11, [x11, #2]     // Fetch jump table entry
+        ldrh            w15, [x15, #2]
+        ldr             q1,  [x13, w16, uxtw #4] // Load permutation table based on the case
+        ldr             q5,  [x13, w17, uxtw #4]
+        sub             x11, x8,  w11, uxtw // Find jump table target
+        sub             x15, x8,  w15, uxtw
+        tbl             v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
+        tbl             v4.16b, {v4.16b}, v5.16b
+
+        // v1 follows on v0, with another 3 full repetitions of the pattern.
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v5.16b, v4.16b, v4.16b, #1
+        // v2 ends with 3 complete repetitions of the pattern.
+        ext             v2.16b, v0.16b, v1.16b, #4
+        ext             v6.16b, v4.16b, v5.16b, #4
+
+        blr             x11
+        b.ge            4f                 // if (cand_b >= end)
+        mov             v0.16b, v4.16b
+        mov             v1.16b, v5.16b
+        mov             v2.16b, v6.16b
+        cmp             x9,  x10
+        blr             x15
+        b.lt            2b                 // if (cand_b < end)
+
+4:
+        subs            w5,  w5,  #1       // h--
+        add             w7,  w7,  #2       // y += 2
+        add             x0,  x0,  x1       // rp += stride
+        b.gt            1b
+
+        ldp             x29, x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+10:
+        AARCH64_VALID_CALL_TARGET
+        add             x16, x3,  #4
+        st1             {v0.s}[0], [x3]
+        st1             {v0.b}[4], [x16]
+        add             x3,  x3,  #5
+        ret
+20:
+        AARCH64_VALID_CALL_TARGET
+        add             x16, x3,  #8
+        st1             {v0.d}[0], [x3]
+        st1             {v0.h}[4], [x16]
+        add             x3,  x3,  #2*5
+        ret
+40:
+        AARCH64_VALID_CALL_TARGET
+        st1             {v0.16b}, [x3]
+        str             s1,  [x3, #16]
+        add             x3,  x3,  #4*5
+        ret
+80:
+        AARCH64_VALID_CALL_TARGET
+        // This writes 6 full entries plus 2 extra bytes
+        st1             {v0.16b, v1.16b}, [x3]
+        // Write the last few, overlapping with the first write.
+        stur            q2,  [x3, #(8*5-16)]
+        add             x3,  x3,  #8*5
+        ret
+160:
+        AARCH64_VALID_CALL_TARGET
+        add             x16, x3,  #6*5
+        add             x17, x3,  #12*5
+        // This writes 6 full entries plus 2 extra bytes
+        st1             {v0.16b, v1.16b}, [x3]
+        // Write another 6 full entries, slightly overlapping with the first set
+        st1             {v0.16b, v1.16b}, [x16]
+        // Write 8 bytes (one full entry) after the first 12 entries
+        st1             {v0.8b}, [x17]
+        // Write the last 3 entries
+        str             q2,  [x3, #(16*5-16)]
+        add             x3,  x3,  #16*5
+        ret
+
+L(save_tmvs_tbl):
+        .hword          16 * 12
+        .hword          L(save_tmvs_tbl) - 160b
+        .hword          16 * 12
+        .hword          L(save_tmvs_tbl) - 160b
+        .hword          8 * 12
+        .hword          L(save_tmvs_tbl) -  80b
+        .hword          8 * 12
+        .hword          L(save_tmvs_tbl) -  80b
+        .hword          8 * 12
+        .hword          L(save_tmvs_tbl) -  80b
+        .hword          8 * 12
+        .hword          L(save_tmvs_tbl) -  80b
+        .hword          4 * 12
+        .hword          L(save_tmvs_tbl) -  40b
+        .hword          4 * 12
+        .hword          L(save_tmvs_tbl) -  40b
+        .hword          4 * 12
+        .hword          L(save_tmvs_tbl) -  40b
+        .hword          4 * 12
+        .hword          L(save_tmvs_tbl) -  40b
+        .hword          2 * 12
+        .hword          L(save_tmvs_tbl) -  20b
+        .hword          2 * 12
+        .hword          L(save_tmvs_tbl) -  20b
+        .hword          2 * 12
+        .hword          L(save_tmvs_tbl) -  20b
+        .hword          2 * 12
+        .hword          L(save_tmvs_tbl) -  20b
+        .hword          2 * 12
+        .hword          L(save_tmvs_tbl) -  20b
+        .hword          1 * 12
+        .hword          L(save_tmvs_tbl) -  10b
+        .hword          1 * 12
+        .hword          L(save_tmvs_tbl) -  10b
+        .hword          1 * 12
+        .hword          L(save_tmvs_tbl) -  10b
+        .hword          1 * 12
+        .hword          L(save_tmvs_tbl) -  10b
+        .hword          1 * 12
+        .hword          L(save_tmvs_tbl) -  10b
+        .hword          1 * 12
+        .hword          L(save_tmvs_tbl) -  10b
+        .hword          1 * 12
+        .hword          L(save_tmvs_tbl) -  10b
+endfunc
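For orientation, the scalar loop that save_tmvs_neon vectorizes looks roughly like the C below, modeled on dav1d's C reference in src/refmvs.c. The packed 5-byte refmvs_temporal_block is implied by the asm's stride *= 5, and the bw8 widths are read off the first column of L(save_tmvs_tbl) above (which stores bw8 * 12); everything else is an illustrative assumption, not verbatim library code.

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct { int16_t y, x; } mv_t;
#pragma pack(push, 1)   /* assumed: 5-byte entries, as stride *= 5 implies */
typedef struct { mv_t mv; int8_t ref; } refmvs_temporal_block;
#pragma pack(pop)
typedef struct { mv_t mv[2]; int8_t ref[2]; uint8_t bs, mf; } refmvs_block;

/* Block width in 8-px units per block size, read off L(save_tmvs_tbl). */
static const uint8_t bw8_tbl[22] = { 16, 16, 8, 8, 8, 8, 4, 4, 4, 4,
                                     2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1 };

static void save_tmvs_sketch(refmvs_temporal_block *rp, const ptrdiff_t stride,
                             refmvs_block *const *const rr,
                             const uint8_t *const ref_sign,
                             const int col_end8, const int row_end8,
                             const int col_start8, const int row_start8)
{
    for (int y = row_start8; y < row_end8; y++) {
        const refmvs_block *const b = rr[(y & 15) * 2];
        for (int x = col_start8; x < col_end8;) {
            const refmvs_block *const cand_b = &b[x * 2 + 1];
            const int bw8 = bw8_tbl[cand_b->bs];
            refmvs_temporal_block out = { { 0, 0 }, 0 };  /* case 0: invalid */
            /* Check mv[1] first: when both slots qualify, mv[1] wins
             * (mv_tbls rows 2 and 3 are identical). */
            for (int i = 1; i >= 0; i--) {
                const int ref = cand_b->ref[i];
                if (ref > 0 && ref_sign[ref - 1] &&
                    (abs(cand_b->mv[i].x) | abs(cand_b->mv[i].y)) < 4096)
                {
                    out.mv  = cand_b->mv[i];
                    out.ref = (int8_t) ref;
                    break;
                }
            }
            for (int n = 0; n < bw8; n++, x++)  /* splat across the block */
                rp[x] = out;
        }
        rp += stride;   /* stride counted in 5-byte entries here */
    }
}

Unlike this sketch, the NEON version handles two candidates per iteration and lets the per-width tail handlers (10: through 160:) store the repeated 5-byte pattern with overlapping vector writes instead of a scalar splat loop.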