Diffstat (limited to 'third_party/dav1d/src/arm/64/refmvs.S')
-rw-r--r--  third_party/dav1d/src/arm/64/refmvs.S | 292
1 file changed, 292 insertions(+), 0 deletions(-)
diff --git a/third_party/dav1d/src/arm/64/refmvs.S b/third_party/dav1d/src/arm/64/refmvs.S
new file mode 100644
index 0000000000..e905682f47
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/refmvs.S
@@ -0,0 +1,292 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
+// int bx4, int bw4, int bh4)
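+//
+// Fills bw4 consecutive 12-byte refmvs_block entries, starting at column bx4,
+// in each of the bh4 row pointers loaded from rr, with the contents of *rmv
+// (the 12-byte entry size is implied by the bx4*12 offset and the store
+// widths below). v0-v2 are arranged so the pattern repeats seamlessly across
+// a 48-byte store; a jump table indexed by clz(bw4) selects the store routine
+// for the requested width.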
+
+function splat_mv_neon, export=1
+ ld1 {v3.16b}, [x1]
+ clz w3, w3
+ adr x5, L(splat_tbl)
+ sub w3, w3, #26
+ ext v2.16b, v3.16b, v3.16b, #12
+ ldrh w3, [x5, w3, uxtw #1]
+ add w2, w2, w2, lsl #1
+ ext v0.16b, v2.16b, v3.16b, #4
+ sub x3, x5, w3, uxtw
+ ext v1.16b, v2.16b, v3.16b, #8
+ lsl w2, w2, #2
+ ext v2.16b, v2.16b, v3.16b, #12
+1:
+ ldr x1, [x0], #8
+ subs w4, w4, #1
+ add x1, x1, x2
+ br x3
+
+10:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8b}, [x1]
+ str s2, [x1, #8]
+ b.gt 1b
+ ret
+20:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b}, [x1]
+ str d1, [x1, #16]
+ b.gt 1b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+160:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+80:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+40:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1]
+ b.gt 1b
+ ret
+
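+// One entry per width bw4 = 32, 16, 8, 4, 2, 1, indexed by clz(bw4) - 26.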
+L(splat_tbl):
+ .hword L(splat_tbl) - 320b
+ .hword L(splat_tbl) - 160b
+ .hword L(splat_tbl) - 80b
+ .hword L(splat_tbl) - 40b
+ .hword L(splat_tbl) - 20b
+ .hword L(splat_tbl) - 10b
+endfunc
+
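+// tbl permutation patterns that pack one of a candidate's two mv/ref pairs
+// into the 5-byte refmvs_temporal_block layout (4-byte mv, 1-byte ref),
+// repeated across the vector. Row 0 (all 255) yields zeros when neither mv
+// qualifies, row 1 selects mv/ref 0, and rows 2 and 3 select mv/ref 1, so
+// the second mv wins when both qualify.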
+const mv_tbls, align=4
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+ .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
+ .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+ .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+endconst
+
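+// Multipliers {1, 2} applied to ref_sign[ref] so that the two per-mv
+// conditions combine (via addp below) into a case index 0-3 for mv_tbls.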
+const mask_mult, align=4
+ .byte 1, 2, 1, 2, 0, 0, 0, 0
+endconst
+
+// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
+// refmvs_block **rr, const uint8_t *ref_sign,
+// int col_end8, int row_end8,
+// int col_start8, int row_start8)
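+//
+// For each 8x8 unit in rows [row_start8, row_end8) and columns
+// [col_start8, col_end8), condenses the frame's grid of 12-byte refmvs_block
+// entries (rr) into the 5-byte refmvs_temporal_block array rp: a candidate mv
+// is kept if its reference maps to a nonzero ref_sign entry and both of its
+// components have a magnitude below 4096, preferring the second mv when both
+// qualify and writing a zero entry otherwise. Up to two candidate blocks are
+// processed per iteration via computed calls into the store routines below.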
+function save_tmvs_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ movi v30.8b, #0
+ ld1 {v31.8b}, [x3]
+ adr x8, L(save_tmvs_tbl)
+ movrel x16, mask_mult
+ movrel x13, mv_tbls
+ ld1 {v29.8b}, [x16]
+ ext v31.8b, v30.8b, v31.8b, #7 // [0, ref_sign]
+ mov w15, #5
+ mov w14, #12*2
+ sxtw x4, w4
+ sxtw x6, w6
+ mul w1, w1, w15 // stride *= 5
+ sub w5, w5, w7 // h = row_end8 - row_start8
+ lsl w7, w7, #1 // row_start8 <<= 1
+1:
+ mov w15, #5
+ and w9, w7, #30 // (y & 15) * 2
+ ldr x9, [x2, w9, uxtw #3] // b = rr[(y & 15) * 2]
+ add x9, x9, #12 // &b[... + 1]
+ madd x10, x4, x14, x9 // end_cand_b = &b[col_end8*2 + 1]
+ madd x9, x6, x14, x9 // cand_b = &b[x*2 + 1]
+
+ madd x3, x6, x15, x0 // &rp[x]
+
+2:
+ ldrb w11, [x9, #10] // cand_b->bs
+ ld1 {v0.16b}, [x9] // cand_b->mv
+ add x11, x8, w11, uxtw #2
+ ldr h1, [x9, #8] // cand_b->ref
+ ldrh w12, [x11] // bw8
+ mov x15, x8
+ add x9, x9, w12, uxtw #1 // cand_b += bw8*2
+ cmp x9, x10
+ mov v2.8b, v0.8b
+ b.ge 3f
+
+ ldrb w15, [x9, #10] // cand_b->bs
+ add x16, x9, #8
+ ld1 {v4.16b}, [x9] // cand_b->mv
+ add x15, x8, w15, uxtw #2
+ ld1 {v1.h}[1], [x16] // cand_b->ref
+ ldrh w12, [x15] // bw8
+ add x9, x9, w12, uxtw #1 // cand_b += bw8*2
+ trn1 v2.2d, v0.2d, v4.2d
+
+3:
+ abs v2.8h, v2.8h // abs(mv[].xy)
+ tbl v1.8b, {v31.16b}, v1.8b // ref_sign[ref]
+ ushr v2.8h, v2.8h, #12 // abs(mv[].xy) >> 12
+ umull v1.8h, v1.8b, v29.8b // ref_sign[ref] * {1, 2}
+ cmeq v2.4s, v2.4s, #0 // abs(mv[].xy) < 4096
+ xtn v2.4h, v2.4s // abs() condition to 16 bit
+ and v1.8b, v1.8b, v2.8b // h[0-3] contain conditions for mv[0-1]
+ addp v1.4h, v1.4h, v1.4h // Combine condition for [1] and [0]
+ umov w16, v1.h[0] // Extract case for first block
+ umov w17, v1.h[1]
+ ldrh w11, [x11, #2] // Fetch jump table entry
+ ldrh w15, [x15, #2]
+ ldr q1, [x13, w16, uxtw #4] // Load permutation table based on case
+ ldr q5, [x13, w17, uxtw #4]
+ sub x11, x8, w11, uxtw // Find jump table target
+ sub x15, x8, w15, uxtw
+ tbl v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
+ tbl v4.16b, {v4.16b}, v5.16b
+
+ // v1 follows on v0, with another 3 full repetitions of the pattern.
+ ext v1.16b, v0.16b, v0.16b, #1
+ ext v5.16b, v4.16b, v4.16b, #1
+ // v2 ends with 3 complete repetitions of the pattern.
+ ext v2.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v4.16b, v5.16b, #4
+
+ blr x11
+ b.ge 4f // if (cand_b >= end)
+ mov v0.16b, v4.16b
+ mov v1.16b, v5.16b
+ mov v2.16b, v6.16b
+ cmp x9, x10
+ blr x15
+ b.lt 2b // if (cand_b < end)
+
+4:
+ subs w5, w5, #1 // h--
+ add w7, w7, #2 // y += 2
+ add x0, x0, x1 // rp += stride
+ b.gt 1b
+
+ ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+10:
+ AARCH64_VALID_CALL_TARGET
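+ // Write a single 5-byte entry: the 4-byte mv followed by the ref byte.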
+ add x16, x3, #4
+ st1 {v0.s}[0], [x3]
+ st1 {v0.b}[4], [x16]
+ add x3, x3, #5
+ ret
+20:
+ AARCH64_VALID_CALL_TARGET
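+ // Write two 5-byte entries: an 8-byte store plus the trailing 2 bytes.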
+ add x16, x3, #8
+ st1 {v0.d}[0], [x3]
+ st1 {v0.h}[4], [x16]
+ add x3, x3, #2*5
+ ret
+40:
+ AARCH64_VALID_CALL_TARGET
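+ // Write four 5-byte entries: a 16-byte store plus the trailing 4 bytes.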
+ st1 {v0.16b}, [x3]
+ str s1, [x3, #16]
+ add x3, x3, #4*5
+ ret
+80:
+ AARCH64_VALID_CALL_TARGET
+ // This writes 6 full entries plus 2 extra bytes
+ st1 {v0.16b, v1.16b}, [x3]
+ // Write the last few, overlapping with the first write.
+ stur q2, [x3, #(8*5-16)]
+ add x3, x3, #8*5
+ ret
+160:
+ AARCH64_VALID_CALL_TARGET
+ add x16, x3, #6*5
+ add x17, x3, #12*5
+ // This writes 6 full entries plus 2 extra bytes
+ st1 {v0.16b, v1.16b}, [x3]
+ // Write another 6 full entries, slightly overlapping with the first set
+ st1 {v0.16b, v1.16b}, [x16]
+ // Write 8 bytes (one full entry) after the first 12
+ st1 {v0.8b}, [x17]
+ // Write the last 3 entries
+ str q2, [x3, #(16*5-16)]
+ add x3, x3, #16*5
+ ret
+
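+// One pair of entries per block size, ordered from the widest to the
+// narrowest: the first .hword is the block width in 8-pel units times 12
+// (doubled when added, stepping cand_b over bw4 12-byte refmvs_block
+// entries), the second is the offset from the table back to the store
+// routine for that width.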
+L(save_tmvs_tbl):
+ .hword 16 * 12
+ .hword L(save_tmvs_tbl) - 160b
+ .hword 16 * 12
+ .hword L(save_tmvs_tbl) - 160b
+ .hword 8 * 12
+ .hword L(save_tmvs_tbl) - 80b
+ .hword 8 * 12
+ .hword L(save_tmvs_tbl) - 80b
+ .hword 8 * 12
+ .hword L(save_tmvs_tbl) - 80b
+ .hword 8 * 12
+ .hword L(save_tmvs_tbl) - 80b
+ .hword 4 * 12
+ .hword L(save_tmvs_tbl) - 40b
+ .hword 4 * 12
+ .hword L(save_tmvs_tbl) - 40b
+ .hword 4 * 12
+ .hword L(save_tmvs_tbl) - 40b
+ .hword 4 * 12
+ .hword L(save_tmvs_tbl) - 40b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+endfunc