Diffstat (limited to 'third_party/dav1d/src/arm/64/looprestoration16.S')
-rw-r--r--   third_party/dav1d/src/arm/64/looprestoration16.S   1419
1 file changed, 1419 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S
new file mode 100644
index 0000000000..8954e604cf
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration16.S
@@ -0,0 +1,1419 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
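+
+// right_ext_mask is preceded by 48 bytes of zeros; the horizontal filters
+// load a mask from (right_ext_mask - number_of_valid_bytes), which gives
+// 0x00 over the lanes that still hold valid input and 0xff over the lanes
+// past the right edge, so a "bit" per register replaces only the
+// out-of-range lanes with the replicated padding pixel.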
+
+// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges,
+// const int bitdepth_max);
+function wiener_filter7_16bpc_neon, export=1
+ ldr w8, [sp]
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-32]!
+ stp d8, d9, [sp, #16]
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*6
+
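+ // Derive the shift amounts from clz(bitdepth_max): for 10 bpc
+ // (bitdepth_max = 1023, clz = 22) bitdepth+6 = 16, round_bits_h = 3 and
+ // round_bits_v = 11; for 12 bpc (clz = 20) they are 18, 5 and 9.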
+ dup v28.8h, w8 // bitdepth_max
+ clz w8, w8
+ movi v30.4s, #1
+ sub w10, w8, #38 // -(bitdepth + 6)
+ sub w11, w8, #11 // round_bits_v
+ sub w8, w8, #25 // -round_bits_h
+ neg w10, w10 // bitdepth + 6
+ neg w11, w11 // -round_bits_v
+ dup v2.4s, w10
+ dup v29.4s, w8 // -round_bits_h
+ dup v27.4s, w11 // -round_bits_v
+ movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
+ ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6)
+
+ zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1
+
+ // x9 - t6
+ // x10 - t5
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
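+ //
+ // t0-t6 point into the 384*2 byte buffers of horizontally filtered rows
+ // allocated on the stack above; wiener_filter7_hv rotates the pointers
+ // after each row, reusing the buffer of the outgoing top row (t6) as the
+ // destination for the next row's horizontal output (t0).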
+ mov x14, sp // t1
+ b.eq L(no_top_7)
+
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter7_h_16bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ mov x13, x14 // t2
+ subs w5, w5, #1 // h--
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += stride
+
+L(main_7):
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+L(main_loop_7):
+ bl wiener_filter7_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_7)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v3_7)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter7_hv_16bpc_neon
+ bl wiener_filter7_hv_16bpc_neon
+L(v1_7):
+ bl wiener_filter7_v_16bpc_neon
+
+ mov sp, x29
+ ldp d8, d9, [sp, #16]
+ ldp x29, x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_7):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += p_stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += p_stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += p_stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter7_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x15, x15, #384*2*4 // t0 += 384*2*4
+ bl wiener_filter7_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_7)
+L(v3_7):
+ bl wiener_filter7_v_16bpc_neon
+L(v2_7):
+ bl wiener_filter7_v_16bpc_neon
+ b L(v1_7)
+endfunc
+
+
+function wiener_filter7_h_16bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #6
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8
+ // Move x3 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+ b 2f
+
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v3 to have 3x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
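+ // The Wiener taps are symmetric, so mirrored input samples are summed
+ // in 16 bit and each pair multiplied by a single coefficient:
+ // f[0]*(x[i]+x[i+6]) + f[1]*(x[i+1]+x[i+5]) + f[2]*(x[i+2]+x[i+4]) +
+ // f[3]*x[i+3], with f[n] = v0.h[n], cutting the widening multiplies
+ // from 7 to 4 per vector.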
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ smull v6.4s, v18.4h, v0.h[3]
+ smlal v6.4s, v19.4h, v0.h[2]
+ smlal v6.4s, v20.4h, v0.h[1]
+ smlal v6.4s, v21.4h, v0.h[0]
+ smull2 v7.4s, v18.8h, v0.h[3]
+ smlal2 v7.4s, v19.8h, v0.h[2]
+ smlal2 v7.4s, v20.8h, v0.h[1]
+ smlal2 v7.4s, v21.8h, v0.h[0]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ smull v16.4s, v18.4h, v0.h[3]
+ smlal v16.4s, v19.4h, v0.h[2]
+ smlal v16.4s, v20.4h, v0.h[1]
+ smlal v16.4s, v21.4h, v0.h[0]
+ smull2 v17.4s, v18.8h, v0.h[3]
+ smlal2 v17.4s, v19.8h, v0.h[2]
+ smlal2 v17.4s, v20.8h, v0.h[1]
+ smlal2 v17.4s, v21.8h, v0.h[0]
+
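+ // Add the 1 << (bitdepth + 6) bias, rounding-shift right by round_bits_h,
+ // clamp to [0, 0x7fff] and subtract 8192, so that the intermediate values
+ // fit in a signed 16 bit lane for the vertical pass.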
+ mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v17.4s, v17.4s, v30.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v16.4s, v16.4s, v29.4s
+ srshl v17.4s, v17.4s, v29.4s
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v16.4s
+ sqxtun2 v7.8h, v17.4s
+ umin v6.8h, v6.8h, v24.8h
+ umin v7.8h, v7.8h, v24.8h
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+function wiener_filter7_v_16bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, afterwards.
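+ // Only the six rows t6-t1 are read; the row loaded from x14 (t1) is fed
+ // to both of the two bottom taps, i.e. the missing seventh row is treated
+ // as a duplicate of t1.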
+ stp x10, x11, [sp, #-64]!
+ stp x12, x13, [sp, #16]
+ stp x14, x14, [sp, #32]
+ stp x0, x4, [sp, #48]
+1:
+ ld1 {v16.8h, v17.8h}, [x9], #32
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ ld1 {v20.8h, v21.8h}, [x11], #32
+ ld1 {v22.8h, v23.8h}, [x12], #32
+ ld1 {v24.8h, v25.8h}, [x13], #32
+ ld1 {v6.8h, v7.8h}, [x14], #32
+
+ smull v2.4s, v16.4h, v0.h[4]
+ smlal v2.4s, v18.4h, v0.h[5]
+ smlal v2.4s, v20.4h, v0.h[6]
+ smlal v2.4s, v22.4h, v0.h[7]
+ smlal v2.4s, v24.4h, v0.h[6]
+ smlal v2.4s, v6.4h, v0.h[5]
+ smlal v2.4s, v6.4h, v0.h[4]
+ smull2 v3.4s, v16.8h, v0.h[4]
+ smlal2 v3.4s, v18.8h, v0.h[5]
+ smlal2 v3.4s, v20.8h, v0.h[6]
+ smlal2 v3.4s, v22.8h, v0.h[7]
+ smlal2 v3.4s, v24.8h, v0.h[6]
+ smlal2 v3.4s, v6.8h, v0.h[5]
+ smlal2 v3.4s, v6.8h, v0.h[4]
+ smull v4.4s, v17.4h, v0.h[4]
+ smlal v4.4s, v19.4h, v0.h[5]
+ smlal v4.4s, v21.4h, v0.h[6]
+ smlal v4.4s, v23.4h, v0.h[7]
+ smlal v4.4s, v25.4h, v0.h[6]
+ smlal v4.4s, v7.4h, v0.h[5]
+ smlal v4.4s, v7.4h, v0.h[4]
+ smull2 v5.4s, v17.8h, v0.h[4]
+ smlal2 v5.4s, v19.8h, v0.h[5]
+ smlal2 v5.4s, v21.8h, v0.h[6]
+ smlal2 v5.4s, v23.8h, v0.h[7]
+ smlal2 v5.4s, v25.8h, v0.h[6]
+ smlal2 v5.4s, v7.8h, v0.h[5]
+ smlal2 v5.4s, v7.8h, v0.h[4]
+ srshl v2.4s, v2.4s, v27.4s // -round_bits_v
+ srshl v3.4s, v3.4s, v27.4s
+ srshl v4.4s, v4.4s, v27.4s
+ srshl v5.4s, v5.4s, v27.4s
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v28.8h // bitdepth_max
+ umin v3.8h, v3.8h, v28.8h
+ subs w4, w4, #16
+ st1 {v2.8h, v3.8h}, [x0], #32
+ b.gt 1b
+
+ ldp x0, x4, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #64
+
+ add x0, x0, x1
+ ret
+endfunc
+
+function wiener_filter7_hv_16bpc_neon
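+ // Filters one new row horizontally into the t0 buffer (x15) and, using
+ // that result directly from registers together with the rows in t6-t1,
+ // applies the vertical filter to produce one output row at x0.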
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, and x15==x9, afterwards.
+ stp x10, x11, [sp, #-80]!
+ stp x12, x13, [sp, #16]
+ stp x14, x15, [sp, #32]
+ stp x10, x0, [sp, #48]
+ stp x3, x4, [sp, #64]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #6
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8
+ // Move x3 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+ b 2f
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v3 to have 3x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ smull v6.4s, v18.4h, v0.h[3]
+ smlal v6.4s, v19.4h, v0.h[2]
+ smlal v6.4s, v20.4h, v0.h[1]
+ smlal v6.4s, v21.4h, v0.h[0]
+ smull2 v7.4s, v18.8h, v0.h[3]
+ smlal2 v7.4s, v19.8h, v0.h[2]
+ smlal2 v7.4s, v20.8h, v0.h[1]
+ smlal2 v7.4s, v21.8h, v0.h[0]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ smull v24.4s, v18.4h, v0.h[3]
+ smlal v24.4s, v19.4h, v0.h[2]
+ smlal v24.4s, v20.4h, v0.h[1]
+ smlal v24.4s, v21.4h, v0.h[0]
+ smull2 v25.4s, v18.8h, v0.h[3]
+ smlal2 v25.4s, v19.8h, v0.h[2]
+ smlal2 v25.4s, v20.8h, v0.h[1]
+ smlal2 v25.4s, v21.8h, v0.h[0]
+
+ ld1 {v16.8h, v17.8h}, [x9], #32
+
+ mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v24.4s, v24.4s, v30.4s
+ add v25.4s, v25.4s, v30.4s
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v24.4s, v24.4s, v29.4s
+ srshl v25.4s, v25.4s, v29.4s
+ ld1 {v20.8h, v21.8h}, [x11], #32
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v24.4s
+ sqxtun2 v7.8h, v25.4s
+ ld1 {v22.8h, v23.8h}, [x12], #32
+ umin v6.8h, v6.8h, v26.8h
+ umin v7.8h, v7.8h, v26.8h
+ ld1 {v24.8h, v25.8h}, [x13], #32
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ ld1 {v8.8h, v9.8h}, [x14], #32
+
+ smull v1.4s, v16.4h, v0.h[4]
+ smlal v1.4s, v18.4h, v0.h[5]
+ smlal v1.4s, v20.4h, v0.h[6]
+ smlal v1.4s, v22.4h, v0.h[7]
+ smlal v1.4s, v24.4h, v0.h[6]
+ smlal v1.4s, v8.4h, v0.h[5]
+ smlal v1.4s, v6.4h, v0.h[4]
+ smull2 v5.4s, v16.8h, v0.h[4]
+ smlal2 v5.4s, v18.8h, v0.h[5]
+ smlal2 v5.4s, v20.8h, v0.h[6]
+ smlal2 v5.4s, v22.8h, v0.h[7]
+ smlal2 v5.4s, v24.8h, v0.h[6]
+ smlal2 v5.4s, v8.8h, v0.h[5]
+ smlal2 v5.4s, v6.8h, v0.h[4]
+ smull v26.4s, v17.4h, v0.h[4]
+ smlal v26.4s, v19.4h, v0.h[5]
+ smlal v26.4s, v21.4h, v0.h[6]
+ smlal v26.4s, v23.4h, v0.h[7]
+ smlal v26.4s, v25.4h, v0.h[6]
+ smlal v26.4s, v9.4h, v0.h[5]
+ smlal v26.4s, v7.4h, v0.h[4]
+ smull2 v16.4s, v17.8h, v0.h[4]
+ smlal2 v16.4s, v19.8h, v0.h[5]
+ smlal2 v16.4s, v21.8h, v0.h[6]
+ smlal2 v16.4s, v23.8h, v0.h[7]
+ smlal2 v16.4s, v25.8h, v0.h[6]
+ smlal2 v16.4s, v9.8h, v0.h[5]
+ smlal2 v16.4s, v7.8h, v0.h[4]
+ srshl v1.4s, v1.4s, v27.4s // -round_bits_v
+ srshl v5.4s, v5.4s, v27.4s
+ srshl v26.4s, v26.4s, v27.4s
+ srshl v16.4s, v16.4s, v27.4s
+ sqxtun v18.4h, v1.4s
+ sqxtun2 v18.8h, v5.4s
+ sqxtun v19.4h, v26.4s
+ sqxtun2 v19.8h, v16.4s
+ st1 {v6.8h, v7.8h}, [x15], #32
+ umin v18.8h, v18.8h, v28.8h // bitdepth_max
+ umin v19.8h, v19.8h, v28.8h
+ subs w4, w4, #16
+
+ st1 {v18.8h, v19.8h}, [x0], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #64]
+ ldp x15, x0, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #80
+
+ add x3, x3, x1
+ add x0, x0, x1
+
+ ret
+endfunc
+
+// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges,
+// const int bitdepth_max);
+function wiener_filter5_16bpc_neon, export=1
+ ldr w8, [sp]
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-32]!
+ stp d8, d9, [sp, #16]
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*4
+
+ dup v28.8h, w8 // bitdepth_max
+ clz w8, w8
+ movi v30.4s, #1
+ sub w10, w8, #38 // -(bitdepth + 6)
+ sub w11, w8, #11 // round_bits_v
+ sub w8, w8, #25 // -round_bits_h
+ neg w10, w10 // bitdepth + 6
+ neg w11, w11 // -round_bits_v
+ dup v2.4s, w10
+ dup v29.4s, w8 // -round_bits_h
+ dup v27.4s, w11 // -round_bits_v
+ movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
+ ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6)
+
+ zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1
+
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_5)
+
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter5_h_16bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_16bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x12, x14 // t3
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+
+L(main_5):
+ mov x15, x11 // t0 = t4
+L(main_loop_5):
+ bl wiener_filter5_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_5)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v2_5)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter5_hv_16bpc_neon
+ bl wiener_filter5_hv_16bpc_neon
+L(end_5):
+
+ mov sp, x29
+ ldp d8, d9, [sp, #16]
+ ldp x29, x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_5):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter5_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x15, x15, #384*2*3 // t0 += 384*2*3
+ bl wiener_filter5_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_5)
+L(v2_5):
+ bl wiener_filter5_v_16bpc_neon
+ add x0, x0, x1
+ mov x11, x12
+ mov x12, x13
+ mov x13, x14
+L(v1_5):
+ bl wiener_filter5_v_16bpc_neon
+ b L(end_5)
+endfunc
+
+
+function wiener_filter5_h_16bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #4
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8
+ // Move x3 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+ b 2f
+
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v3 to have 2x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
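+ // As in the 7-tap filter, the symmetric taps are folded:
+ // f[1]*(x[i]+x[i+4]) + f[2]*(x[i+1]+x[i+3]) + f[3]*x[i+2],
+ // with f[n] = v0.h[n].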
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ smull v6.4s, v17.4h, v0.h[3]
+ smlal v6.4s, v18.4h, v0.h[2]
+ smlal v6.4s, v19.4h, v0.h[1]
+ smull2 v7.4s, v17.8h, v0.h[3]
+ smlal2 v7.4s, v18.8h, v0.h[2]
+ smlal2 v7.4s, v19.8h, v0.h[1]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ smull v16.4s, v17.4h, v0.h[3]
+ smlal v16.4s, v18.4h, v0.h[2]
+ smlal v16.4s, v19.4h, v0.h[1]
+ smull2 v17.4s, v17.8h, v0.h[3]
+ smlal2 v17.4s, v18.8h, v0.h[2]
+ smlal2 v17.4s, v19.8h, v0.h[1]
+
+ mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v17.4s, v17.4s, v30.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v16.4s, v16.4s, v29.4s
+ srshl v17.4s, v17.4s, v29.4s
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v16.4s
+ sqxtun2 v7.8h, v17.4s
+ umin v6.8h, v6.8h, v24.8h
+ umin v7.8h, v7.8h, v24.8h
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+function wiener_filter5_v_16bpc_neon
+ stp x11, x12, [sp, #-48]!
+ stp x13, x14, [sp, #16]
+ stp x0, x4, [sp, #32]
+1:
+ ld1 {v16.8h, v17.8h}, [x11], #32
+ ld1 {v18.8h, v19.8h}, [x12], #32
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ ld1 {v22.8h, v23.8h}, [x14], #32
+
+ smull v2.4s, v16.4h, v0.h[5]
+ smlal v2.4s, v18.4h, v0.h[6]
+ smlal v2.4s, v20.4h, v0.h[7]
+ smlal v2.4s, v22.4h, v0.h[6]
+ smlal v2.4s, v22.4h, v0.h[5]
+ smull2 v3.4s, v16.8h, v0.h[5]
+ smlal2 v3.4s, v18.8h, v0.h[6]
+ smlal2 v3.4s, v20.8h, v0.h[7]
+ smlal2 v3.4s, v22.8h, v0.h[6]
+ smlal2 v3.4s, v22.8h, v0.h[5]
+ smull v4.4s, v17.4h, v0.h[5]
+ smlal v4.4s, v19.4h, v0.h[6]
+ smlal v4.4s, v21.4h, v0.h[7]
+ smlal v4.4s, v23.4h, v0.h[6]
+ smlal v4.4s, v23.4h, v0.h[5]
+ smull2 v5.4s, v17.8h, v0.h[5]
+ smlal2 v5.4s, v19.8h, v0.h[6]
+ smlal2 v5.4s, v21.8h, v0.h[7]
+ smlal2 v5.4s, v23.8h, v0.h[6]
+ smlal2 v5.4s, v23.8h, v0.h[5]
+ srshl v2.4s, v2.4s, v27.4s // -round_bits_v
+ srshl v3.4s, v3.4s, v27.4s
+ srshl v4.4s, v4.4s, v27.4s
+ srshl v5.4s, v5.4s, v27.4s
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v28.8h // bitdepth_max
+ umin v3.8h, v3.8h, v28.8h
+
+ subs w4, w4, #16
+ st1 {v2.8h, v3.8h}, [x0], #32
+ b.gt 1b
+
+ ldp x0, x4, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #48
+
+ ret
+endfunc
+
+function wiener_filter5_hv_16bpc_neon
+ // Backing up/restoring registers shifted, so that x11 gets the value
+ // of x12, etc, and x15==x11, afterwards.
+ stp x12, x13, [sp, #-64]!
+ stp x14, x15, [sp, #16]
+ stp x12, x0, [sp, #32]
+ stp x3, x4, [sp, #48]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #4
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8
+ // Move x3 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+ b 2f
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v3 to have 2x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ smull v6.4s, v17.4h, v0.h[3]
+ smlal v6.4s, v18.4h, v0.h[2]
+ smlal v6.4s, v19.4h, v0.h[1]
+ smull2 v7.4s, v17.8h, v0.h[3]
+ smlal2 v7.4s, v18.8h, v0.h[2]
+ smlal2 v7.4s, v19.8h, v0.h[1]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ smull v24.4s, v17.4h, v0.h[3]
+ smlal v24.4s, v18.4h, v0.h[2]
+ smlal v24.4s, v19.4h, v0.h[1]
+ smull2 v25.4s, v17.8h, v0.h[3]
+ smlal2 v25.4s, v18.8h, v0.h[2]
+ smlal2 v25.4s, v19.8h, v0.h[1]
+
+ ld1 {v16.8h, v17.8h}, [x11], #32
+ mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v24.4s, v24.4s, v30.4s
+ add v25.4s, v25.4s, v30.4s
+ ld1 {v18.8h, v19.8h}, [x12], #32
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v24.4s, v24.4s, v29.4s
+ srshl v25.4s, v25.4s, v29.4s
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v24.4s
+ sqxtun2 v7.8h, v25.4s
+ ld1 {v22.8h, v23.8h}, [x14], #32
+ umin v6.8h, v6.8h, v26.8h
+ umin v7.8h, v7.8h, v26.8h
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ smull v8.4s, v16.4h, v0.h[5]
+ smlal v8.4s, v18.4h, v0.h[6]
+ smlal v8.4s, v20.4h, v0.h[7]
+ smlal v8.4s, v22.4h, v0.h[6]
+ smlal v8.4s, v6.4h, v0.h[5]
+ smull2 v9.4s, v16.8h, v0.h[5]
+ smlal2 v9.4s, v18.8h, v0.h[6]
+ smlal2 v9.4s, v20.8h, v0.h[7]
+ smlal2 v9.4s, v22.8h, v0.h[6]
+ smlal2 v9.4s, v6.8h, v0.h[5]
+ smull v1.4s, v17.4h, v0.h[5]
+ smlal v1.4s, v19.4h, v0.h[6]
+ smlal v1.4s, v21.4h, v0.h[7]
+ smlal v1.4s, v23.4h, v0.h[6]
+ smlal v1.4s, v7.4h, v0.h[5]
+ smull2 v5.4s, v17.8h, v0.h[5]
+ smlal2 v5.4s, v19.8h, v0.h[6]
+ smlal2 v5.4s, v21.8h, v0.h[7]
+ smlal2 v5.4s, v23.8h, v0.h[6]
+ smlal2 v5.4s, v7.8h, v0.h[5]
+ srshl v8.4s, v8.4s, v27.4s // -round_bits_v
+ srshl v9.4s, v9.4s, v27.4s
+ srshl v1.4s, v1.4s, v27.4s
+ srshl v5.4s, v5.4s, v27.4s
+ sqxtun v8.4h, v8.4s
+ sqxtun2 v8.8h, v9.4s
+ sqxtun v9.4h, v1.4s
+ sqxtun2 v9.8h, v5.4s
+ st1 {v6.8h, v7.8h}, [x15], #32
+ umin v8.8h, v8.8h, v28.8h // bitdepth_max
+ umin v9.8h, v9.8h, v28.8h
+
+ subs w4, w4, #16
+
+ st1 {v8.8h, v9.8h}, [x0], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #48]
+ ldp x15, x0, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #64
+
+ add x3, x3, x1
+ add x0, x0, x1
+
+ ret
+endfunc
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box3_h_16bpc_neon, export=1
+ add w5, w5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add x10, x0, #(4*SUM_STRIDE) // sumsq
+ add x11, x1, #(2*SUM_STRIDE) // sum
+ add x12, x3, x4 // src
+ lsl x4, x4, #1
+ mov x9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add w13, w5, #7
+ bic w13, w13, #7
+ sub x9, x9, w13, uxtw #1
+
+ // Store the width for the vertical loop
+ mov w8, w5
+
+ // Subtract the number of pixels read from the input from the stride
+ add w13, w13, #8
+ sub x4, x4, w13, uxtw #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 2f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #4
+ sub x12, x12, #4
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add x4, x4, #4
+
+
+1: // Loop vertically
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ ld1 {v16.8h, v17.8h}, [x12], #32
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 0f
+ cbz x2, 2f
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.d}[1], [x2], #8
+ // Move x3/x12 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ sub x12, x12, #4
+ ld1 {v18.d}[1], [x2], #8
+ ext v1.16b, v0.16b, v1.16b, #12
+ ext v0.16b, v2.16b, v0.16b, #12
+ ext v17.16b, v16.16b, v17.16b, #12
+ ext v16.16b, v18.16b, v16.16b, #12
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill v2/v18 with the leftmost pixel of each row
+ // and shift v0/v1 and v16/v17 to have 2x the first pixel at the front.
+ dup v2.8h, v0.h[0]
+ dup v18.8h, v16.h[0]
+ // Move x3/x12 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ sub x12, x12, #4
+ ext v1.16b, v0.16b, v1.16b, #12
+ ext v0.16b, v2.16b, v0.16b, #12
+ ext v17.16b, v16.16b, v17.16b, #12
+ ext v16.16b, v18.16b, v16.16b, #12
+
+2:
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w5, #(2 + 16 - 2 + 1)
+ ldr h30, [x3, w13, sxtw #1]
+ ldr h31, [x12, w13, sxtw #1]
+ // Fill v30/v31 with the right padding pixel
+ dup v30.8h, v30.h[0]
+ dup v31.8h, v31.h[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w5, #10
+ b.ge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called
+ // again; it's not strictly needed in those cases (we pad enough here),
+ // but keeping the code as simple as possible.
+
+ // Insert padding in v0/1.h[w] onwards
+ movrel x13, right_ext_mask
+ sub x13, x13, w5, uxtw #1
+ ld1 {v28.16b, v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v28.16b
+ bit v1.16b, v30.16b, v29.16b
+ bit v16.16b, v31.16b, v28.16b
+ bit v17.16b, v31.16b, v29.16b
+
+4: // Loop horizontally
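+ // Accumulate the 3-pixel horizontal sums x[i]+x[i+1]+x[i+2] into v6/v7
+ // (16 bit) and the matching sums of squares into v22-v25 (32 bit),
+ // two rows at a time.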
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v28.16b, v16.16b, v17.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #4
+ ext v29.16b, v16.16b, v17.16b, #4
+
+ add v6.8h, v0.8h, v26.8h
+ umull v22.4s, v0.4h, v0.4h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v7.8h, v16.8h, v28.8h
+ umull v24.4s, v16.4h, v16.4h
+ umlal v24.4s, v28.4h, v28.4h
+ umlal v24.4s, v29.4h, v29.4h
+ add v6.8h, v6.8h, v27.8h
+ umull2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+ add v7.8h, v7.8h, v29.8h
+ umull2 v25.4s, v16.8h, v16.8h
+ umlal2 v25.4s, v28.8h, v28.8h
+ umlal2 v25.4s, v29.8h, v29.8h
+
+ subs w5, w5, #8
+
+ st1 {v6.8h}, [x1], #16
+ st1 {v7.8h}, [x11], #16
+ st1 {v22.4s,v23.4s}, [x0], #32
+ st1 {v24.4s,v25.4s}, [x10], #32
+
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ mov v0.16b, v1.16b
+ mov v16.16b, v17.16b
+ ld1 {v1.8h}, [x3], #16
+ ld1 {v17.8h}, [x12], #16
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs w6, w6, #2
+ b.le 0f
+ // Jump to the next row and loop horizontally
+ add x0, x0, x9, lsl #1
+ add x10, x10, x9, lsl #1
+ add x1, x1, x9
+ add x11, x11, x9
+ add x3, x3, x4
+ add x12, x12, x4
+ mov w5, w8
+ b 1b
+0:
+ ret
+endfunc
+
+// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_h_16bpc_neon, export=1
+ add w5, w5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add x10, x0, #(4*SUM_STRIDE) // sumsq
+ add x11, x1, #(2*SUM_STRIDE) // sum
+ add x12, x3, x4 // src
+ lsl x4, x4, #1
+ mov x9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add w13, w5, #7
+ bic w13, w13, #7
+ sub x9, x9, w13, uxtw #1
+ add w13, w13, #8
+ sub x4, x4, w13, uxtw #1
+
+ // Store the width for the vertical loop
+ mov w8, w5
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 2f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #6
+ sub x12, x12, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add x4, x4, #6
+
+1: // Loop vertically
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ ld1 {v16.8h, v17.8h}, [x12], #32
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 0f
+ cbz x2, 2f
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.d}[1], [x2], #8
+ // Move x3/x12 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ sub x12, x12, #6
+ ld1 {v18.d}[1], [x2], #8
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10
+ ext v17.16b, v16.16b, v17.16b, #10
+ ext v16.16b, v18.16b, v16.16b, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill v2/v18 with the leftmost pixel of each row
+ // and shift v0/v1 and v16/v17 to have 3x the first pixel at the front.
+ dup v2.8h, v0.h[0]
+ dup v18.8h, v16.h[0]
+ // Move x3/x12 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ sub x12, x12, #6
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10
+ ext v17.16b, v16.16b, v17.16b, #10
+ ext v16.16b, v18.16b, v16.16b, #10
+
+2:
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w5, #(2 + 16 - 3 + 1)
+ ldr h30, [x3, w13, sxtw #1]
+ ldr h31, [x12, w13, sxtw #1]
+ // Fill v30/v31 with the right padding pixel
+ dup v30.8h, v30.h[0]
+ dup v31.8h, v31.h[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w5, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel x13, right_ext_mask, -2
+ sub x13, x13, w5, uxtw #1
+ ld1 {v28.16b, v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v28.16b
+ bit v1.16b, v30.16b, v29.16b
+ bit v16.16b, v31.16b, v28.16b
+ bit v17.16b, v31.16b, v29.16b
+
+4: // Loop horizontally
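+ // As in the box3 variant, but accumulating 5-pixel sums x[i]+...+x[i+4]
+ // and their squares, two rows at a time.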
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v28.16b, v16.16b, v17.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #4
+ ext v29.16b, v16.16b, v17.16b, #4
+
+ add v6.8h, v0.8h, v26.8h
+ umull v22.4s, v0.4h, v0.4h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v7.8h, v16.8h, v28.8h
+ umull v24.4s, v16.4h, v16.4h
+ umlal v24.4s, v28.4h, v28.4h
+ umlal v24.4s, v29.4h, v29.4h
+ add v6.8h, v6.8h, v27.8h
+ umull2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+ add v7.8h, v7.8h, v29.8h
+ umull2 v25.4s, v16.8h, v16.8h
+ umlal2 v25.4s, v28.8h, v28.8h
+ umlal2 v25.4s, v29.8h, v29.8h
+
+ ext v26.16b, v0.16b, v1.16b, #6
+ ext v28.16b, v16.16b, v17.16b, #6
+ ext v27.16b, v0.16b, v1.16b, #8
+ ext v29.16b, v16.16b, v17.16b, #8
+
+ add v6.8h, v6.8h, v26.8h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v7.8h, v7.8h, v28.8h
+ umlal v24.4s, v28.4h, v28.4h
+ umlal v24.4s, v29.4h, v29.4h
+ add v6.8h, v6.8h, v27.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+ add v7.8h, v7.8h, v29.8h
+ umlal2 v25.4s, v28.8h, v28.8h
+ umlal2 v25.4s, v29.8h, v29.8h
+
+ subs w5, w5, #8
+
+ st1 {v6.8h}, [x1], #16
+ st1 {v7.8h}, [x11], #16
+ st1 {v22.4s,v23.4s}, [x0], #32
+ st1 {v24.4s,v25.4s}, [x10], #32
+
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ mov v0.16b, v1.16b
+ mov v16.16b, v17.16b
+ ld1 {v1.8h}, [x3], #16
+ ld1 {v17.8h}, [x12], #16
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs w6, w6, #2
+ b.le 0f
+ // Jump to the next row and loop horizontally
+ add x0, x0, x9, lsl #1
+ add x10, x10, x9, lsl #1
+ add x1, x1, x9
+ add x11, x11, x9
+ add x3, x3, x4
+ add x12, x12, x4
+ mov w5, w8
+ b 1b
+0:
+ ret
+endfunc
+
+sgr_funcs 16