Diffstat (limited to 'third_party/dav1d/src/arm/64/looprestoration.S')
-rw-r--r--  third_party/dav1d/src/arm/64/looprestoration.S | 1303
1 file changed, 1303 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/64/looprestoration.S b/third_party/dav1d/src/arm/64/looprestoration.S
new file mode 100644
index 0000000000..f8dc0df4d8
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration.S
@@ -0,0 +1,1303 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
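+
+// right_ext_mask is used as a sliding byte mask for right-edge padding: the
+// padding code below computes a pointer a width-dependent number of bytes
+// before the right_ext_mask label and loads from there, so lanes that still
+// hold valid pixels see 0x00 bytes while lanes past the last valid pixel see
+// 0xff, and the following bit instructions overwrite exactly those lanes
+// with the replicated edge pixel.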
+
+// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges);
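+//
+// Rough structure of the function (a sketch, not a literal translation; the
+// helper names below are illustrative only): a ring of six 384*2-byte rows
+// of 16-bit horizontally filtered intermediates is kept on the stack
+// (t6..t1 in x9..x14, plus t0 in x15 once the main loop runs), roughly
+//
+//   for (int y = 0; y < h; y++) {
+//       horiz_filter(t0, src_row(y), filter[0]);          // _h/_hv helpers
+//       vert_filter(dst_row(y), t6, ..., t0, filter[1]);  // _v/_hv helpers
+//       rotate(t6, ..., t0);                              // pointers only
+//   }
+//
+// with the prologue and epilogue duplicating edge rows into several of the
+// t pointers depending on LR_HAVE_TOP/LR_HAVE_BOTTOM.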
+function wiener_filter7_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*6
+
+ mov w17, #(1 << 14) - (1 << 2)
+ dup v30.8h, w17
+ movi v31.8h, #8, lsl #8
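+
+ // v30 and v31 are constants for the horizontal passes: the centre-pixel
+ // term (pixel << 7) is offset by -((1 << 14) - (1 << 2)) before being
+ // saturating-added to the filter sum, presumably to keep the total within
+ // signed 16-bit range, and after the >> 3 the result is re-biased by
+ // 1 << 11 (v31) when stored as a 16-bit intermediate (see the
+ // sub/sqadd/sshr/add sequences in the _h/_hv helpers below).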
+
+ // x9 - t6
+ // x10 - t5
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_7)
+
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter7_h_8bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ mov x13, x14 // t2
+ subs w5, w5, #1 // h--
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += stride
+
+L(main_7):
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+L(main_loop_7):
+ bl wiener_filter7_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_7)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v3_7)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter7_hv_8bpc_neon
+ bl wiener_filter7_hv_8bpc_neon
+L(v1_7):
+ bl wiener_filter7_v_8bpc_neon
+
+ mov sp, x29
+ ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_7):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter7_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x15, x15, #384*2*4 // t0 += 384*2*4
+ bl wiener_filter7_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_7)
+L(v3_7):
+ bl wiener_filter7_v_8bpc_neon
+L(v2_7):
+ bl wiener_filter7_v_8bpc_neon
+ b L(v1_7)
+endfunc
+
+
+function wiener_filter7_h_8bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #3
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+ b 2f
+
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 3x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
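+
+ // For example, with w == 4 pixels remaining: the padding pixel is
+ // v2.h[6] and x3[w-22] == x3[-18] reads it back, while the mask pointer
+ // right_ext_mask-6-2*4 leaves mask bytes 0..13 (elements 0..6) clear and
+ // sets elements 7 and up, which bit then fills with the padding pixel.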
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ shl v22.8h, v18.8h, #7
+ mul v6.8h, v18.8h, v0.h[3]
+ mla v6.8h, v19.8h, v0.h[4]
+ mla v6.8h, v20.8h, v0.h[5]
+ mla v6.8h, v21.8h, v0.h[6]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ shl v23.8h, v18.8h, #7
+ mul v7.8h, v18.8h, v0.h[3]
+ mla v7.8h, v19.8h, v0.h[4]
+ mla v7.8h, v20.8h, v0.h[5]
+ mla v7.8h, v21.8h, v0.h[6]
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+function wiener_filter7_v_8bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, afterwards.
+ stp x10, x11, [sp, #-64]!
+ stp x12, x13, [sp, #16]
+ stp x14, x14, [sp, #32]
+ stp x0, x4, [sp, #48]
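+ // Concretely, {x10, x11, x12, x13, x14, x14} are saved above and reloaded
+ // into {x9, ..., x14} on return, so each pointer takes the next one's
+ // value while x14 (t1) keeps its own.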
+1:
+ ld1 {v20.8h, v21.8h}, [x11], #32
+ ld1 {v24.8h, v25.8h}, [x13], #32
+
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ add v24.8h, v24.8h, v20.8h
+ ld1 {v26.8h, v27.8h}, [x14], #32
+
+ ld1 {v16.8h, v17.8h}, [x9], #32
+ add v28.8h, v26.8h, v18.8h
+ ld1 {v22.8h, v23.8h}, [x12], #32
+
+ add v16.8h, v26.8h, v16.8h
+ add v25.8h, v25.8h, v21.8h
+
+ smull v2.4s, v22.4h, v1.h[3]
+ smlal v2.4s, v24.4h, v1.h[4]
+ smlal v2.4s, v28.4h, v1.h[5]
+ smlal v2.4s, v16.4h, v1.h[6]
+ add v29.8h, v27.8h, v19.8h
+ smull2 v3.4s, v22.8h, v1.h[3]
+ smlal2 v3.4s, v24.8h, v1.h[4]
+ smlal2 v3.4s, v28.8h, v1.h[5]
+ smlal2 v3.4s, v16.8h, v1.h[6]
+ add v17.8h, v27.8h, v17.8h
+ smull v4.4s, v23.4h, v1.h[3]
+ smlal v4.4s, v25.4h, v1.h[4]
+ smlal v4.4s, v29.4h, v1.h[5]
+ smlal v4.4s, v17.4h, v1.h[6]
+ smull2 v5.4s, v23.8h, v1.h[3]
+ smlal2 v5.4s, v25.8h, v1.h[4]
+ smlal2 v5.4s, v29.8h, v1.h[5]
+ smlal2 v5.4s, v17.8h, v1.h[6]
+ sqrshrun v2.4h, v2.4s, #11
+ sqrshrun2 v2.8h, v3.4s, #11
+ sqrshrun v3.4h, v4.4s, #11
+ sqrshrun2 v3.8h, v5.4s, #11
+ sqxtun v2.8b, v2.8h
+ sqxtun2 v2.16b, v3.8h
+ subs w4, w4, #16
+ st1 {v2.16b}, [x0], #16
+ b.gt 1b
+
+ ldp x0, x4, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #64
+
+ add x0, x0, x1
+ ret
+endfunc
+
+function wiener_filter7_hv_8bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, and x15==x9, afterwards.
+ stp x10, x11, [sp, #-80]!
+ stp x12, x13, [sp, #16]
+ stp x14, x15, [sp, #32]
+ stp x10, x0, [sp, #48]
+ stp x3, x4, [sp, #64]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #3
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+ b 2f
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 3x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ shl v22.8h, v18.8h, #7
+ mul v6.8h, v18.8h, v0.h[3]
+ mla v6.8h, v19.8h, v0.h[4]
+ mla v6.8h, v20.8h, v0.h[5]
+ mla v6.8h, v21.8h, v0.h[6]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ shl v23.8h, v18.8h, #7
+ mul v7.8h, v18.8h, v0.h[3]
+ mla v7.8h, v19.8h, v0.h[4]
+ mla v7.8h, v20.8h, v0.h[5]
+ mla v7.8h, v21.8h, v0.h[6]
+
+ ld1 {v20.8h, v21.8h}, [x11], #32
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ ld1 {v26.8h, v27.8h}, [x13], #32
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ ld1 {v28.8h, v29.8h}, [x14], #32
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ ld1 {v16.8h, v17.8h}, [x9], #32
+ add v26.8h, v20.8h, v26.8h
+
+ ld1 {v24.8h, v25.8h}, [x12], #32
+ add v28.8h, v18.8h, v28.8h
+
+ add v16.8h, v16.8h, v6.8h
+ add v27.8h, v21.8h, v27.8h
+
+ smull v18.4s, v24.4h, v1.h[3]
+ smlal v18.4s, v26.4h, v1.h[4]
+ smlal v18.4s, v28.4h, v1.h[5]
+ smlal v18.4s, v16.4h, v1.h[6]
+ add v29.8h, v19.8h, v29.8h
+ smull2 v19.4s, v24.8h, v1.h[3]
+ smlal2 v19.4s, v26.8h, v1.h[4]
+ smlal2 v19.4s, v28.8h, v1.h[5]
+ smlal2 v19.4s, v16.8h, v1.h[6]
+ add v17.8h, v17.8h, v7.8h
+ smull v20.4s, v25.4h, v1.h[3]
+ smlal v20.4s, v27.4h, v1.h[4]
+ smlal v20.4s, v29.4h, v1.h[5]
+ smlal v20.4s, v17.4h, v1.h[6]
+ smull2 v21.4s, v25.8h, v1.h[3]
+ smlal2 v21.4s, v27.8h, v1.h[4]
+ smlal2 v21.4s, v29.8h, v1.h[5]
+ smlal2 v21.4s, v17.8h, v1.h[6]
+ sqrshrun v18.4h, v18.4s, #11
+ sqrshrun2 v18.8h, v19.4s, #11
+ sqrshrun v19.4h, v20.4s, #11
+ sqrshrun2 v19.8h, v21.4s, #11
+ st1 {v6.8h, v7.8h}, [x15], #32
+ sqxtun v18.8b, v18.8h
+ sqxtun2 v18.16b, v19.8h
+ subs w4, w4, #16
+
+ st1 {v18.16b}, [x0], #16
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #64]
+ ldp x15, x0, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #80
+
+ add x3, x3, x1
+ add x0, x0, x1
+
+ ret
+endfunc
+
+// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges);
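+//
+// Same overall structure as wiener_filter7_8bpc_neon above, but with a 5-tap
+// filter the ring buffer only needs rows t4..t0 (x11..x15), hence the
+// smaller 384*2*4 scratch allocation below.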
+function wiener_filter5_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*4
+
+ mov w17, #(1 << 14) - (1 << 2)
+ dup v30.8h, w17
+ movi v31.8h, #8, lsl #8
+
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_5)
+
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter5_h_8bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_8bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x12, x14 // t3
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+
+L(main_5):
+ mov x15, x11 // t0 = t4
+L(main_loop_5):
+ bl wiener_filter5_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_5)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v2_5)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter5_hv_8bpc_neon
+ bl wiener_filter5_hv_8bpc_neon
+L(end_5):
+
+ mov sp, x29
+ ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_5):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter5_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x15, x15, #384*2*3 // t0 += 384*2*3
+ bl wiener_filter5_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_5)
+L(v2_5):
+ bl wiener_filter5_v_8bpc_neon
+ add x0, x0, x1
+ mov x11, x12
+ mov x12, x13
+ mov x13, x14
+L(v1_5):
+ bl wiener_filter5_v_8bpc_neon
+ b L(end_5)
+endfunc
+
+
+function wiener_filter5_h_8bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #2
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+ b 2f
+
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 2x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ shl v22.8h, v17.8h, #7
+ mul v6.8h, v17.8h, v0.h[3]
+ mla v6.8h, v18.8h, v0.h[4]
+ mla v6.8h, v19.8h, v0.h[5]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ shl v23.8h, v17.8h, #7
+ mul v7.8h, v17.8h, v0.h[3]
+ mla v7.8h, v18.8h, v0.h[4]
+ mla v7.8h, v19.8h, v0.h[5]
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+function wiener_filter5_v_8bpc_neon
+ stp x11, x12, [sp, #-48]!
+ stp x13, x14, [sp, #16]
+ stp x0, x4, [sp, #32]
+1:
+ ld1 {v18.8h, v19.8h}, [x12], #32
+ ld1 {v22.8h, v23.8h}, [x14], #32
+ ld1 {v16.8h, v17.8h}, [x11], #32
+
+ add v24.8h, v22.8h, v18.8h
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ add v16.8h, v22.8h, v16.8h
+ add v25.8h, v23.8h, v19.8h
+
+ smull v2.4s, v20.4h, v1.h[3]
+ smlal v2.4s, v24.4h, v1.h[4]
+ smlal v2.4s, v16.4h, v1.h[5]
+ add v17.8h, v23.8h, v17.8h
+ smull2 v3.4s, v20.8h, v1.h[3]
+ smlal2 v3.4s, v24.8h, v1.h[4]
+ smlal2 v3.4s, v16.8h, v1.h[5]
+ smull v4.4s, v21.4h, v1.h[3]
+ smlal v4.4s, v25.4h, v1.h[4]
+ smlal v4.4s, v17.4h, v1.h[5]
+ smull2 v5.4s, v21.8h, v1.h[3]
+ smlal2 v5.4s, v25.8h, v1.h[4]
+ smlal2 v5.4s, v17.8h, v1.h[5]
+ sqrshrun v2.4h, v2.4s, #11
+ sqrshrun2 v2.8h, v3.4s, #11
+ sqrshrun v3.4h, v4.4s, #11
+ sqrshrun2 v3.8h, v5.4s, #11
+ sqxtun v2.8b, v2.8h
+ sqxtun2 v2.16b, v3.8h
+ subs w4, w4, #16
+ st1 {v2.16b}, [x0], #16
+ b.gt 1b
+
+ ldp x0, x4, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #48
+
+ ret
+endfunc
+
+function wiener_filter5_hv_8bpc_neon
+ // Backing up/restoring registers shifted, so that x11 gets the value
+ // of x12, etc, and x15==x11, afterwards.
+ stp x12, x13, [sp, #-64]!
+ stp x14, x15, [sp, #16]
+ stp x12, x0, [sp, #32]
+ stp x3, x4, [sp, #48]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #2
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+ b 2f
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 2x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ shl v22.8h, v17.8h, #7
+ mul v6.8h, v17.8h, v0.h[3]
+ mla v6.8h, v18.8h, v0.h[4]
+ mla v6.8h, v19.8h, v0.h[5]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ shl v23.8h, v17.8h, #7
+ mul v7.8h, v17.8h, v0.h[3]
+ mla v7.8h, v18.8h, v0.h[4]
+ mla v7.8h, v19.8h, v0.h[5]
+
+ ld1 {v18.8h, v19.8h}, [x12], #32
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ ld1 {v24.8h, v25.8h}, [x14], #32
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ ld1 {v16.8h, v17.8h}, [x11], #32
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ add v24.8h, v24.8h, v18.8h
+ add v16.8h, v16.8h, v6.8h
+
+ smull v18.4s, v20.4h, v1.h[3]
+ smlal v18.4s, v24.4h, v1.h[4]
+ smlal v18.4s, v16.4h, v1.h[5]
+ add v25.8h, v25.8h, v19.8h
+ smull2 v19.4s, v20.8h, v1.h[3]
+ smlal2 v19.4s, v24.8h, v1.h[4]
+ smlal2 v19.4s, v16.8h, v1.h[5]
+ add v17.8h, v17.8h, v7.8h
+ smull v20.4s, v21.4h, v1.h[3]
+ smlal v20.4s, v25.4h, v1.h[4]
+ smlal v20.4s, v17.4h, v1.h[5]
+ smull2 v21.4s, v21.8h, v1.h[3]
+ smlal2 v21.4s, v25.8h, v1.h[4]
+ smlal2 v21.4s, v17.8h, v1.h[5]
+ sqrshrun v18.4h, v18.4s, #11
+ sqrshrun2 v18.8h, v19.4s, #11
+ sqrshrun v19.4h, v20.4s, #11
+ sqrshrun2 v19.8h, v21.4s, #11
+ st1 {v6.8h, v7.8h}, [x15], #32
+ sqxtun v18.8b, v18.8h
+ sqxtun2 v18.16b, v19.8h
+ subs w4, w4, #16
+
+ st1 {v18.16b}, [x0], #16
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #48]
+ ldp x15, x0, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #64
+
+ add x3, x3, x1
+ add x0, x0, x1
+
+ ret
+endfunc
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
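+//
+// Per output position this produces 3-pixel horizontal sums of the row and
+// of its squares, roughly (a sketch that ignores the exact left/right edge
+// indexing, which is handled below via the edges flags):
+//
+//   for (int i = 0; i < w; i++) {
+//       sum[i]   = src[i] + src[i + 1] + src[i + 2];
+//       sumsq[i] = src[i] * src[i] + src[i + 1] * src[i + 1]
+//                + src[i + 2] * src[i + 2];
+//   }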
+function sgr_box3_row_h_8bpc_neon, export=1
+ add w4, w4, #2 // w += 2
+
+ tst w5, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x2, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x3, x3, #2
+ ld1 {v0.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.16b}, [x3], #16
+ ld1 {v1.s}[3], [x2]
+ // Move x3 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ ext v0.16b, v1.16b, v0.16b, #14
+ b 2f
+
+1:
+ ld1 {v0.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 2x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ // Move x3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ ext v0.16b, v1.16b, v0.16b, #14
+
+2:
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+ tst w5, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w4, #(2 + 16 - 2 + 1)
+ ldr b30, [x3, w13, sxtw]
+ // Fill v30 with the right padding pixel
+ dup v30.16b, v30.b[0]
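+ // (w includes the +2 from above and x3 has advanced 16 - 2 bytes past
+ // the original src pointer, so the last valid pixel src[w - 3] sits at
+ // x3[w - 17], i.e. x3[w - (2 + 16 - 2 + 1)].)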
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #10
+ b.ge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
+ // again; it's not strictly needed in those cases (we pad enough here),
+ // but keeping the code as simple as possible.
+
+ // Insert padding in v0.b[w] onwards
+ movrel x13, right_ext_mask
+ sub x13, x13, w4, uxtw
+ ld1 {v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v29.16b
+
+ // Update the precalculated squares
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+4: // Loop horizontally
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ uaddl v3.8h, v0.8b, v16.8b
+ ext v20.16b, v1.16b, v2.16b, #2
+ uaddw v3.8h, v3.8h, v17.8b
+
+ ext v21.16b, v1.16b, v2.16b, #4
+
+ uaddl v26.4s, v1.4h, v20.4h
+ uaddl2 v27.4s, v1.8h, v20.8h
+ uaddw v26.4s, v26.4s, v21.4h
+ uaddw2 v27.4s, v27.4s, v21.8h
+
+ subs w4, w4, #8
+
+ st1 {v3.8h}, [x1], #16
+ st1 {v26.4s,v27.4s}, [x0], #32
+
+ b.le 9f
+ tst w5, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x3], #8
+ mov v1.16b, v2.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
+// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
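+//
+// Like sgr_box3_row_h above but with 5-pixel windows, i.e. roughly
+// sum[i] = src[i] + ... + src[i + 4] plus the matching sum of squares, with
+// the edge indexing again handled via the edges flags.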
+function sgr_box5_row_h_8bpc_neon, export=1
+ add w4, w4, #2 // w += 2
+
+ tst w5, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x2, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x3, x3, #3
+ ld1 {v0.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.16b}, [x3], #16
+ ld1 {v1.s}[3], [x2], #4
+ // Move x3 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+ b 2f
+
+1:
+ ld1 {v0.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 3x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ // Move x3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+
+2:
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+ tst w5, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w4, #(2 + 16 - 3 + 1)
+ ldr b30, [x3, w13, sxtw]
+ // Fill v30 with the right padding pixel
+ dup v30.16b, v30.b[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel x13, right_ext_mask, -1
+ sub x13, x13, w4, uxtw
+ ld1 {v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v29.16b
+
+ // Update the precalculated squares
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+4: // Loop horizontally
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ ext v18.16b, v0.16b, v0.16b, #3
+ ext v19.16b, v0.16b, v0.16b, #4
+ uaddl v3.8h, v0.8b, v16.8b
+ uaddl v24.8h, v17.8b, v18.8b
+ uaddw v3.8h, v3.8h, v19.8b
+ add v3.8h, v3.8h, v24.8h
+
+ ext v16.16b, v1.16b, v2.16b, #2
+ ext v17.16b, v1.16b, v2.16b, #4
+ ext v18.16b, v1.16b, v2.16b, #6
+ ext v19.16b, v1.16b, v2.16b, #8
+
+ uaddl v26.4s, v1.4h, v16.4h
+ uaddl2 v27.4s, v1.8h, v16.8h
+ uaddl v16.4s, v17.4h, v18.4h
+ uaddl2 v17.4s, v17.8h, v18.8h
+ uaddw v26.4s, v26.4s, v19.4h
+ uaddw2 v27.4s, v27.4s, v19.8h
+ add v26.4s, v26.4s, v16.4s
+ add v27.4s, v27.4s, v17.4s
+
+ subs w4, w4, #8
+
+ st1 {v3.8h}, [x1], #16
+ st1 {v26.4s,v27.4s}, [x0], #32
+
+ b.le 9f
+ tst w5, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x3], #8
+ mov v1.16b, v2.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
+// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
+// int32_t *sumsq5, int16_t *sum5,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
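+//
+// Combined variant that produces both the 3-pixel sums (sumsq3/sum3) and the
+// 5-pixel sums (sumsq5/sum5) in one pass over the row, reusing the 3-pixel
+// partial sums for the 5-pixel totals; presumably used when both box sizes
+// are needed, as in the mixed SGR filter.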
+function sgr_box35_row_h_8bpc_neon, export=1
+ add w6, w6, #2 // w += 2
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x4, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x5, x5, #3
+ ld1 {v0.16b}, [x5], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.16b}, [x5], #16
+ ld1 {v1.s}[3], [x4], #4
+ // Move x5 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x5, x5, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+ b 2f
+
+1:
+ ld1 {v0.16b}, [x5], #16
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 3x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ // Move x5 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x5, x5, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+
+2:
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w6, #(2 + 16 - 3 + 1)
+ ldr b30, [x5, w13, sxtw]
+ // Fill v30 with the right padding pixel
+ dup v30.16b, v30.b[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w6, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel x13, right_ext_mask, -1
+ sub x13, x13, w6, uxtw
+ ld1 {v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v29.16b
+
+ // Update the precalculated squares
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+4: // Loop horizontally
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ ext v19.16b, v0.16b, v0.16b, #4
+ ext v18.16b, v0.16b, v0.16b, #3
+ uaddl v3.8h, v16.8b, v17.8b
+ uaddl v24.8h, v0.8b, v19.8b
+ uaddw v3.8h, v3.8h, v18.8b
+
+ ext v16.16b, v1.16b, v2.16b, #2
+ ext v17.16b, v1.16b, v2.16b, #4
+ ext v19.16b, v1.16b, v2.16b, #8
+ ext v18.16b, v1.16b, v2.16b, #6
+
+ st1 {v3.8h}, [x1], #16
+ add v3.8h, v3.8h, v24.8h
+
+ uaddl v26.4s, v16.4h, v17.4h
+ uaddl2 v27.4s, v16.8h, v17.8h
+ uaddl v16.4s, v1.4h, v19.4h
+ uaddl2 v17.4s, v1.8h, v19.8h
+ uaddw v26.4s, v26.4s, v18.4h
+ uaddw2 v27.4s, v27.4s, v18.8h
+
+ st1 {v26.4s,v27.4s}, [x0], #32
+ add v26.4s, v26.4s, v16.4s
+ add v27.4s, v27.4s, v17.4s
+
+ subs w6, w6, #8
+
+ st1 {v3.8h}, [x3], #16
+ st1 {v26.4s,v27.4s}, [x2], #32
+
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x5], #8
+ mov v1.16b, v2.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
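+// Instantiate the remaining bitdepth-dependent SGR functions for 8 bpc from
+// the included template (looprestoration_tmpl.S).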
+sgr_funcs 8