1 files changed, 801 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/32/looprestoration16.S b/third_party/dav1d/src/arm/32/looprestoration16.S
new file mode 100644
index 0000000000..d699617a87
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration16.S
@@ -0,0 +1,801 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask:
+        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
+//                                       const pixel *src, ptrdiff_t stride,
+//                                       const int16_t fh[7], const intptr_t w,
+//                                       int h, enum LrEdgeFlags edges,
+//                                       const int bitdepth_max);
+function wiener_filter_h_16bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldrd            r6,  r7,  [sp, #108]
+        ldr             r8,       [sp, #116] // bitdepth_max
+        vld1.16         {q0}, [r4, :128]
+        clz             r8,  r8
+        vmov.i32        q14, #1
+        sub             r9,  r8,  #38  // -(bitdepth + 6)
+        sub             r8,  r8,  #25  // -round_bits_h
+        neg             r9,  r9        // bitdepth + 6
+        vdup.32         q1,  r9
+        vdup.32         q13, r8        // -round_bits_h
+        vmov.i16        q15, #8192
+        vshl.u32        q14, q14, q1   // 1 << (bitdepth + 6)
+        mov             r8,  r5
+        // Calculate mid_stride
+        add             r10, r5,  #7
+        bic             r10, r10, #7
+        lsl             r10, r10, #1
+
+        // Set up pointers for reading/writing alternate rows
+        add             r12, r0,  r10
+        lsl             r10, r10, #1
+        add             lr,  r2,  r3
+        lsl             r3,  r3,  #1
+
+        // Subtract the aligned width from mid_stride
+        add             r11, r5,  #7
+        bic             r11, r11, #7
+        sub             r10, r10, r11, lsl #1
+
+        // Subtract the number of pixels read from the source stride
+        add             r11, r11, #8
+        sub             r3,  r3,  r11, lsl #1
+
+        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             2f
+        // LR_HAVE_LEFT
+        cmp             r1,  #0
+        bne             0f
+        // left == NULL
+        sub             r2,  r2,  #6
+        sub             lr,  lr,  #6
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 3 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             r3,  r3,  #6
+
+
+1:      // Loop vertically
+        vld1.16         {q2, q3}, [r2]!
+        vld1.16         {q4, q5}, [lr]!
+
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             0f
+        cmp             r1,  #0
+        beq             2f
+        // LR_HAVE_LEFT, left != NULL
+        vld1.16         {d3},  [r1]!
+        // Move r2/lr back to account for the last 3 pixels we loaded earlier,
+        // which we'll shift out.
+        sub             r2,  r2,  #6
+        sub             lr,  lr,  #6
+        vld1.16         {d13}, [r1]!
+        vext.8          q3,  q2,  q3,  #10
+        vext.8          q2,  q1,  q2,  #10
+        vext.8          q5,  q4,  q5,  #10
+        vext.8          q4,  q6,  q4,  #10
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
+        // and shift q2/q3 to have 3x the first pixel at the front.
+        vdup.16         q1,  d4[0]
+        vdup.16         q6,  d8[0]
+        // Move r2 back to account for the last 3 pixels we loaded before,
+        // which we shifted out.
+        sub             r2,  r2,  #6
+        sub             lr,  lr,  #6
+        vext.8          q3,  q2,  q3,  #10
+        vext.8          q2,  q1,  q2,  #10
+        vext.8          q5,  q4,  q5,  #10
+        vext.8          q4,  q6,  q4,  #10
+
+2:
+
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             4f
+        // If we'll need to pad the right edge, load that pixel to pad with
+        // here since we can find it pretty easily from here.
+        sub             r9,  r5,  #14
+        lsl             r9,  r9,  #1
+        ldrh            r11, [r2, r9]
+        ldrh            r9,  [lr, r9]
+        // Fill q11/q12 with the right padding pixel
+        vdup.16         q11, r11
+        vdup.16         q12, r9
+3:      // !LR_HAVE_RIGHT
+
+        // Check whether we need to pad the right edge
+        cmp             r5,  #11
+        bge             4f   // If w >= 11, all used input pixels are valid
+
+        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
+        // this ends up called again; it's not strictly needed in those
+        // cases (we pad enough here), but keeping the code as simple as possible.
+
+        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
+        // buffer pointer.
+        movrel_local    r4,  right_ext_mask, -6
+        sub             r4,  r4,  r5,  lsl #1
+        vld1.8          {q9, q10}, [r4]
+
+        vbit            q2,  q11, q9
+        vbit            q3,  q11, q10
+        vbit            q4,  q12, q9
+        vbit            q5,  q12, q10
+
+4:      // Loop horizontally
+        vext.8          q7,  q2,  q3,  #4
+        vext.8          q8,  q2,  q3,  #8
+        vext.8          q6,  q2,  q3,  #2
+        vext.8          q9,  q2,  q3,  #10
+        vadd.i16        q8,  q8,  q7
+        vadd.i16        q9,  q9,  q6
+        vext.8          q6,  q2,  q3,  #12
+        vext.8          q7,  q2,  q3,  #6
+        vadd.i16        q2,  q2,  q6
+        vmull.s16       q6,  d14, d0[3]
+        vmlal.s16       q6,  d16, d1[0]
+        vmlal.s16       q6,  d18, d1[1]
+        vmlal.s16       q6,  d4,  d1[2]
+        vmull.s16       q7,  d15, d0[3]
+        vmlal.s16       q7,  d17, d1[0]
+        vmlal.s16       q7,  d19, d1[1]
+        vmlal.s16       q7,  d5,  d1[2]
+
+        vext.8          q8,  q4,  q5,  #4
+        vext.8          q10, q4,  q5,  #8
+        vext.8          q9,  q4,  q5,  #2
+        vext.8          q2,  q4,  q5,  #10
+        vadd.i16        q10, q10, q8
+        vadd.i16        q2,  q2,  q9
+        vext.8          q8,  q4,  q5,  #12
+        vext.8          q9,  q4,  q5,  #6
+        vadd.i16        q4,  q4,  q8
+        vmull.s16       q8,  d18, d0[3]
+        vmlal.s16       q8,  d20, d1[0]
+        vmlal.s16       q8,  d4,  d1[1]
+        vmlal.s16       q8,  d8,  d1[2]
+        vmull.s16       q9,  d19, d0[3]
+        vmlal.s16       q9,  d21, d1[0]
+        vmlal.s16       q9,  d5,  d1[1]
+        vmlal.s16       q9,  d9,  d1[2]
+
+        vmvn.i16        q10, #0x8000 // 0x7fff = (1 << 15) - 1
+        vadd.i32        q6,  q6,  q14
+        vadd.i32        q7,  q7,  q14
+        vadd.i32        q8,  q8,  q14
+        vadd.i32        q9,  q9,  q14
+        vrshl.s32       q6,  q6,  q13
+        vrshl.s32       q7,  q7,  q13
+        vrshl.s32       q8,  q8,  q13
+        vrshl.s32       q9,  q9,  q13
+        vqmovun.s32     d12, q6
+        vqmovun.s32     d13, q7
+        vqmovun.s32     d14, q8
+        vqmovun.s32     d15, q9
+        vmin.u16        q6,  q6,  q10
+        vmin.u16        q7,  q7,  q10
+        vsub.i16        q6,  q6,  q15
+        vsub.i16        q7,  q7,  q15
+        subs            r5,  r5,  #8
+        vst1.16         {q6}, [r0,  :128]!
+        vst1.16         {q7}, [r12, :128]!
+
+        ble             9f
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        vmov            q2,  q3
+        vmov            q4,  q5
+        vld1.16         {q3}, [r2]!
+        vld1.16         {q5}, [lr]!
+        bne             4b // If we don't need to pad, just keep filtering.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+9:
+        subs            r6,  r6,  #2
+        ble             0f
+        // Jump to the next row and loop horizontally
+        add             r0,  r0,  r10
+        add             r12, r12, r10
+        add             r2,  r2,  r3
+        add             lr,  lr,  r3
+        mov             r5,  r8
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
+//                                       const int16_t *mid, int w, int h,
+//                                       const int16_t fv[7], enum LrEdgeFlags edges,
+//                                       ptrdiff_t mid_stride, const int bitdepth_max);
+function wiener_filter_v_16bpc_neon, export=1
+        push            {r4-r7,lr}
+        vpush           {q4-q5}
+        ldrd            r4,  r5,  [sp, #52]
+        ldrd            r6,  r7,  [sp, #60]
+        ldr             lr,       [sp, #68] // bitdepth_max
+        vld1.16         {q0},  [r5, :128]
+        vdup.16         q5,  lr
+        clz             lr,  lr
+        sub             lr,  lr,  #11   // round_bits_v
+        vdup.32         q4,  lr
+        mov             lr,  r4
+        vneg.s32        q4,  q4         // -round_bits_v
+
+        // Calculate the number of rows to move back when looping vertically
+        mov             r12, r4
+        tst             r6,  #4 // LR_HAVE_TOP
+        beq             0f
+        sub             r2,  r2,  r7, lsl #1
+        add             r12, r12, #2
+0:
+        tst             r6,  #8 // LR_HAVE_BOTTOM
+        beq             1f
+        add             r12, r12, #2
+
+1:      // Start of horizontal loop; start one vertical filter slice.
+        // Load rows into q8-q11 and pad properly.
+        tst             r6,  #4 // LR_HAVE_TOP
+        vld1.16         {q8},  [r2, :128], r7
+        beq             2f
+        // LR_HAVE_TOP
+        vld1.16         {q10}, [r2, :128], r7
+        vmov            q9,  q8
+        vld1.16         {q11}, [r2, :128], r7
+        b               3f
+2:      // !LR_HAVE_TOP
+        vmov            q9,  q8
+        vmov            q10, q8
+        vmov            q11, q8
+
+3:
+        cmp             r4,  #4
+        blt             5f
+        // Start filtering normally; fill in q12-q14 with unique rows.
+        vld1.16         {q12}, [r2, :128], r7
+        vld1.16         {q13}, [r2, :128], r7
+        vld1.16         {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+        subs            r4,  r4,  #1
+        // Interleaving the mul/mla chains actually hurts performance
+        // significantly on Cortex A53, thus keeping mul/mla tightly
+        // chained like this.
+        vmull.s16       q2,  d16, d0[0]
+        vmlal.s16       q2,  d18, d0[1]
+        vmlal.s16       q2,  d20, d0[2]
+        vmlal.s16       q2,  d22, d0[3]
+        vmlal.s16       q2,  d24, d1[0]
+        vmlal.s16       q2,  d26, d1[1]
+        vmlal.s16       q2,  d28, d1[2]
+        vmull.s16       q3,  d17, d0[0]
+        vmlal.s16       q3,  d19, d0[1]
+        vmlal.s16       q3,  d21, d0[2]
+        vmlal.s16       q3,  d23, d0[3]
+        vmlal.s16       q3,  d25, d1[0]
+        vmlal.s16       q3,  d27, d1[1]
+        vmlal.s16       q3,  d29, d1[2]
+        vrshl.s32       q2,  q2,  q4    // round_bits_v
+        vrshl.s32       q3,  q3,  q4
+        vqmovun.s32     d4,  q2
+        vqmovun.s32     d5,  q3
+        vmin.u16        q2,  q2,  q5    // bitdepth_max
+        vst1.16         {q2}, [r0, :128], r1
+.if \compare
+        cmp             r4,  #4
+.else
+        ble             9f
+.endif
+        vmov            q8,  q9
+        vmov            q9,  q10
+        vmov            q10, q11
+        vmov            q11, q12
+        vmov            q12, q13
+        vmov            q13, q14
+.endm
+        filter          1
+        blt             7f
+        vld1.16         {q14}, [r2, :128], r7
+        b               4b
+
+5:      // Less than 4 rows in total; not all of q12-q13 are filled yet.
+        tst             r6,  #8 // LR_HAVE_BOTTOM
+        beq             6f
+        // LR_HAVE_BOTTOM
+        cmp             r4,  #2
+        // We load at least 2 rows in all cases.
+        vld1.16         {q12}, [r2, :128], r7
+        vld1.16         {q13}, [r2, :128], r7
+        bgt             53f // 3 rows in total
+        beq             52f // 2 rows in total
+51:     // 1 row in total, q11 already loaded, load edge into q12-q14.
+        vmov            q13, q12
+        b               8f
+52:     // 2 rows in total, q11 already loaded, load q12 with content data
+        // and 2 rows of edge.
+        vld1.16         {q14}, [r2, :128], r7
+        vmov            q15, q14
+        b               8f
+53:
+        // 3 rows in total, q11 already loaded, load q12 and q13 with content
+        // and 2 rows of edge.
+        vld1.16         {q14}, [r2, :128], r7
+        vld1.16         {q15}, [r2, :128], r7
+        vmov            q1,  q15
+        b               8f
+
+6:
+        // !LR_HAVE_BOTTOM
+        cmp             r4,  #2
+        bgt             63f // 3 rows in total
+        beq             62f // 2 rows in total
+61:     // 1 row in total, q11 already loaded, pad that into q12-q14.
+        vmov            q12, q11
+        vmov            q13, q11
+        vmov            q14, q11
+        b               8f
+62:     // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+        vld1.16         {q12}, [r2, :128], r7
+        vmov            q13, q12
+        vmov            q14, q12
+        vmov            q15, q12
+        b               8f
+63:
+        // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+        vld1.16         {q12}, [r2, :128], r7
+        vld1.16         {q13}, [r2, :128], r7
+        vmov            q14, q13
+        vmov            q15, q13
+        vmov            q1,  q13
+        b               8f
+
+7:
+        // All registers up to q13 are filled already, 3 valid rows left.
+        // < 4 valid rows left; fill in padding and filter the last
+        // few rows.
+        tst             r6,  #8 // LR_HAVE_BOTTOM
+        beq             71f
+        // LR_HAVE_BOTTOM; load 2 rows of edge.
+        vld1.16         {q14}, [r2, :128], r7
+        vld1.16         {q15}, [r2, :128], r7
+        vmov            q1,  q15
+        b               8f
+71:
+        // !LR_HAVE_BOTTOM, pad 3 rows
+        vmov            q14, q13
+        vmov            q15, q13
+        vmov            q1,  q13
+
+8:      // At this point, all registers up to q14-q15,q1 are loaded with
+        // edge/padding (depending on how many rows are left).
+        filter          0 // This branches to 9f when done
+        vmov            q14, q15
+        vmov            q15, q1
+        b               8b
+
+9:      // End of one vertical slice.
+        subs            r3,  r3,  #8
+        ble             0f
+        // Move pointers back up to the top and loop horizontally.
+        mls             r0,  r1,  lr,  r0
+        mls             r2,  r7,  r12, r2
+        add             r0,  r0,  #16
+        add             r2,  r2,  #16
+        mov             r4,  lr
+        b               1b
+
+0:
+        vpop            {q4-q5}
+        pop             {r4-r7,pc}
+.purgem filter
+endfunc
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+//                                  const pixel (*left)[4],
+//                                  const pixel *src, const ptrdiff_t stride,
+//                                  const int w, const int h,
+//                                  const enum LrEdgeFlags edges);
+function sgr_box3_h_16bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldrd            r6,  r7,  [sp, #108]
+        add             r5,  r5,  #2 // w += 2
+
+        // Set up pointers for reading/writing alternate rows
+        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
+        add             r11, r1,  #(2*SUM_STRIDE)   // sum
+        add             r12, r3,  r4                // src
+        lsl             r4,  r4,  #1
+        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride
+
+        // Subtract the aligned width from the output stride.
+        add             lr,  r5,  #7
+        bic             lr,  lr,  #7
+        sub             r9,  r9,  lr, lsl #1
+
+        // Store the width for the vertical loop
+        mov             r8,  r5
+
+        // Subtract the number of pixels read from the input from the stride
+        add             lr,  lr,  #8
+        sub             r4,  r4,  lr, lsl #1
+
+        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             2f
+        // LR_HAVE_LEFT
+        cmp             r2,  #0
+        bne             0f
+        // left == NULL
+        sub             r3,  r3,  #4
+        sub             r12, r12, #4
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 2 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             r4,  r4,  #4
+
+
+1:      // Loop vertically
+        vld1.16         {q0, q1}, [r3]!
+        vld1.16         {q4, q5}, [r12]!
+
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             0f
+        cmp             r2,  #0
+        beq             2f
+        // LR_HAVE_LEFT, left != NULL
+        vld1.16         {d5}, [r2]!
+        // Move r3/r12 back to account for the last 2 pixels we loaded earlier,
+        // which we'll shift out.
+        sub             r3,  r3,  #4
+        sub             r12, r12, #4
+        vld1.16         {d13}, [r2]!
+        vext.8          q1,  q0,  q1,  #12
+        vext.8          q0,  q2,  q0,  #12
+        vext.8          q5,  q4,  q5,  #12
+        vext.8          q4,  q6,  q4,  #12
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+        // and shift q0 to have 2x the first byte at the front.
+        vdup.16         q2,  d0[0]
+        vdup.16         q6,  d8[0]
+        // Move r3 back to account for the last 2 pixels we loaded before,
+        // which we shifted out.
+        sub             r3,  r3,  #4
+        sub             r12, r12, #4
+        vext.8          q1,  q0,  q1,  #12
+        vext.8          q0,  q2,  q0,  #12
+        vext.8          q5,  q4,  q5,  #12
+        vext.8          q4,  q6,  q4,  #12
+
+2:
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             4f
+        // If we'll need to pad the right edge, load that pixel to pad with
+        // here since we can find it pretty easily from here.
+        sub             lr,  r5,  #(2 + 16 - 2 + 1)
+        lsl             lr,  lr,  #1
+        ldrh            r11, [r3,  lr]
+        ldrh            lr,  [r12, lr]
+        // Fill q14/q15 with the right padding pixel
+        vdup.16         q14, r11
+        vdup.16         q15, lr
+        // Restore r11 after using it for a temporary value
+        add             r11, r1,  #(2*SUM_STRIDE)
+3:      // !LR_HAVE_RIGHT
+
+        // Check whether we need to pad the right edge
+        cmp             r5,  #10
+        bge             4f   // If w >= 10, all used input pixels are valid
+
+        // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
+        // again; it's not strictly needed in those cases (we pad enough here),
+        // but keeping the code as simple as possible.
+
+        // Insert padding in q0/1.h[w] onwards
+        movrel_local    lr,  right_ext_mask
+        sub             lr,  lr,  r5,  lsl #1
+        vld1.8          {q12, q13}, [lr]
+
+        vbit            q0,  q14, q12
+        vbit            q1,  q14, q13
+        vbit            q4,  q15, q12
+        vbit            q5,  q15, q13
+
+4:      // Loop horizontally
+        vext.8          q8,  q0,  q1,  #2
+        vext.8          q10, q4,  q5,  #2
+        vext.8          q9,  q0,  q1,  #4
+        vext.8          q11, q4,  q5,  #4
+        vadd.i16        q2,  q0,  q8
+        vadd.i16        q3,  q4,  q10
+        vadd.i16        q2,  q2,  q9
+        vadd.i16        q3,  q3,  q11
+
+        vmull.u16       q6,  d0,  d0
+        vmlal.u16       q6,  d16, d16
+        vmlal.u16       q6,  d18, d18
+        vmull.u16       q12, d8,  d8
+        vmlal.u16       q12, d20, d20
+        vmlal.u16       q12, d22, d22
+        vmull.u16       q7,  d1,  d1
+        vmlal.u16       q7,  d17, d17
+        vmlal.u16       q7,  d19, d19
+        vmull.u16       q13, d9,  d9
+        vmlal.u16       q13, d21, d21
+        vmlal.u16       q13, d23, d23
+        subs            r5,  r5,  #8
+        vst1.16         {q2},       [r1,  :128]!
+        vst1.16         {q3},       [r11, :128]!
+        vst1.32         {q6,  q7},  [r0,  :128]!
+        vst1.32         {q12, q13}, [r10, :128]!
+
+        ble             9f
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        vmov            q0,  q1
+        vmov            q4,  q5
+        vld1.16         {q1}, [r3]!
+        vld1.16         {q5}, [r12]!
+
+        bne             4b // If we don't need to pad, just keep summing.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+9:
+        subs            r6,  r6,  #2
+        ble             0f
+        // Jump to the next row and loop horizontally
+        add             r0,  r0,  r9, lsl #1
+        add             r10, r10, r9, lsl #1
+        add             r1,  r1,  r9
+        add             r11, r11, r9
+        add             r3,  r3,  r4
+        add             r12, r12, r4
+        mov             r5,  r8
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+//                                  const pixel (*left)[4],
+//                                  const pixel *src, const ptrdiff_t stride,
+//                                  const int w, const int h,
+//                                  const enum LrEdgeFlags edges);
+function sgr_box5_h_16bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldrd            r6,  r7,  [sp, #108]
+        add             r5,  r5,  #2 // w += 2
+
+        // Set up pointers for reading/writing alternate rows
+        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
+        add             r11, r1,  #(2*SUM_STRIDE)   // sum
+        add             r12, r3,  r4                // src
+        lsl             r4,  r4,  #1
+        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride
+
+        // Subtract the aligned width from the output stride.
+        add             lr,  r5,  #7
+        bic             lr,  lr,  #7
+        sub             r9,  r9,  lr, lsl #1
+        add             lr,  lr,  #8
+        sub             r4,  r4,  lr, lsl #1
+
+        // Store the width for the vertical loop
+        mov             r8,  r5
+
+        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             2f
+        // LR_HAVE_LEFT
+        cmp             r2,  #0
+        bne             0f
+        // left == NULL
+        sub             r3,  r3,  #6
+        sub             r12, r12, #6
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 3 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             r4,  r4,  #6
+
+1:      // Loop vertically
+        vld1.16         {q0, q1}, [r3]!
+        vld1.16         {q4, q5}, [r12]!
+
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             0f
+        cmp             r2,  #0
+        beq             2f
+        // LR_HAVE_LEFT, left != NULL
+        vld1.16         {d5}, [r2]!
+        // Move r3/r12 back to account for the last 3 pixels we loaded earlier,
+        // which we'll shift out.
+        sub             r3,  r3,  #6
+        sub             r12, r12, #6
+        vld1.16         {d13}, [r2]!
+        vext.8          q1,  q0,  q1,  #10
+        vext.8          q0,  q2,  q0,  #10
+        vext.8          q5,  q4,  q5,  #10
+        vext.8          q4,  q6,  q4,  #10
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+        // and shift q0 to have 3x the first pixel at the front.
+        vdup.16         q2,  d0[0]
+        vdup.16         q6,  d8[0]
+        // Move r3 back to account for the last 3 pixels we loaded before,
+        // which we shifted out.
+        sub             r3,  r3,  #6
+        sub             r12, r12, #6
+        vext.8          q1,  q0,  q1,  #10
+        vext.8          q0,  q2,  q0,  #10
+        vext.8          q5,  q4,  q5,  #10
+        vext.8          q4,  q6,  q4,  #10
+
+2:
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             4f
+        // If we'll need to pad the right edge, load that pixel to pad with
+        // here since we can find it pretty easily from here.
+        sub             lr,  r5,  #(2 + 16 - 3 + 1)
+        lsl             lr,  lr,  #1
+        ldrh            r11, [r3,  lr]
+        ldrh            lr,  [r12, lr]
+        // Fill q14/q15 with the right padding pixel
+        vdup.16         q14, r11
+        vdup.16         q15, lr
+        // Restore r11 after using it for a temporary value
+        add             r11, r1,  #(2*SUM_STRIDE)
+3:      // !LR_HAVE_RIGHT
+
+        // Check whether we need to pad the right edge
+        cmp             r5,  #11
+        bge             4f   // If w >= 11, all used input pixels are valid
+
+        // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
+        // this ends up called again; it's not strictly needed in those
+        // cases (we pad enough here), but keeping the code as simple as possible.
+
+        // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
+        // buffer pointer.
+        movrel_local    lr,  right_ext_mask, -2
+        sub             lr,  lr,  r5,  lsl #1
+        vld1.8          {q12, q13}, [lr]
+
+        vbit            q0,  q14, q12
+        vbit            q1,  q14, q13
+        vbit            q4,  q15, q12
+        vbit            q5,  q15, q13
+
+4:      // Loop horizontally
+        vext.8          q8,  q0,  q1,  #2
+        vext.8          q10, q4,  q5,  #2
+        vext.8          q9,  q0,  q1,  #4
+        vext.8          q11, q4,  q5,  #4
+        vadd.i16        q2,  q0,  q8
+        vadd.i16        q3,  q4,  q10
+        vadd.i16        q2,  q2,  q9
+        vadd.i16        q3,  q3,  q11
+
+        vmull.u16       q6,  d0,  d0
+        vmlal.u16       q6,  d16, d16
+        vmlal.u16       q6,  d18, d18
+        vmull.u16       q12, d8,  d8
+        vmlal.u16       q12, d20, d20
+        vmlal.u16       q12, d22, d22
+        vmull.u16       q7,  d1,  d1
+        vmlal.u16       q7,  d17, d17
+        vmlal.u16       q7,  d19, d19
+        vmull.u16       q13, d9,  d9
+        vmlal.u16       q13, d21, d21
+        vmlal.u16       q13, d23, d23
+
+        vext.8          q8,  q0,  q1,  #6
+        vext.8          q10, q4,  q5,  #6
+        vext.8          q9,  q0,  q1,  #8
+        vext.8          q11, q4,  q5,  #8
+        vadd.i16        q2,  q2,  q8
+        vadd.i16        q3,  q3,  q10
+        vadd.i16        q2,  q2,  q9
+        vadd.i16        q3,  q3,  q11
+
+        vmlal.u16       q6,  d16, d16
+        vmlal.u16       q6,  d1,  d1
+        vmlal.u16       q12, d20, d20
+        vmlal.u16       q12, d9,  d9
+        vmlal.u16       q7,  d17, d17
+        vmlal.u16       q7,  d19, d19
+        vmlal.u16       q13, d21, d21
+        vmlal.u16       q13, d23, d23
+
+        subs            r5,  r5,  #8
+        vst1.16         {q2},       [r1,  :128]!
+        vst1.16         {q3},       [r11, :128]!
+        vst1.32         {q6,  q7},  [r0,  :128]!
+        vst1.32         {q12, q13}, [r10, :128]!
+
+        ble             9f
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        vmov            q0,  q1
+        vmov            q4,  q5
+        vld1.16         {q1}, [r3]!
+        vld1.16         {q5}, [r12]!
+        bne             4b // If we don't need to pad, just keep summing.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+9:
+        subs            r6,  r6,  #2
+        ble             0f
+        // Jump to the next row and loop horizontally
+        add             r0,  r0,  r9, lsl #1
+        add             r10, r10, r9, lsl #1
+        add             r1,  r1,  r9
+        add             r11, r11, r9
+        add             r3,  r3,  r4
+        add             r12, r12, r4
+        mov             r5,  r8
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+sgr_funcs 16