path: root/third_party/dav1d/src/arm/64/looprestoration16.S
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
commit     2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
tree       b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/dav1d/src/arm/64/looprestoration16.S
parent     Initial commit. (diff)
Adding upstream version 86.0.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/arm/64/looprestoration16.S')
-rw-r--r--  third_party/dav1d/src/arm/64/looprestoration16.S | 1239
1 file changed, 1239 insertions(+), 0 deletions(-)
diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S
new file mode 100644
index 0000000000..437988cfac
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration16.S
@@ -0,0 +1,1239 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
+// const pixel *src, ptrdiff_t stride,
+// const int16_t fh[7], const intptr_t w,
+// int h, enum LrEdgeFlags edges,
+// const int bitdepth_max);
+function wiener_filter_h_16bpc_neon, export=1
+ ldr w8, [sp] // bitdepth_max
+ ld1 {v0.8h}, [x4]
+ clz w8, w8
+ movi v30.4s, #1
+ sub w9, w8, #38 // -(bitdepth + 6)
+ sub w8, w8, #25 // -round_bits_h
+ neg w9, w9 // bitdepth + 6
+ dup v1.4s, w9
+ dup v29.4s, w8 // -round_bits_h
+ movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
+ ushl v30.4s, v30.4s, v1.4s // 1 << (bitdepth + 6)
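+ // E.g. at 10 bpc (bitdepth_max == 0x3ff): clz gives 22, so the two
+ // subtractions above yield bitdepth + 6 == 16 and round_bits_h == 3
+ // (v30 = 1 << 16, v29 = -3 for the later srshl). At 12 bpc the same
+ // arithmetic gives 18 and 5.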
+ mov w8, w5
+ // Calculate mid_stride
+ add w10, w5, #7
+ bic w10, w10, #7
+ lsl w10, w10, #1
+
+ // Clear the last unused element of v0, to allow filtering a single
+ // pixel with one plain mul+addv.
+ ins v0.h[7], wzr
+
+ // Set up pointers for reading/writing alternate rows
+ add x12, x0, x10
+ lsl w10, w10, #1
+ add x13, x2, x3
+ lsl x3, x3, #1
+
+ // Subtract the width from mid_stride
+ sub x10, x10, w5, uxtw #1
+
+ // For w >= 8, we read ((w+5)&~7)+8 pixels, for w < 8 we read 16 pixels.
+ cmp w5, #8
+ add w11, w5, #13
+ bic w11, w11, #7
+ b.ge 1f
+ mov w11, #16
+1:
+ sub x3, x3, w11, uxtw #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 2f
+ // LR_HAVE_LEFT
+ cbnz x1, 0f
+ // left == NULL
+ sub x2, x2, #6
+ sub x13, x13, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add x3, x3, #6
+
+
+1: // Loop vertically
+ ld1 {v2.8h, v3.8h}, [x2], #32
+ ld1 {v4.8h, v5.8h}, [x13], #32
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 0f
+ cbz x1, 2f
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v1.d}[1], [x1], #8
+ // Move x2/x13 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x2, x2, #6
+ sub x13, x13, #6
+ ld1 {v6.d}[1], [x1], #8
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v1.16b, v2.16b, #10
+ ext v5.16b, v4.16b, v5.16b, #10
+ ext v4.16b, v6.16b, v4.16b, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill v1 with the leftmost pixel
+ // and shift v2/v3 to have 3x the first pixel at the front.
+ dup v1.8h, v2.h[0]
+ dup v6.8h, v4.h[0]
+ // Move x2 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x2, x2, #6
+ sub x13, x13, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v1.16b, v2.16b, #10
+ ext v5.16b, v4.16b, v5.16b, #10
+ ext v4.16b, v6.16b, v4.16b, #10
+
+2:
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub w9, w5, #14
+ ldr h27, [x2, w9, sxtw #1]
+ ldr h28, [x13, w9, sxtw #1]
+ // Fill v27/v28 with the right padding pixel
+ dup v27.8h, v27.h[0]
+ dup v28.8h, v28.h[0]
+3: // !LR_HAVE_RIGHT
+ // If we'll have to pad the right edge we need to quit early here.
+ cmp w5, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+ cmp w5, #7
+ b.ge 5f // If w >= 7, we can filter 4 pixels
+ b 6f
+
+4: // Loop horizontally
+.macro ushll_sz d0, d1, src, shift, wd
+ ushll \d0\().4s, \src\().4h, \shift
+.ifc \wd, .8h
+ ushll2 \d1\().4s, \src\().8h, \shift
+.endif
+.endm
+.macro add_sz d0, d1, s0, s1, c, wd
+ add \d0\().4s, \s0\().4s, \c\().4s
+.ifc \wd, .8h
+ add \d1\().4s, \s1\().4s, \c\().4s
+.endif
+.endm
+.macro srshl_sz d0, d1, s0, s1, c, wd
+ srshl \d0\().4s, \s0\().4s, \c\().4s
+.ifc \wd, .8h
+ srshl \d1\().4s, \s1\().4s, \c\().4s
+.endif
+.endm
+.macro sqxtun_sz dst, s0, s1, wd
+ sqxtun \dst\().4h, \s0\().4s
+.ifc \wd, .8h
+ sqxtun2 \dst\().8h, \s1\().4s
+.endif
+.endm
+
+.macro filter wd
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v20.16b, v2.16b, v3.16b, #10
+ ushll_sz v6, v7, v18, #7, \wd
+ ext v21.16b, v2.16b, v3.16b, #12
+ smlal v6.4s, v2.4h, v0.h[0]
+ smlal v6.4s, v16.4h, v0.h[1]
+ smlal v6.4s, v17.4h, v0.h[2]
+ smlal v6.4s, v18.4h, v0.h[3]
+ smlal v6.4s, v19.4h, v0.h[4]
+ smlal v6.4s, v20.4h, v0.h[5]
+ smlal v6.4s, v21.4h, v0.h[6]
+.ifc \wd, .8h
+ smlal2 v7.4s, v2.8h, v0.h[0]
+ smlal2 v7.4s, v16.8h, v0.h[1]
+ smlal2 v7.4s, v17.8h, v0.h[2]
+ smlal2 v7.4s, v18.8h, v0.h[3]
+ smlal2 v7.4s, v19.8h, v0.h[4]
+ smlal2 v7.4s, v20.8h, v0.h[5]
+ smlal2 v7.4s, v21.8h, v0.h[6]
+.endif
+ ext v21.16b, v4.16b, v5.16b, #6
+ ext v19.16b, v4.16b, v5.16b, #2
+ ext v20.16b, v4.16b, v5.16b, #4
+ ext v22.16b, v4.16b, v5.16b, #8
+ ext v23.16b, v4.16b, v5.16b, #10
+ ushll_sz v16, v17, v21, #7, \wd
+ ext v24.16b, v4.16b, v5.16b, #12
+ smlal v16.4s, v4.4h, v0.h[0]
+ smlal v16.4s, v19.4h, v0.h[1]
+ smlal v16.4s, v20.4h, v0.h[2]
+ smlal v16.4s, v21.4h, v0.h[3]
+ smlal v16.4s, v22.4h, v0.h[4]
+ smlal v16.4s, v23.4h, v0.h[5]
+ smlal v16.4s, v24.4h, v0.h[6]
+.ifc \wd, .8h
+ smlal2 v17.4s, v4.8h, v0.h[0]
+ smlal2 v17.4s, v19.8h, v0.h[1]
+ smlal2 v17.4s, v20.8h, v0.h[2]
+ smlal2 v17.4s, v21.8h, v0.h[3]
+ smlal2 v17.4s, v22.8h, v0.h[4]
+ smlal2 v17.4s, v23.8h, v0.h[5]
+ smlal2 v17.4s, v24.8h, v0.h[6]
+.endif
+ mvni v24\wd, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add_sz v6, v7, v6, v7, v30, \wd
+ add_sz v16, v17, v16, v17, v30, \wd
+ srshl_sz v6, v7, v6, v7, v29, \wd
+ srshl_sz v16, v17, v16, v17, v29, \wd
+ sqxtun_sz v6, v6, v7, \wd
+ sqxtun_sz v7, v16, v17, \wd
+ umin v6\wd, v6\wd, v24\wd
+ umin v7\wd, v7\wd, v24\wd
+ sub v6\wd, v6\wd, v31\wd
+ sub v7\wd, v7\wd, v31\wd
+.endm
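+ // Per output pixel x, the macro above is roughly equivalent to this C
+ // sketch (illustrative only; clip() is a hypothetical clamping helper):
+ //
+ //   int sum = 1 << (bitdepth + 6);
+ //   sum += src[x] << 7;                   // extra weight on the centre tap
+ //   for (int k = 0; k < 7; k++)
+ //       sum += src[x + k - 3] * fh[k];
+ //   sum = (sum + (1 << (round_bits_h - 1))) >> round_bits_h;
+ //   dst[x] = clip(sum, 0, 0x7fff) - 8192; // stored as signed int16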
+ filter .8h
+ st1 {v6.8h}, [x0], #16
+ st1 {v7.8h}, [x12], #16
+
+ subs w5, w5, #8
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ mov v2.16b, v3.16b
+ mov v4.16b, v5.16b
+ ld1 {v3.8h}, [x2], #16
+ ld1 {v5.8h}, [x13], #16
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+5: // Filter 4 pixels, 7 <= w < 11
+ filter .4h
+ st1 {v6.4h}, [x0], #8
+ st1 {v7.4h}, [x12], #8
+
+ subs w5, w5, #4 // 3 <= w < 7
+ ext v2.16b, v2.16b, v3.16b, #8
+ ext v3.16b, v3.16b, v3.16b, #8
+ ext v4.16b, v4.16b, v5.16b, #8
+ ext v5.16b, v5.16b, v5.16b, #8
+
+6: // Pad the right edge and filter the last few pixels.
+ // w < 7, w+3 pixels valid in v2-v3
+ cmp w5, #5
+ b.lt 7f
+ b.gt 8f
+ // w == 5, 8 pixels valid in v2, v3 invalid
+ mov v3.16b, v27.16b
+ mov v5.16b, v28.16b
+ b 88f
+
+7: // 1 <= w < 5, 4-7 pixels valid in v2
+ sub w9, w5, #1
+ // w9 = (pixels valid - 4)
+ adr x11, L(variable_shift_tbl)
+ ldrh w9, [x11, w9, uxtw #1]
+ sub x11, x11, w9, uxth
+ mov v3.16b, v27.16b
+ mov v5.16b, v28.16b
+ br x11
+44: // 4 pixels valid in v2/v4, fill the high half with padding.
+ ins v2.d[1], v3.d[0]
+ ins v4.d[1], v5.d[0]
+ b 88f
+ // Shift v2 right, shifting out invalid pixels,
+ // shift v2 left to the original offset, shifting in padding pixels.
+55: // 5 pixels valid
+ ext v2.16b, v2.16b, v2.16b, #10
+ ext v2.16b, v2.16b, v3.16b, #6
+ ext v4.16b, v4.16b, v4.16b, #10
+ ext v4.16b, v4.16b, v5.16b, #6
+ b 88f
+66: // 6 pixels valid, fill the upper 2 pixels with padding.
+ ins v2.s[3], v3.s[0]
+ ins v4.s[3], v5.s[0]
+ b 88f
+77: // 7 pixels valid, fill the last pixel with padding.
+ ins v2.h[7], v3.h[0]
+ ins v4.h[7], v5.h[0]
+ b 88f
+
+L(variable_shift_tbl):
+ .hword L(variable_shift_tbl) - 44b
+ .hword L(variable_shift_tbl) - 55b
+ .hword L(variable_shift_tbl) - 66b
+ .hword L(variable_shift_tbl) - 77b
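+ // Each entry above is the distance from the table back to its handler;
+ // the ldrh/sub above the br turns w9 = (pixels valid - 4) into that
+ // handler's address. E.g. with 5 valid pixels, w9 == 1 selects the
+ // second entry and the sub lands on label 55 above.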
+
+8: // w > 5, w == 6, 9 pixels valid in v2-v3, 1 pixel valid in v3
+ ins v27.h[0], v3.h[0]
+ ins v28.h[0], v5.h[0]
+ mov v3.16b, v27.16b
+ mov v5.16b, v28.16b
+
+88:
+ // w < 7, v2-v3 padded properly
+ cmp w5, #4
+ b.lt 888f
+
+ // w >= 4, filter 4 pixels
+ filter .4h
+ st1 {v6.4h}, [x0], #8
+ st1 {v7.4h}, [x12], #8
+ subs w5, w5, #4 // 0 <= w < 4
+ ext v2.16b, v2.16b, v3.16b, #8
+ ext v4.16b, v4.16b, v5.16b, #8
+ b.eq 9f
+888: // 1 <= w < 4, filter 1 pixel at a time
+ smull v6.4s, v2.4h, v0.4h
+ smull2 v7.4s, v2.8h, v0.8h
+ smull v16.4s, v4.4h, v0.4h
+ smull2 v17.4s, v4.8h, v0.8h
+ add v6.4s, v6.4s, v7.4s
+ add v16.4s, v16.4s, v17.4s
+ addv s6, v6.4s
+ addv s7, v16.4s
+ dup v16.4h, v2.h[3]
+ ins v16.h[1], v4.h[3]
+ ins v6.s[1], v7.s[0]
+ mvni v24.4h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ ushll v16.4s, v16.4h, #7
+ add v6.2s, v6.2s, v30.2s
+ add v6.2s, v6.2s, v16.2s
+ srshl v6.2s, v6.2s, v29.2s
+ sqxtun v6.4h, v6.4s
+ umin v6.4h, v6.4h, v24.4h
+ sub v6.4h, v6.4h, v31.4h
+ st1 {v6.h}[0], [x0], #2
+ st1 {v6.h}[1], [x12], #2
+ subs w5, w5, #1
+ ext v2.16b, v2.16b, v3.16b, #2
+ ext v4.16b, v4.16b, v5.16b, #2
+ b.gt 888b
+
+9:
+ subs w6, w6, #2
+ b.le 0f
+ // Jump to the next row and loop horizontally
+ add x0, x0, x10
+ add x12, x12, x10
+ add x2, x2, x3
+ add x13, x13, x3
+ mov w5, w8
+ b 1b
+0:
+ ret
+.purgem filter
+endfunc
+
+// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
+// const int16_t *mid, int w, int h,
+// const int16_t fv[7], enum LrEdgeFlags edges,
+// ptrdiff_t mid_stride, const int bitdepth_max);
+function wiener_filter_v_16bpc_neon, export=1
+ ldr w8, [sp] // bitdepth_max
+ ld1 {v0.8h}, [x5]
+ dup v31.8h, w8
+ clz w8, w8
+ movi v1.8h, #128
+ sub w8, w8, #11 // round_bits_v
+ add v1.8h, v1.8h, v0.8h
+ dup v30.4s, w8
+ mov w8, w4
+ neg v30.4s, v30.4s // -round_bits_v
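+ // As in the horizontal pass, the shift amount comes from clz of
+ // bitdepth_max: at 10 bpc, clz(0x3ff) - 11 == 11, i.e. round_bits_v == 11
+ // (9 at 12 bpc). Only lane 3 of v1 (fv + 128) is used below, so the
+ // centre tap is effectively fv[3] + 128, the same extra centre weight the
+ // horizontal pass applies via its << 7.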
+
+ // Calculate the number of rows to move back when looping vertically
+ mov w11, w4
+ tst w6, #4 // LR_HAVE_TOP
+ b.eq 0f
+ sub x2, x2, x7, lsl #1
+ add w11, w11, #2
+0:
+ tst w6, #8 // LR_HAVE_BOTTOM
+ b.eq 1f
+ add w11, w11, #2
+
+1: // Start of horizontal loop; start one vertical filter slice.
+ // Load rows into v16-v19 and pad properly.
+ tst w6, #4 // LR_HAVE_TOP
+ ld1 {v16.8h}, [x2], x7
+ b.eq 2f
+ // LR_HAVE_TOP
+ ld1 {v18.8h}, [x2], x7
+ mov v17.16b, v16.16b
+ ld1 {v19.8h}, [x2], x7
+ b 3f
+2: // !LR_HAVE_TOP
+ mov v17.16b, v16.16b
+ mov v18.16b, v16.16b
+ mov v19.16b, v16.16b
+
+3:
+ cmp w4, #4
+ b.lt 5f
+ // Start filtering normally; fill in v20-v22 with unique rows.
+ ld1 {v20.8h}, [x2], x7
+ ld1 {v21.8h}, [x2], x7
+ ld1 {v22.8h}, [x2], x7
+
+4:
+.macro filter compare
+ subs w4, w4, #1
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ smull v2.4s, v16.4h, v0.h[0]
+ smlal v2.4s, v17.4h, v0.h[1]
+ smlal v2.4s, v18.4h, v0.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v0.h[4]
+ smlal v2.4s, v21.4h, v0.h[5]
+ smlal v2.4s, v22.4h, v0.h[6]
+ smull2 v3.4s, v16.8h, v0.h[0]
+ smlal2 v3.4s, v17.8h, v0.h[1]
+ smlal2 v3.4s, v18.8h, v0.h[2]
+ smlal2 v3.4s, v19.8h, v1.h[3]
+ smlal2 v3.4s, v20.8h, v0.h[4]
+ smlal2 v3.4s, v21.8h, v0.h[5]
+ smlal2 v3.4s, v22.8h, v0.h[6]
+ srshl v2.4s, v2.4s, v30.4s // round_bits_v
+ srshl v3.4s, v3.4s, v30.4s
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ umin v2.8h, v2.8h, v31.8h // bitdepth_max
+ st1 {v2.8h}, [x0], x1
+.if \compare
+ cmp w4, #4
+.else
+ b.le 9f
+.endif
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+ mov v18.16b, v19.16b
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ mov v21.16b, v22.16b
+.endm
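+ // Each invocation of the macro produces one output row: a 7-tap vertical
+ // convolution over v16-v22, a rounding shift by round_bits_v, a clamp to
+ // [0, bitdepth_max] (sqxtun + umin with v31), and a one-register rotation
+ // so the row window slides down by one line.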
+ filter 1
+ b.lt 7f
+ ld1 {v22.8h}, [x2], x7
+ b 4b
+
+5: // Less than 4 rows in total; v20-v22 are not filled yet.
+ tst w6, #8 // LR_HAVE_BOTTOM
+ b.eq 6f
+ // LR_HAVE_BOTTOM
+ cmp w4, #2
+ // We load at least 2 rows in all cases.
+ ld1 {v20.8h}, [x2], x7
+ ld1 {v21.8h}, [x2], x7
+ b.gt 53f // 3 rows in total
+ b.eq 52f // 2 rows in total
+51: // 1 row in total, v19 already loaded, load edge into v20-v22.
+ mov v22.16b, v21.16b
+ b 8f
+52: // 2 rows in total, v19 already loaded, load v20 with content data
+ // and 2 rows of edge.
+ ld1 {v22.8h}, [x2], x7
+ mov v23.16b, v22.16b
+ b 8f
+53:
+ // 3 rows in total, v19 already loaded, load v20 and v21 with content
+ // and 2 rows of edge.
+ ld1 {v22.8h}, [x2], x7
+ ld1 {v23.8h}, [x2], x7
+ mov v24.16b, v23.16b
+ b 8f
+
+6:
+ // !LR_HAVE_BOTTOM
+ cmp w4, #2
+ b.gt 63f // 3 rows in total
+ b.eq 62f // 2 rows in total
+61: // 1 row in total, v19 already loaded, pad that into v20-v22.
+ mov v20.16b, v19.16b
+ mov v21.16b, v19.16b
+ mov v22.16b, v19.16b
+ b 8f
+62: // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23.
+ ld1 {v20.8h}, [x2], x7
+ mov v21.16b, v20.16b
+ mov v22.16b, v20.16b
+ mov v23.16b, v20.16b
+ b 8f
+63:
+ // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24.
+ ld1 {v20.8h}, [x2], x7
+ ld1 {v21.8h}, [x2], x7
+ mov v22.16b, v21.16b
+ mov v23.16b, v21.16b
+ mov v24.16b, v21.16b
+ b 8f
+
+7:
+ // All registers up to v21 are filled already, 3 valid rows left.
+ // < 4 valid rows left; fill in padding and filter the last
+ // few rows.
+ tst w6, #8 // LR_HAVE_BOTTOM
+ b.eq 71f
+ // LR_HAVE_BOTTOM; load 2 rows of edge.
+ ld1 {v22.8h}, [x2], x7
+ ld1 {v23.8h}, [x2], x7
+ mov v24.16b, v23.16b
+ b 8f
+71:
+ // !LR_HAVE_BOTTOM, pad 3 rows
+ mov v22.16b, v21.16b
+ mov v23.16b, v21.16b
+ mov v24.16b, v21.16b
+
+8: // At this point, all registers up to v22-v24 are loaded with
+ // edge/padding (depending on how many rows are left).
+ filter 0 // This branches to 9f when done
+ mov v22.16b, v23.16b
+ mov v23.16b, v24.16b
+ b 8b
+
+9: // End of one vertical slice.
+ subs w3, w3, #8
+ b.le 0f
+ // Move pointers back up to the top and loop horizontally.
+ msub x0, x1, x8, x0
+ msub x2, x7, x11, x2
+ add x0, x0, #16
+ add x2, x2, #16
+ mov w4, w8
+ b 1b
+
+0:
+ ret
+.purgem filter
+endfunc
+
+// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride,
+// const pixel *src, int w, int h);
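+// Rough control flow below: w is 1-7 and L(copy_narrow_tbl) dispatches to
+// one of the handlers 10-70, each copying h rows of exactly w pixels.
+// For w == 1, 2 and 4 a single vector load covers several rows, which are
+// then stored through two interleaved row pointers (x0/x7 with the stride
+// doubled); widths 3, 5, 6 and 7 copy one row at a time with scalar
+// loads and stores.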
+function copy_narrow_16bpc_neon, export=1
+ adr x5, L(copy_narrow_tbl)
+ ldrh w6, [x5, w3, uxtw #1]
+ sub x5, x5, w6, uxth
+ br x5
+10:
+ add x7, x0, x1
+ lsl x1, x1, #1
+18:
+ subs w4, w4, #8
+ b.lt 110f
+ ld1 {v0.8h}, [x2], #16
+ st1 {v0.h}[0], [x0], x1
+ st1 {v0.h}[1], [x7], x1
+ st1 {v0.h}[2], [x0], x1
+ st1 {v0.h}[3], [x7], x1
+ st1 {v0.h}[4], [x0], x1
+ st1 {v0.h}[5], [x7], x1
+ st1 {v0.h}[6], [x0], x1
+ st1 {v0.h}[7], [x7], x1
+ b.le 0f
+ b 18b
+110:
+ add w4, w4, #8
+ asr x1, x1, #1
+11:
+ subs w4, w4, #1
+ ld1 {v0.h}[0], [x2], #2
+ st1 {v0.h}[0], [x0], x1
+ b.gt 11b
+0:
+ ret
+
+20:
+ add x7, x0, x1
+ lsl x1, x1, #1
+24:
+ subs w4, w4, #4
+ b.lt 210f
+ ld1 {v0.4s}, [x2], #16
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[1], [x7], x1
+ st1 {v0.s}[2], [x0], x1
+ st1 {v0.s}[3], [x7], x1
+ b.le 0f
+ b 24b
+210:
+ add w4, w4, #4
+ asr x1, x1, #1
+22:
+ subs w4, w4, #1
+ ld1 {v0.s}[0], [x2], #4
+ st1 {v0.s}[0], [x0], x1
+ b.gt 22b
+0:
+ ret
+
+30:
+ ldr w5, [x2]
+ ldrh w6, [x2, #4]
+ add x2, x2, #6
+ subs w4, w4, #1
+ str w5, [x0]
+ strh w6, [x0, #4]
+ add x0, x0, x1
+ b.gt 30b
+ ret
+
+40:
+ add x7, x0, x1
+ lsl x1, x1, #1
+42:
+ subs w4, w4, #2
+ b.lt 41f
+ ld1 {v0.2d}, [x2], #16
+ st1 {v0.d}[0], [x0], x1
+ st1 {v0.d}[1], [x7], x1
+ b.le 0f
+ b 42b
+41:
+ ld1 {v0.4h}, [x2]
+ st1 {v0.4h}, [x0]
+0:
+ ret
+
+50:
+ ldr x5, [x2]
+ ldrh w6, [x2, #8]
+ add x2, x2, #10
+ subs w4, w4, #1
+ str x5, [x0]
+ strh w6, [x0, #8]
+ add x0, x0, x1
+ b.gt 50b
+ ret
+
+60:
+ ldr x5, [x2]
+ ldr w6, [x2, #8]
+ add x2, x2, #12
+ subs w4, w4, #1
+ str x5, [x0]
+ str w6, [x0, #8]
+ add x0, x0, x1
+ b.gt 60b
+ ret
+
+70:
+ ldr x5, [x2]
+ ldr w6, [x2, #8]
+ ldrh w7, [x2, #12]
+ add x2, x2, #14
+ subs w4, w4, #1
+ str x5, [x0]
+ str w6, [x0, #8]
+ strh w7, [x0, #12]
+ add x0, x0, x1
+ b.gt 70b
+ ret
+
+L(copy_narrow_tbl):
+ .hword 0
+ .hword L(copy_narrow_tbl) - 10b
+ .hword L(copy_narrow_tbl) - 20b
+ .hword L(copy_narrow_tbl) - 30b
+ .hword L(copy_narrow_tbl) - 40b
+ .hword L(copy_narrow_tbl) - 50b
+ .hword L(copy_narrow_tbl) - 60b
+ .hword L(copy_narrow_tbl) - 70b
+endfunc
+
+#define SUM_STRIDE (384+16)
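+// Row stride, in elements, of the intermediate sum/sumsq buffers shared
+// with looprestoration_tmpl.S. sumsq rows hold 32-bit and sum rows 16-bit
+// elements, hence the 4*SUM_STRIDE and 2*SUM_STRIDE byte offsets used below.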
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
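+// Roughly: for each pair of input rows this writes, per column, the
+// horizontal 3-tap sums of the source pixels (sum) and of their squares
+// (sumsq). w is widened by 2 below since the later box/neighbour passes
+// appear to need one extra column of sums on each side of the output area.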
+function sgr_box3_h_16bpc_neon, export=1
+ add w5, w5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add x10, x0, #(4*SUM_STRIDE) // sumsq
+ add x11, x1, #(2*SUM_STRIDE) // sum
+ add x12, x3, x4 // src
+ lsl x4, x4, #1
+ mov x9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ // With LR_HAVE_RIGHT, align to 8, without it, align to 4.
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 0f
+ // !LR_HAVE_RIGHT
+ add w13, w5, #3
+ bic w13, w13, #3
+ b 1f
+0:
+ add w13, w5, #7
+ bic w13, w13, #7
+1:
+ sub x9, x9, w13, uxtw #1
+
+ // Store the width for the vertical loop
+ mov w8, w5
+
+ // Subtract the number of pixels read from the input from the stride
+ add w13, w5, #14
+ bic w13, w13, #7
+ sub x4, x4, w13, uxtw #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 2f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #4
+ sub x12, x12, #4
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add x4, x4, #4
+
+
+1: // Loop vertically
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ ld1 {v16.8h, v17.8h}, [x12], #32
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 0f
+ cbz x2, 2f
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.d}[1], [x2], #8
+ // Move x3/x12 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ sub x12, x12, #4
+ ld1 {v18.d}[1], [x2], #8
+ ext v1.16b, v0.16b, v1.16b, #12
+ ext v0.16b, v2.16b, v0.16b, #12
+ ext v17.16b, v16.16b, v17.16b, #12
+ ext v16.16b, v18.16b, v16.16b, #12
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+ // and shift v0/v1 to have 2x the first pixel at the front.
+ dup v2.8h, v0.h[0]
+ dup v18.8h, v16.h[0]
+ // Move x3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ sub x12, x12, #4
+ ext v1.16b, v0.16b, v1.16b, #12
+ ext v0.16b, v2.16b, v0.16b, #12
+ ext v17.16b, v16.16b, v17.16b, #12
+ ext v16.16b, v18.16b, v16.16b, #12
+
+2:
+ umull v2.4s, v0.4h, v0.4h
+ umull2 v3.4s, v0.8h, v0.8h
+ umull v4.4s, v1.4h, v1.4h
+ umull v18.4s, v16.4h, v16.4h
+ umull2 v19.4s, v16.8h, v16.8h
+ umull v20.4s, v17.4h, v17.4h
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w5, #(2 + 16 - 2 + 1)
+ ldr h30, [x3, w13, sxtw #1]
+ ldr h31, [x12, w13, sxtw #1]
+ // Fill v30/v31 with the right padding pixel
+ dup v30.8h, v30.h[0]
+ dup v31.8h, v31.h[0]
+3: // !LR_HAVE_RIGHT
+ // If we'll have to pad the right edge we need to quit early here.
+ cmp w5, #10
+ b.ge 4f // If w >= 10, all used input pixels are valid
+ cmp w5, #6
+ b.ge 5f // If w >= 6, we can filter 4 pixels
+ b 6f
+
+4: // Loop horizontally
+.macro ext_n dst1, dst2, src1, src2, src3, n, w
+ ext \dst1, \src1, \src2, \n
+.if \w > 4
+ ext \dst2, \src2, \src3, \n
+.endif
+.endm
+.macro add_n dst1, dst2, src1, src2, src3, src4, w
+ add \dst1, \src1, \src3
+.if \w > 4
+ add \dst2, \src2, \src4
+.endif
+.endm
+
+.macro add3 w, wd
+ ext v24.16b, v0.16b, v1.16b, #2
+ ext v25.16b, v0.16b, v1.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #2
+ ext v27.16b, v16.16b, v17.16b, #4
+ add v6\wd, v0\wd, v24\wd
+ add v7\wd, v16\wd, v26\wd
+ add v6\wd, v6\wd, v25\wd
+ add v7\wd, v7\wd, v27\wd
+
+ ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w
+ ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w
+
+ add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w
+ add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w
+
+ ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w
+ ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w
+
+ add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w
+ add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w
+.endm
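+ // add3 computes, for 4 or 8 output columns at a time, the horizontal sums
+ // s[x] + s[x+1] + s[x+2] of the pixels (v6/v7, one register per input row)
+ // and of the squared pixels (v22-v23 and v24-v25), using ext to form the
+ // x+1 and x+2 shifted views of the pixel and square registers.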
+ add3 8, .8h
+ st1 {v6.8h}, [x1], #16
+ st1 {v7.8h}, [x11], #16
+ st1 {v22.4s,v23.4s}, [x0], #32
+ st1 {v24.4s,v25.4s}, [x10], #32
+
+ subs w5, w5, #8
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ mov v0.16b, v1.16b
+ mov v16.16b, v17.16b
+ ld1 {v1.8h}, [x3], #16
+ ld1 {v17.8h}, [x12], #16
+ mov v2.16b, v4.16b
+ umull2 v3.4s, v0.8h, v0.8h
+ umull v4.4s, v1.4h, v1.4h
+ mov v18.16b, v20.16b
+ umull2 v19.4s, v16.8h, v16.8h
+ umull v20.4s, v17.4h, v17.4h
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+5: // Produce 4 pixels, 6 <= w < 10
+ add3 4, .4h
+ st1 {v6.4h}, [x1], #8
+ st1 {v7.4h}, [x11], #8
+ st1 {v22.4s}, [x0], #16
+ st1 {v24.4s}, [x10], #16
+
+ subs w5, w5, #4 // 2 <= w < 6
+ ext v0.16b, v0.16b, v1.16b, #8
+ ext v16.16b, v16.16b, v17.16b, #8
+
+6: // Pad the right edge and produce the last few pixels.
+ // 2 <= w < 6, 2-5 pixels valid in v0
+ sub w13, w5, #2
+ // w13 = (pixels valid - 2)
+ adr x14, L(box3_variable_shift_tbl)
+ ldrh w13, [x14, w13, uxtw #1]
+ sub x13, x14, w13, uxth
+ br x13
+ // Shift v0 right, shifting out invalid pixels,
+ // shift v0 left to the original offset, shifting in padding pixels.
+22: // 2 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #4
+ ext v0.16b, v0.16b, v30.16b, #12
+ ext v16.16b, v16.16b, v31.16b, #12
+ b 88f
+33: // 3 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #6
+ ext v16.16b, v16.16b, v16.16b, #6
+ ext v0.16b, v0.16b, v30.16b, #10
+ ext v16.16b, v16.16b, v31.16b, #10
+ b 88f
+44: // 4 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #8
+ ext v16.16b, v16.16b, v16.16b, #8
+ ext v0.16b, v0.16b, v30.16b, #8
+ ext v16.16b, v16.16b, v31.16b, #8
+ b 88f
+55: // 5 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #10
+ ext v16.16b, v16.16b, v16.16b, #10
+ ext v0.16b, v0.16b, v30.16b, #6
+ ext v16.16b, v16.16b, v31.16b, #6
+ b 88f
+
+L(box3_variable_shift_tbl):
+ .hword L(box3_variable_shift_tbl) - 22b
+ .hword L(box3_variable_shift_tbl) - 33b
+ .hword L(box3_variable_shift_tbl) - 44b
+ .hword L(box3_variable_shift_tbl) - 55b
+
+88:
+ umull v2.4s, v0.4h, v0.4h
+ umull2 v3.4s, v0.8h, v0.8h
+ umull v18.4s, v16.4h, v16.4h
+ umull2 v19.4s, v16.8h, v16.8h
+
+ add3 4, .4h
+ subs w5, w5, #4
+ st1 {v6.4h}, [x1], #8
+ st1 {v7.4h}, [x11], #8
+ st1 {v22.4s}, [x0], #16
+ st1 {v24.4s}, [x10], #16
+ b.le 9f
+ ext v0.16b, v0.16b, v0.16b, #8
+ ext v16.16b, v16.16b, v16.16b, #8
+ mov v2.16b, v3.16b
+ mov v3.16b, v4.16b
+ mov v18.16b, v19.16b
+ mov v19.16b, v20.16b
+ // Only one needed pixel left, but do a normal 4 pixel
+ // addition anyway
+ add3 4, .4h
+ st1 {v6.4h}, [x1], #8
+ st1 {v7.4h}, [x11], #8
+ st1 {v22.4s}, [x0], #16
+ st1 {v24.4s}, [x10], #16
+
+9:
+ subs w6, w6, #2
+ b.le 0f
+ // Jump to the next row and loop horizontally
+ add x0, x0, x9, lsl #1
+ add x10, x10, x9, lsl #1
+ add x1, x1, x9
+ add x11, x11, x9
+ add x3, x3, x4
+ add x12, x12, x4
+ mov w5, w8
+ b 1b
+0:
+ ret
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_h_16bpc_neon, export=1
+ add w5, w5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add x10, x0, #(4*SUM_STRIDE) // sumsq
+ add x11, x1, #(2*SUM_STRIDE) // sum
+ add x12, x3, x4 // src
+ lsl x4, x4, #1
+ mov x9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ // With LR_HAVE_RIGHT, align to 8, without it, align to 4.
+ // Subtract the number of pixels read from the input from the stride.
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 0f
+ // !LR_HAVE_RIGHT
+ add w13, w5, #3
+ bic w13, w13, #3
+ add w14, w5, #13
+ b 1f
+0:
+ add w13, w5, #7
+ bic w13, w13, #7
+ add w14, w5, #15
+1:
+ sub x9, x9, w13, uxtw #1
+ bic w14, w14, #7
+ sub x4, x4, w14, uxtw #1
+
+ // Store the width for the vertical loop
+ mov w8, w5
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 2f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #6
+ sub x12, x12, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add x4, x4, #6
+
+1: // Loop vertically
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ ld1 {v16.8h, v17.8h}, [x12], #32
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 0f
+ cbz x2, 2f
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.d}[1], [x2], #8
+ // Move x3/x12 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ sub x12, x12, #6
+ ld1 {v18.d}[1], [x2], #8
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10
+ ext v17.16b, v16.16b, v17.16b, #10
+ ext v16.16b, v18.16b, v16.16b, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+ // and shift v0/v1 to have 3x the first pixel at the front.
+ dup v2.8h, v0.h[0]
+ dup v18.8h, v16.h[0]
+ // Move x3 back to account for the last 6 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ sub x12, x12, #6
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10
+ ext v17.16b, v16.16b, v17.16b, #10
+ ext v16.16b, v18.16b, v16.16b, #10
+
+2:
+ umull v2.4s, v0.4h, v0.4h
+ umull2 v3.4s, v0.8h, v0.8h
+ umull v4.4s, v1.4h, v1.4h
+ umull v18.4s, v16.4h, v16.4h
+ umull2 v19.4s, v16.8h, v16.8h
+ umull v20.4s, v17.4h, v17.4h
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w5, #(2 + 16 - 3 + 1)
+ ldr h30, [x3, w13, sxtw #1]
+ ldr h31, [x12, w13, sxtw #1]
+ // Fill v30/v31 with the right padding pixel
+ dup v30.8h, v30.h[0]
+ dup v31.8h, v31.h[0]
+3: // !LR_HAVE_RIGHT
+ // If we'll have to pad the right edge we need to quit early here.
+ cmp w5, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+ cmp w5, #7
+ b.ge 5f // If w >= 7, we can produce 4 pixels
+ b 6f
+
+4: // Loop horizontally
+.macro add5 w, wd
+ ext v24.16b, v0.16b, v1.16b, #2
+ ext v25.16b, v0.16b, v1.16b, #4
+ ext v26.16b, v0.16b, v1.16b, #6
+ ext v27.16b, v0.16b, v1.16b, #8
+
+ add v6\wd, v0\wd, v24\wd
+ add v25\wd, v25\wd, v26\wd
+ add v6\wd, v6\wd, v27\wd
+
+ ext v26.16b, v16.16b, v17.16b, #2
+ ext v27.16b, v16.16b, v17.16b, #4
+ ext v28.16b, v16.16b, v17.16b, #6
+ ext v29.16b, v16.16b, v17.16b, #8
+
+ add v7\wd, v16\wd, v26\wd
+ add v27\wd, v27\wd, v28\wd
+ add v7\wd, v7\wd, v29\wd
+ add v6\wd, v6\wd, v25\wd
+ add v7\wd, v7\wd, v27\wd
+
+ ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w
+ ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w
+ ext_n v28.16b, v29.16b, v2.16b, v3.16b, v4.16b, #12, \w
+
+ add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w
+ add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w
+ add_n v22.4s, v23.4s, v22.4s, v23.4s, v3.4s, v4.4s, \w
+ add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w
+
+ ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w
+ ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w
+ ext_n v28.16b, v29.16b, v18.16b, v19.16b, v20.16b, #12, \w
+
+ add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w
+ add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w
+ add_n v24.4s, v25.4s, v24.4s, v25.4s, v19.4s, v20.4s, \w
+ add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w
+.endm
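+ // add5 is the 5-tap counterpart of add3 above: v6/v7 receive the sums
+ // s[x] + ... + s[x+4] of the pixels for both rows, and v22-v23/v24-v25
+ // the matching sums of squared pixels, again built from ext-shifted views
+ // of the pixel and square registers.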
+ add5 8, .8h
+ st1 {v6.8h}, [x1], #16
+ st1 {v7.8h}, [x11], #16
+ st1 {v22.4s,v23.4s}, [x0], #32
+ st1 {v24.4s,v25.4s}, [x10], #32
+
+ subs w5, w5, #8
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ mov v0.16b, v1.16b
+ mov v16.16b, v17.16b
+ ld1 {v1.8h}, [x3], #16
+ ld1 {v17.8h}, [x12], #16
+ mov v2.16b, v4.16b
+ umull2 v3.4s, v0.8h, v0.8h
+ umull v4.4s, v1.4h, v1.4h
+ mov v18.16b, v20.16b
+ umull2 v19.4s, v16.8h, v16.8h
+ umull v20.4s, v17.4h, v17.4h
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+5: // Produce 4 pixels, 7 <= w < 11
+ add5 4, .4h
+ st1 {v6.4h}, [x1], #8
+ st1 {v7.4h}, [x11], #8
+ st1 {v22.4s}, [x0], #16
+ st1 {v24.4s}, [x10], #16
+
+ subs w5, w5, #4 // 3 <= w < 7
+ ext v0.16b, v0.16b, v1.16b, #8
+ ext v16.16b, v16.16b, v17.16b, #8
+
+6: // Pad the right edge and produce the last few pixels.
+ // w < 7, w+1 pixels valid in v0/v16
+ sub w13, w5, #1
+ // w13 = pixels valid - 2
+ adr x14, L(box5_variable_shift_tbl)
+ ldrh w13, [x14, w13, uxtw #1]
+ mov v1.16b, v30.16b
+ mov v17.16b, v31.16b
+ sub x13, x14, w13, uxth
+ br x13
+ // Shift v0 right, shifting out invalid pixels,
+ // shift v0 left to the original offset, shifting in padding pixels.
+22: // 2 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #4
+ ext v0.16b, v0.16b, v30.16b, #12
+ ext v16.16b, v16.16b, v31.16b, #12
+ b 88f
+33: // 3 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #6
+ ext v16.16b, v16.16b, v16.16b, #6
+ ext v0.16b, v0.16b, v30.16b, #10
+ ext v16.16b, v16.16b, v31.16b, #10
+ b 88f
+44: // 4 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #8
+ ext v16.16b, v16.16b, v16.16b, #8
+ ext v0.16b, v0.16b, v30.16b, #8
+ ext v16.16b, v16.16b, v31.16b, #8
+ b 88f
+55: // 5 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #10
+ ext v16.16b, v16.16b, v16.16b, #10
+ ext v0.16b, v0.16b, v30.16b, #6
+ ext v16.16b, v16.16b, v31.16b, #6
+ b 88f
+66: // 6 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #12
+ ext v0.16b, v0.16b, v30.16b, #4
+ ext v16.16b, v16.16b, v31.16b, #4
+ b 88f
+77: // 7 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #14
+ ext v16.16b, v16.16b, v16.16b, #14
+ ext v0.16b, v0.16b, v30.16b, #2
+ ext v16.16b, v16.16b, v31.16b, #2
+ b 88f
+
+L(box5_variable_shift_tbl):
+ .hword L(box5_variable_shift_tbl) - 22b
+ .hword L(box5_variable_shift_tbl) - 33b
+ .hword L(box5_variable_shift_tbl) - 44b
+ .hword L(box5_variable_shift_tbl) - 55b
+ .hword L(box5_variable_shift_tbl) - 66b
+ .hword L(box5_variable_shift_tbl) - 77b
+
+88:
+ umull v2.4s, v0.4h, v0.4h
+ umull2 v3.4s, v0.8h, v0.8h
+ umull v4.4s, v1.4h, v1.4h
+ umull v18.4s, v16.4h, v16.4h
+ umull2 v19.4s, v16.8h, v16.8h
+ umull v20.4s, v17.4h, v17.4h
+
+ add5 4, .4h
+ subs w5, w5, #4
+ st1 {v6.4h}, [x1], #8
+ st1 {v7.4h}, [x11], #8
+ st1 {v22.4s}, [x0], #16
+ st1 {v24.4s}, [x10], #16
+ b.le 9f
+ ext v0.16b, v0.16b, v1.16b, #8
+ ext v16.16b, v16.16b, v17.16b, #8
+ mov v2.16b, v3.16b
+ mov v3.16b, v4.16b
+ mov v18.16b, v19.16b
+ mov v19.16b, v20.16b
+ add5 4, .4h
+ st1 {v6.4h}, [x1], #8
+ st1 {v7.4h}, [x11], #8
+ st1 {v22.4s}, [x0], #16
+ st1 {v24.4s}, [x10], #16
+
+9:
+ subs w6, w6, #2
+ b.le 0f
+ // Jump to the next row and loop horizontally
+ add x0, x0, x9, lsl #1
+ add x10, x10, x9, lsl #1
+ add x1, x1, x9
+ add x11, x11, x9
+ add x3, x3, x4
+ add x12, x12, x4
+ mov w5, w8
+ b 1b
+0:
+ ret
+.purgem add5
+endfunc
+
+sgr_funcs 16