1 files changed, 1997 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/64/filmgrain16.S b/third_party/dav1d/src/arm/64/filmgrain16.S
new file mode 100644
index 0000000000..75252acfb1
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/filmgrain16.S
@@ -0,0 +1,1997 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1
+        lsr             w11, w2,  #3
+        lsr             w12, w2,  #12
+        lsr             w13, w2,  #1
+        eor             w11, w2,  w11                     // (r >> 0) ^ (r >> 3)
+        eor             w12, w12, w13                     // (r >> 12) ^ (r >> 1)
+        eor             w11, w11, w12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+        lsr             w2,  w2,  #\steps
+.endif
+        and             w11, w11, #((1 << \steps) - 1)    // bit
+.if \shift
+        orr             w2,  w2,  w11, lsl #(16 - \steps) // *state
+.else
+        orr             w2,  w2,  w11, lsl #16            // *state
+.endif
+.endm
+
+.macro read_rand dest, bits, age
+        ubfx            \dest,  x2,   #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+        ubfx            \dest,  x2,   #17 - \bits, #\bits
+        lsr             w2,  w2,  #1
+.endm
+
+// special calling convention:
+// w2 holds seed
+// x3 holds dav1d_gaussian_sequence
+// clobbers x11-x15
+// returns in v0.8h
+function get_gaussian_neon
+        increment_seed  4
+        read_rand       x14, 11,  3
+        read_rand       x15, 11,  2
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[0], [x14]
+        read_rand       x14, 11,  1
+        ld1             {v0.h}[1], [x15]
+        add             x14, x3,  x14, lsl #1
+        read_rand       x15, 11,  0
+        increment_seed  4
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[2], [x14]
+        read_rand       x14, 11,  3
+        ld1             {v0.h}[3], [x15]
+        add             x14, x3,  x14, lsl #1
+        read_rand       x15, 11,  2
+        ld1             {v0.h}[4], [x14]
+        add             x15, x3,  x15, lsl #1
+        read_rand       x14, 11,  1
+        ld1             {v0.h}[5], [x15]
+        read_rand       x15, 11,  0
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[6], [x14]
+        ld1             {v0.h}[7], [x15]
+        ret
+endfunc
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5
+        st1             {\r0\().16b,\r1\().16b}, [x0], #32
+        st1             {\r2\().16b,\r3\().16b}, [x0], #32
+        st1             {\r4\().16b},  [x0], #16
+        st1             {\r5\().h}[0], [x0], #2
+.endm
+
+function get_grain_2_neon
+        increment_seed  2
+        read_rand       x14, 11,  1
+        read_rand       x15, 11,  0
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[0], [x14]
+        ld1             {v0.h}[1], [x15]
+        srshl           v0.4h,   v0.4h,   v31.4h
+        ret
+endfunc
+
+.macro get_grain_2 dst
+        bl              get_grain_2_neon
+.ifnc \dst, v0
+        mov             \dst\().8b, v0.8b
+.endif
+.endm
+
+function get_grain_4_neon
+        increment_seed  4
+        read_rand       x14, 11,  3
+        read_rand       x15, 11,  2
+        add             x14, x3,  x14, lsl #1
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[0], [x14]
+        read_rand       x14, 11,  1
+        ld1             {v0.h}[1], [x15]
+        add             x14, x3,  x14, lsl #1
+        read_rand       x15, 11,  0
+        add             x15, x3,  x15, lsl #1
+        ld1             {v0.h}[2], [x14]
+        ld1             {v0.h}[3], [x15]
+        srshl           v0.4h,   v0.4h,   v31.4h
+        ret
+endfunc
+
+.macro get_grain_4 dst
+        bl              get_grain_4_neon
+.ifnc \dst, v0
+        mov             \dst\().8b, v0.8b
+.endif
+.endm
+
+// w15 holds the number of entries to produce
+// w14, w16 and w17 hold the previous output entries
+// v0 holds the vector of produced entries
+// v1 holds the input vector of sums from above
+.macro output_lag n
+function output_lag\n\()_neon
+1:
+        read_shift_rand x13, 11
+        mov             w11, v1.s[0]
+        ldrsh           w12, [x3, x13, lsl #1]
+        ext             v0.16b,  v0.16b,  v0.16b,  #2
+.if \n == 1
+        madd            w11, w14, w4,  w11        // sum (above) + *coeff * prev output
+.elseif \n == 2
+        madd            w11, w16, w4,  w11        // sum (above) + *coeff * prev output 1
+        madd            w11, w14, w17, w11        // += *coeff * prev output 2
+        mov             w16, w14
+.else
+        madd            w11, w17, w4,  w11        // sum (above) + *coeff * prev output 1
+        madd            w11, w16, w20, w11        // sum (above) + *coeff * prev output 2
+        madd            w11, w14, w21, w11        // += *coeff * prev output 3
+        mov             w17, w16
+        mov             w16, w14
+.endif
+        add             w14, w11, w8              // 1 << (ar_coeff_shift - 1)
+        add             w12, w12, w10             // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+        asr             w14, w14, w7              // >> ar_coeff_shift
+        asr             w12, w12, w9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
+        add             w14, w14, w12
+        cmp             w14, w5
+        csel            w14, w14, w5,  le
+        cmp             w14, w6
+        csel            w14, w14, w6,  ge
+        subs            w15, w15, #1
+        ext             v1.16b,  v1.16b,  v1.16b,  #4
+        ins             v0.h[7], w14
+        b.gt            1b
+        ret
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+function sum_lag1_above_neon
+        sub             x12, x0,  #1*GRAIN_WIDTH*2 - 16
+        ld1             {v18.8h}, [x12] // load top right
+
+        ext             v0.16b,  v16.16b, v17.16b, #14 // top left, top mid
+        ext             v1.16b,  v17.16b, v18.16b, #2  // top mid, top right
+
+        smull           v4.4s,   v17.4h,  v28.4h
+        smlal           v4.4s,   v0.4h,   v27.4h
+        smlal           v4.4s,   v1.4h,   v29.4h
+        smull2          v5.4s,   v17.8h,  v28.8h
+        smlal2          v5.4s,   v0.8h,   v27.8h
+        smlal2          v5.4s,   v1.8h,   v29.8h
+
+        mov             v16.16b, v17.16b
+        mov             v17.16b, v18.16b
+
+        ret
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
+        bl              sum_\lag\()_above_neon
+.ifc \type, uv_420
+        add             x12, x19, #GRAIN_WIDTH*2
+        ld1             {v22.8h, v23.8h}, [x19], #32
+        ld1             {v24.8h, v25.8h}, [x12]
+        addp            v22.8h,  v22.8h,  v23.8h
+        addp            v23.8h,  v24.8h,  v25.8h
+        add             v22.8h,  v22.8h,  v23.8h
+        srshr           v0.8h,   v22.8h,  #2
+.endif
+.ifc \type, uv_422
+        ld1             {v22.8h, v23.8h}, [x19], #32
+        addp            v22.8h,  v22.8h,  v23.8h
+        srshr           v0.8h,   v22.8h,  #1
+.endif
+.ifc \type, uv_444
+        ld1             {v0.8h}, [x19], #16
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+        dup             v1.8b,   \uv_coeff
+        sxtl            v1.8h,   v1.8b
+        smlal           v4.4s,   v0.4h,   v1.4h
+        smlal2          v5.4s,   v0.8h,   v1.8h
+.else
+        smlal           v4.4s,   v0.4h,   v30.4h
+        smlal2          v5.4s,   v0.8h,   v30.8h
+.endif
+.endif
+.if \uv_layout && \elems == 8
+        b               sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 7
+        b               sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 1
+        b               sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+.if \elems > 4
+.ifc \edge, left
+        increment_seed  4
+        read_rand       x12, 11,  3
+        read_rand       x13, 11,  2
+        read_rand       x14, 11,  1
+        add             x12, x3,  x12, lsl #1
+        add             x13, x3,  x13, lsl #1
+        add             x14, x3,  x14, lsl #1
+        ld1             {v0.h}[5], [x12]
+        ld1             {v0.h}[6], [x13]
+        ld1             {v0.h}[7], [x14]
+        lsl             x2,  x2,  #1             // shift back the state as if we'd done increment_seed with shift=0
+        srshl           v0.8h,   v0.8h,   v31.8h
+        ext             v4.16b,  v4.16b,  v4.16b,  #12
+.ifc \lag, lag3
+        smov            w17, v0.h[5]
+.endif
+.ifnc \lag, lag1
+        smov            w16, v0.h[6]
+.endif
+        smov            w14, v0.h[7]
+
+        mov             v1.16b,  v4.16b
+        mov             w15, #1
+        bl              output_\lag\()_neon
+.else
+        increment_seed  4, shift=0
+        mov             v1.16b,  v4.16b
+        mov             w15, #4
+        bl              output_\lag\()_neon
+.endif
+
+        increment_seed  4, shift=0
+        mov             v1.16b,  v5.16b
+.ifc \edge, right
+        mov             w15, #3
+        bl              output_\lag\()_neon
+        read_shift_rand x15, 11
+        add             x15, x3,  x15, lsl #1
+        ld1             {v1.h}[0], [x15]
+        srshl           v1.4h,   v1.4h,   v31.4h
+        ext             v0.16b,  v0.16b,  v1.16b,  #2
+.else
+        mov             w15, #4
+        bl              output_\lag\()_neon
+.endif
+.else
+        // elems == 1
+        increment_seed  4, shift=0
+        mov             v1.16b,  v4.16b
+        mov             w15, #1
+        bl              output_\lag\()_neon
+        lsr             w2,  w2,  #3
+
+        read_rand       x12, 11,  2
+        read_rand       x13, 11,  1
+        read_rand       x14, 11,  0
+        add             x12, x3,  x12, lsl #1
+        add             x13, x3,  x13, lsl #1
+        add             x14, x3,  x14, lsl #1
+        ld1             {v1.h}[0], [x12]
+        ld1             {v1.h}[1], [x13]
+        ld1             {v1.h}[2], [x14]
+        srshl           v1.4h,   v1.4h,   v31.4h
+        ext             v0.16b,  v0.16b,  v1.16b,  #14
+.endif
+        st1             {v0.8h}, [x0], #16
+        ldr             x30,     [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag1_\edge\()_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+.ifc \edge, left
+        sub             x12, x0,  #1*GRAIN_WIDTH*2
+        ld1             {v17.8h}, [x12] // load the previous block right above
+.endif
+        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems
+endfunc
+.endm
+
+sum_lag1_func y,      0,   left
+sum_lag1_func y,      0,   mid
+sum_lag1_func y,      0,   right, 7
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 7
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 1
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 1
+
+
+function sum_lag2_above_neon
+        sub             x12, x0,  #2*GRAIN_WIDTH*2 - 16
+        sub             x13, x0,  #1*GRAIN_WIDTH*2 - 16
+        ld1             {v18.8h}, [x12] // load top right
+        ld1             {v21.8h}, [x13]
+
+        dup             v26.8b,  v30.b[0]
+        ext             v22.16b, v16.16b, v17.16b, #12 // top left, top mid
+        dup             v27.8b,  v30.b[1]
+        ext             v23.16b, v16.16b, v17.16b, #14
+        sxtl            v26.8h,  v26.8b
+        dup             v28.8b,  v30.b[3]
+        ext             v0.16b,  v17.16b, v18.16b, #2  // top mid, top right
+        sxtl            v27.8h,  v27.8b
+        dup             v29.8b,  v30.b[4]
+        ext             v1.16b,  v17.16b, v18.16b, #4
+        sxtl            v28.8h,  v28.8b
+        sxtl            v29.8h,  v29.8b
+
+        smull           v4.4s,   v22.4h,  v26.4h
+        smlal           v4.4s,   v23.4h,  v27.4h
+        smlal           v4.4s,   v0.4h,   v28.4h
+        smlal           v4.4s,   v1.4h,   v29.4h
+        smull2          v5.4s,   v22.8h,  v26.8h
+        smlal2          v5.4s,   v23.8h,  v27.8h
+        smlal2          v5.4s,   v0.8h,   v28.8h
+        smlal2          v5.4s,   v1.8h,   v29.8h
+
+        dup             v26.16b, v30.b[5]
+        ext             v22.16b, v19.16b, v20.16b, #12 // top left, top mid
+        dup             v27.16b, v30.b[6]
+        ext             v23.16b, v19.16b, v20.16b, #14
+        sxtl            v26.8h,  v26.8b
+        dup             v28.16b, v30.b[8]
+        ext             v0.16b,  v20.16b, v21.16b, #2  // top mid, top right
+        sxtl            v27.8h,  v27.8b
+        dup             v29.16b, v30.b[9]
+        ext             v1.16b,  v20.16b, v21.16b, #4
+        sxtl            v28.8h,  v28.8b
+        sxtl            v29.8h,  v29.8b
+
+        smlal           v4.4s,   v22.4h,  v26.4h
+        smlal           v4.4s,   v23.4h,  v27.4h
+        smlal           v4.4s,   v0.4h,   v28.4h
+        smlal           v4.4s,   v1.4h,   v29.4h
+        smlal2          v5.4s,   v22.8h,  v26.8h
+        smlal2          v5.4s,   v23.8h,  v27.8h
+        smlal2          v5.4s,   v0.8h,   v28.8h
+        smlal2          v5.4s,   v1.8h,   v29.8h
+
+        dup             v26.16b, v30.b[2]
+        dup             v27.16b, v30.b[7]
+        sxtl            v26.8h,  v26.8b
+        sxtl            v27.8h,  v27.8b
+
+        smlal           v4.4s,   v17.4h,  v26.4h
+        smlal           v4.4s,   v20.4h,  v27.4h
+        smlal2          v5.4s,   v17.8h,  v26.8h
+        smlal2          v5.4s,   v20.8h,  v27.8h
+        mov             v16.16b, v17.16b
+        mov             v17.16b, v18.16b
+
+        mov             v19.16b, v20.16b
+        mov             v20.16b, v21.16b
+        ret
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag2_\edge\()_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+.ifc \edge, left
+        sub             x12, x0,  #2*GRAIN_WIDTH*2
+        sub             x13, x0,  #1*GRAIN_WIDTH*2
+        ld1             {v17.8h}, [x12] // load the previous block right above
+        ld1             {v20.8h}, [x13]
+.endif
+        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, v30.b[12]
+endfunc
+.endm
+
+sum_lag2_func y,      0,   left
+sum_lag2_func y,      0,   mid
+sum_lag2_func y,      0,   right, 7
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 7
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 1
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 1
+
+
+function sum_lag3_above_neon
+        sub             x11, x0,  #3*GRAIN_WIDTH*2 - 16
+        sub             x12, x0,  #2*GRAIN_WIDTH*2 - 16
+        sub             x13, x0,  #1*GRAIN_WIDTH*2 - 16
+        ld1             {v15.8h}, [x11] // load top right
+        ld1             {v18.8h}, [x12]
+        ld1             {v21.8h}, [x13]
+
+        dup             v22.8b,  v29.b[0]
+        ext             v8.16b,  v13.16b, v14.16b, #10 // top left, top mid
+        dup             v23.8b,  v29.b[1]
+        ext             v9.16b,  v13.16b, v14.16b, #12
+        sxtl            v22.8h,  v22.8b
+        dup             v24.8b,  v29.b[2]
+        sxtl            v23.8h,  v23.8b
+        dup             v25.8b,  v29.b[3]
+        ext             v10.16b, v13.16b, v14.16b, #14
+        sxtl            v24.8h,  v24.8b
+        dup             v26.8b,  v29.b[4]
+        ext             v11.16b, v14.16b, v15.16b, #2  // top mid, top right
+        sxtl            v25.8h,  v25.8b
+        dup             v27.8b,  v29.b[5]
+        ext             v12.16b, v14.16b, v15.16b, #4
+        sxtl            v26.8h,  v26.8b
+        dup             v28.8b,  v29.b[6]
+        ext             v13.16b, v14.16b, v15.16b, #6
+        sxtl            v27.8h,  v27.8b
+        sxtl            v28.8h,  v28.8b
+
+        smull           v4.4s,   v8.4h,   v22.4h
+        smlal           v4.4s,   v9.4h,   v23.4h
+        smlal           v4.4s,   v10.4h,  v24.4h
+        smlal           v4.4s,   v11.4h,  v26.4h
+        smlal           v4.4s,   v12.4h,  v27.4h
+        smlal           v4.4s,   v13.4h,  v28.4h
+        smlal           v4.4s,   v14.4h,  v25.4h
+        smull2          v5.4s,   v8.8h,   v22.8h
+        smlal2          v5.4s,   v9.8h,   v23.8h
+        smlal2          v5.4s,   v10.8h,  v24.8h
+        smlal2          v5.4s,   v11.8h,  v26.8h
+        smlal2          v5.4s,   v12.8h,  v27.8h
+        smlal2          v5.4s,   v13.8h,  v28.8h
+        smlal2          v5.4s,   v14.8h,  v25.8h
+
+        dup             v22.8b,  v29.b[7]
+        ext             v8.16b,  v16.16b, v17.16b, #10 // top left, top mid
+        dup             v23.8b,  v29.b[8]
+        ext             v9.16b,  v16.16b, v17.16b, #12
+        sxtl            v22.8h,  v22.8b
+        dup             v24.8b,  v29.b[9]
+        sxtl            v23.8h,  v23.8b
+        dup             v25.8b,  v29.b[10]
+        ext             v10.16b, v16.16b, v17.16b, #14
+        sxtl            v24.8h,  v24.8b
+        dup             v26.8b,  v29.b[11]
+        ext             v11.16b, v17.16b, v18.16b, #2  // top mid, top right
+        sxtl            v25.8h,  v25.8b
+        dup             v27.8b,  v29.b[12]
+        ext             v12.16b, v17.16b, v18.16b, #4
+        sxtl            v26.8h,  v26.8b
+        dup             v28.8b,  v29.b[13]
+        ext             v13.16b, v17.16b, v18.16b, #6
+        sxtl            v27.8h,  v27.8b
+        sxtl            v28.8h,  v28.8b
+
+        smlal           v4.4s,   v8.4h,   v22.4h
+        smlal           v4.4s,   v9.4h,   v23.4h
+        smlal           v4.4s,   v10.4h,  v24.4h
+        smlal           v4.4s,   v11.4h,  v26.4h
+        smlal           v4.4s,   v12.4h,  v27.4h
+        smlal           v4.4s,   v13.4h,  v28.4h
+        smlal           v4.4s,   v17.4h,  v25.4h
+        smlal2          v5.4s,   v8.8h,   v22.8h
+        smlal2          v5.4s,   v9.8h,   v23.8h
+        smlal2          v5.4s,   v10.8h,  v24.8h
+        smlal2          v5.4s,   v11.8h,  v26.8h
+        smlal2          v5.4s,   v12.8h,  v27.8h
+        smlal2          v5.4s,   v13.8h,  v28.8h
+        smlal2          v5.4s,   v17.8h,  v25.8h
+
+        dup             v22.8b,  v29.b[14]
+        ext             v8.16b,  v19.16b, v20.16b, #10 // top left, top mid
+        dup             v23.8b,  v29.b[15]
+        ext             v9.16b,  v19.16b, v20.16b, #12
+        sxtl            v22.8h,  v22.8b
+        dup             v24.8b,  v30.b[0]
+        sxtl            v23.8h,  v23.8b
+        dup             v25.8b,  v30.b[1]
+        ext             v10.16b, v19.16b, v20.16b, #14
+        sxtl            v24.8h,  v24.8b
+        dup             v26.8b,  v30.b[2]
+        ext             v11.16b, v20.16b, v21.16b, #2  // top mid, top right
+        sxtl            v25.8h,  v25.8b
+        dup             v27.8b,  v30.b[3]
+        ext             v12.16b, v20.16b, v21.16b, #4
+        sxtl            v26.8h,  v26.8b
+        dup             v28.8b,  v30.b[4]
+        ext             v13.16b, v20.16b, v21.16b, #6
+        sxtl            v27.8h,  v27.8b
+        sxtl            v28.8h,  v28.8b
+
+        smlal           v4.4s,   v8.4h,   v22.4h
+        smlal           v4.4s,   v9.4h,   v23.4h
+        smlal           v4.4s,   v10.4h,  v24.4h
+        smlal           v4.4s,   v11.4h,  v26.4h
+        smlal           v4.4s,   v12.4h,  v27.4h
+        smlal           v4.4s,   v13.4h,  v28.4h
+        smlal           v4.4s,   v20.4h,  v25.4h
+        mov             v16.16b, v17.16b
+        mov             v17.16b, v18.16b
+        smlal2          v5.4s,   v8.8h,   v22.8h
+        smlal2          v5.4s,   v9.8h,   v23.8h
+        smlal2          v5.4s,   v10.8h,  v24.8h
+        smlal2          v5.4s,   v11.8h,  v26.8h
+        smlal2          v5.4s,   v12.8h,  v27.8h
+        smlal2          v5.4s,   v13.8h,  v28.8h
+        smlal2          v5.4s,   v20.8h,  v25.8h
+
+        mov             v13.16b, v14.16b
+        mov             v14.16b, v15.16b
+
+        mov             v19.16b, v20.16b
+        mov             v20.16b, v21.16b
+        ret
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag3_\edge\()_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+.ifc \edge, left
+        sub             x11, x0,  #3*GRAIN_WIDTH*2
+        sub             x12, x0,  #2*GRAIN_WIDTH*2
+        sub             x13, x0,  #1*GRAIN_WIDTH*2
+        ld1             {v14.8h}, [x11] // load the previous block right above
+        ld1             {v17.8h}, [x12]
+        ld1             {v20.8h}, [x13]
+.endif
+        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, v30.b[8]
+endfunc
+.endm
+
+sum_lag3_func y,      0,   left
+sum_lag3_func y,      0,   mid
+sum_lag3_func y,      0,   right, 7
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 7
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 1
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 1
+
+function generate_grain_rows_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+1:
+        mov             w16, #80
+2:
+        bl              get_gaussian_neon
+        srshl           v0.8h,   v0.8h,   v31.8h
+        subs            w16, w16, #8
+        st1             {v0.8h}, [x0], #16
+        b.gt            2b
+        get_grain_2     v0
+        subs            w1,  w1,  #1
+        st1             {v0.s}[0], [x0], #4
+        b.gt            1b
+        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+endfunc
+
+function generate_grain_rows_44_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+1:
+        mov             w16, #40
+2:
+        bl              get_gaussian_neon
+        srshl           v0.8h,   v0.8h,   v31.8h
+        subs            w16, w16, #8
+        st1             {v0.8h}, [x0], #16
+        b.gt            2b
+        get_grain_4     v0
+        subs            w1,  w1,  #1
+        st1             {v0.4h}, [x0]
+        add             x0,  x0,  #GRAIN_WIDTH*2-80
+        b.gt            1b
+        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+endfunc
+
+function gen_grain_uv_444_lag0_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+        ld1             {v4.8h}, [x19], #16
+gen_grain_uv_lag0_8_start:
+        bl              get_gaussian_neon
+        srshl           v0.8h,   v0.8h,   v31.8h
+gen_grain_uv_lag0_8_add:
+        and             v4.16b,  v4.16b,  v1.16b
+        smull           v2.4s,   v4.4h,   v27.4h
+        smull2          v3.4s,   v4.8h,   v27.8h
+        srshl           v2.4s,   v2.4s,   v28.4s
+        srshl           v3.4s,   v3.4s,   v28.4s
+        sqxtn           v2.4h,   v2.4s
+        sqxtn2          v2.8h,   v3.4s
+        sqadd           v2.8h,   v2.8h,   v0.8h
+        smin            v2.8h,   v2.8h,   v25.8h
+        smax            v2.8h,   v2.8h,   v26.8h
+        st1             {v2.8h}, [x0], #16
+        ldr             x30, [sp], #16
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+endfunc
+
+function gen_grain_uv_420_lag0_8_neon
+        AARCH64_SIGN_LINK_REGISTER
+        add             x12, x19, #GRAIN_WIDTH*2
+        str             x30, [sp, #-16]!
+        ld1             {v16.8h, v17.8h}, [x19], #32
+        ld1             {v18.8h, v19.8h}, [x12]
+        addp            v16.8h,  v16.8h,  v17.8h
+        addp            v17.8h,  v18.8h,  v19.8h
+        add             v16.8h,  v16.8h,  v17.8h
+        srshr           v4.8h,   v16.8h,  #2
+        b               gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_422_lag0_8_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+        ld1             {v16.8h, v17.8h}, [x19], #32
+        addp            v16.8h,  v16.8h,  v17.8h
+        srshr           v4.8h,   v16.8h,  #1
+        b               gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_420_lag0_4_neon
+        add             x12, x19, #GRAIN_WIDTH*2
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+        ld1             {v16.4h, v17.4h}, [x19]
+        ld1             {v18.4h, v19.4h}, [x12]
+        add             x19,  x19,  #32
+        addp            v16.4h,  v16.4h,  v17.4h
+        addp            v17.4h,  v18.4h,  v19.4h
+        add             v16.4h,  v16.4h,  v17.4h
+        srshr           v4.4h,   v16.4h,  #2
+        get_grain_4     v0
+        b               gen_grain_uv_lag0_8_add
+endfunc
+
+function gen_grain_uv_422_lag0_4_neon
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-16]!
+        ld1             {v16.4h, v17.4h}, [x19]
+        add             x19,  x19,  #32
+        addp            v16.4h,  v16.4h,  v17.4h
+        srshr           v4.4h,   v16.4h,  #1
+        get_grain_4     v0
+        b               gen_grain_uv_lag0_8_add
+endfunc
+
+.macro gen_grain_82 type
+function generate_grain_\type\()_16bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        stp             x30, x19, [sp, #-96]!
+
+.ifc \type, uv_444
+        mov             w13, w3
+        mov             w14, #28
+        add             x19, x1,  #3*GRAIN_WIDTH*2
+        mov             x1,  x2
+        mul             w13, w13, w14
+        clz             w15, w4
+.else
+        clz             w15, w2
+.endif
+        movrel          x3,  X(gaussian_sequence)
+        sub             w15, w15, #24 // -bitdepth_min_8
+        ldr             w2,  [x1, #FGD_SEED]
+        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+        add             x4,  x1,  #FGD_AR_COEFFS_Y
+.else
+        add             x4,  x1,  #FGD_AR_COEFFS_UV
+.endif
+        add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
+        adr             x16, L(gen_grain_\type\()_tbl)
+        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
+        add             w9,  w9,  #4
+        ldrh            w17, [x16, w17, uxtw #1]
+        dup             v31.8h,  w9    // 4 - bitdepth_min_8 + data->grain_scale_shift
+        sub             x16, x16, w17, uxtw
+        neg             v31.8h,  v31.8h
+
+.ifc \type, uv_444
+        cmp             w13, #0
+        mov             w11, #0x49d8
+        mov             w14, #0xb524
+        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
+        csel            w11, w11, w14, ne
+.endif
+
+        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
+        neg             w15, w15            // bitdepth_min_8
+        mov             w8,  #1
+        mov             w10, #1
+        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
+        lsl             w10, w10, w9        // 1 << (4 + data->grain_scale_shift)
+        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
+        lsr             w10, w10, #1        // 1 << (4 + data->grain_scale_shift - 1)
+        mov             w5,  #128
+        lsl             w5,  w5,  w15       //   128 << bitdepth_min_8
+        neg             w6,  w5             // -(128 << bitpdeth_min_8)
+        sub             w5,  w5,  #1        //  (128 << bitdepth_min_8) - 1
+
+.ifc \type, uv_444
+        eor             w2,  w2,  w11
+.endif
+
+        br              x16
+
+L(generate_grain_\type\()_lag0):
+        AARCH64_VALID_JUMP_TARGET
+.ifc \type, y
+        mov             w1,  #GRAIN_HEIGHT
+        bl              generate_grain_rows_neon
+.else
+        dup             v28.4s,  w7
+        ld1r            {v27.8b}, [x4]      // ar_coeffs_uv[0]
+        movi            v0.16b,  #0
+        movi            v1.16b,  #255
+        dup             v25.8h,  w5
+        dup             v26.8h,  w6
+        ext             v29.16b, v0.16b,  v1.16b,  #10
+        ext             v30.16b, v1.16b,  v0.16b,  #2
+        neg             v28.4s,  v28.4s
+        sxtl            v27.8h,  v27.8b
+
+        mov             w1,  #3
+        bl              generate_grain_rows_neon
+        mov             w1,  #GRAIN_HEIGHT-3
+1:
+        mov             v1.16b,  v29.16b
+        bl              gen_grain_uv_444_lag0_neon // 8
+        movi            v1.16b,  #255
+        bl              gen_grain_uv_444_lag0_neon // 16
+        bl              gen_grain_uv_444_lag0_neon // 24
+        bl              gen_grain_uv_444_lag0_neon // 32
+        bl              gen_grain_uv_444_lag0_neon // 40
+        bl              gen_grain_uv_444_lag0_neon // 48
+        bl              gen_grain_uv_444_lag0_neon // 56
+        bl              gen_grain_uv_444_lag0_neon // 64
+        bl              gen_grain_uv_444_lag0_neon // 72
+        mov             v1.16b,  v30.16b
+        bl              gen_grain_uv_444_lag0_neon // 80
+        get_grain_2     v16
+        subs            w1,  w1,  #1
+        add             x19, x19, #4
+        st1             {v16.s}[0], [x0], #4
+        b.gt            1b
+.endif
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag1):
+        AARCH64_VALID_JUMP_TARGET
+        ld1r            {v27.8b}, [x4], #1  // ar_coeffs_y[0]
+        ld1r            {v28.8b}, [x4], #1  // ar_coeffs_y[1]
+        ld1r            {v29.8b}, [x4]      // ar_coeffs_y[2]
+.ifc \type, y
+        ldrsb           w4,  [x4, #1]       // ar_coeffs_y[3]
+.else
+        add             x4,  x4,  #2
+.endif
+
+        mov             w1,  #3
+.ifc \type, uv_444
+        ld1r            {v30.8b}, [x4]      // ar_coeffs_uv[4]
+        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
+.endif
+        bl              generate_grain_rows_neon
+        sxtl            v27.8h,  v27.8b
+        sxtl            v28.8h,  v28.8b
+        sxtl            v29.8h,  v29.8b
+.ifc \type, uv_444
+        sxtl            v30.8h,  v30.8b
+.endif
+
+        mov             w1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag1_left_neon  // 8
+        bl              sum_\type\()_lag1_mid_neon   // 16
+        bl              sum_\type\()_lag1_mid_neon   // 24
+        bl              sum_\type\()_lag1_mid_neon   // 32
+        bl              sum_\type\()_lag1_mid_neon   // 40
+        bl              sum_\type\()_lag1_mid_neon   // 48
+        bl              sum_\type\()_lag1_mid_neon   // 56
+        bl              sum_\type\()_lag1_mid_neon   // 64
+        bl              sum_\type\()_lag1_mid_neon   // 72
+        bl              sum_\type\()_lag1_right_neon // 80
+        get_grain_2     v16
+        subs            w1,  w1,  #1
+.ifc \type, uv_444
+        add             x19, x19, #4
+.endif
+        st1             {v16.s}[0], [x0], #4
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag2):
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v30.16b}, [x4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+        smov            w4,  v30.b[10]
+        smov            w17, v30.b[11]
+
+        mov             w1,  #3
+        bl              generate_grain_rows_neon
+
+        mov             w1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag2_left_neon  // 8
+        bl              sum_\type\()_lag2_mid_neon   // 16
+        bl              sum_\type\()_lag2_mid_neon   // 24
+        bl              sum_\type\()_lag2_mid_neon   // 32
+        bl              sum_\type\()_lag2_mid_neon   // 40
+        bl              sum_\type\()_lag2_mid_neon   // 48
+        bl              sum_\type\()_lag2_mid_neon   // 56
+        bl              sum_\type\()_lag2_mid_neon   // 64
+        bl              sum_\type\()_lag2_mid_neon   // 72
+        bl              sum_\type\()_lag2_right_neon // 80
+        get_grain_2     v16
+        subs            w1,  w1,  #1
+.ifc \type, uv_444
+        add             x19, x19, #4
+.endif
+        st1             {v16.s}[0], [x0], #4
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag3):
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+        stp             d8,  d9,  [sp, #16]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+        stp             d14, d15, [sp, #64]
+        stp             x20, x21, [sp, #80]
+
+        smov            w4,  v30.b[5]
+        smov            w20, v30.b[6]
+        smov            w21, v30.b[7]
+
+        mov             w1,  #3
+        bl              generate_grain_rows_neon
+
+        mov             w1,  #GRAIN_HEIGHT - 3
+1:
+        bl              sum_\type\()_lag3_left_neon  // 8
+        bl              sum_\type\()_lag3_mid_neon   // 16
+        bl              sum_\type\()_lag3_mid_neon   // 24
+        bl              sum_\type\()_lag3_mid_neon   // 32
+        bl              sum_\type\()_lag3_mid_neon   // 40
+        bl              sum_\type\()_lag3_mid_neon   // 48
+        bl              sum_\type\()_lag3_mid_neon   // 56
+        bl              sum_\type\()_lag3_mid_neon   // 64
+        bl              sum_\type\()_lag3_mid_neon   // 72
+        bl              sum_\type\()_lag3_right_neon // 80
+        get_grain_2     v16
+        subs            w1,  w1,  #1
+.ifc \type, uv_444
+        add             x19, x19, #4
+.endif
+        st1             {v16.s}[0], [x0], #4
+        b.gt            1b
+
+        ldp             x20, x21, [sp, #80]
+        ldp             d14, d15, [sp, #64]
+        ldp             d12, d13, [sp, #48]
+        ldp             d10, d11, [sp, #32]
+        ldp             d8,  d9,  [sp, #16]
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(gen_grain_\type\()_tbl):
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type
+.ifc \type, uv_420
+        mov             \dst,  #SUB_GRAIN_HEIGHT-3
+.else
+        mov             \dst,  #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+        add             \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
+.else
+        sub             \reg, \reg, #6*32-GRAIN_WIDTH*2
+.endif
+.endm
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_16bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        stp             x30, x19, [sp, #-96]!
+
+        mov             w13, w3
+        mov             w14, #28
+        add             x19, x1,  #(3*GRAIN_WIDTH-3)*2
+        mov             x1,  x2
+        mul             w13, w13, w14
+        clz             w15, w4
+
+        movrel          x3,  X(gaussian_sequence)
+        sub             w15, w15, #24 // -bitdepth_min_8
+        ldr             w2,  [x1, #FGD_SEED]
+        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
+        add             x4,  x1,  #FGD_AR_COEFFS_UV
+        add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
+        adr             x16, L(gen_grain_\type\()_tbl)
+        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
+        add             w9,  w9,  #4
+        ldrh            w17, [x16, w17, uxtw #1]
+        dup             v31.8h,  w9    // 4 - bitdepth_min_8 + data->grain_scale_shift
+        sub             x16, x16, w17, uxtw
+        neg             v31.8h,  v31.8h
+
+        cmp             w13, #0
+        mov             w11, #0x49d8
+        mov             w14, #0xb524
+        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
+        csel            w11, w11, w14, ne
+
+        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
+        neg             w15, w15            // bitdepth_min_8
+        mov             w8,  #1
+        mov             w10, #1
+        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
+        lsl             w10, w10, w9        // 1 << (4 + data->grain_scale_shift)
+        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
+        lsr             w10, w10, #1        // 1 << (4 + data->grain_scale_shift - 1)
+        mov             w5,  #128
+        lsl             w5,  w5,  w15       //   128 << bitdepth_min_8
+        neg             w6,  w5             // -(128 << bitpdeth_min_8)
+        sub             w5,  w5,  #1        //  (128 << bitdepth_min_8) - 1
+
+        eor             w2,  w2,  w11
+
+        br              x16
+
+L(generate_grain_\type\()_lag0):
+        AARCH64_VALID_JUMP_TARGET
+        dup             v28.4s,  w7
+        ld1r            {v27.8b}, [x4]      // ar_coeffs_uv[0]
+        movi            v0.16b,  #0
+        movi            v1.16b,  #255
+        dup             v25.8h,  w5
+        dup             v26.8h,  w6
+        ext             v29.16b, v0.16b,  v1.16b,  #10
+        ext             v30.16b, v1.16b,  v0.16b,  #14
+        neg             v28.4s,  v28.4s
+        sxtl            v27.8h,  v27.8b
+
+        mov             w1,  #3
+        bl              generate_grain_rows_44_neon
+        set_height      w1,  \type
+1:
+        mov             v1.16b,  v29.16b
+        bl              gen_grain_\type\()_lag0_8_neon // 8
+        movi            v1.16b,  #255
+        bl              gen_grain_\type\()_lag0_8_neon // 16
+        bl              gen_grain_\type\()_lag0_8_neon // 24
+        bl              gen_grain_\type\()_lag0_8_neon // 32
+        bl              gen_grain_\type\()_lag0_8_neon // 40
+        mov             v1.16b,  v30.16b
+        bl              gen_grain_\type\()_lag0_4_neon // 44
+        subs            w1,  w1,  #1
+        increment_y_ptr x19, \type
+        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag1):
+        AARCH64_VALID_JUMP_TARGET
+        ld1r            {v27.8b}, [x4], #1  // ar_coeffs_uv[0]
+        ld1r            {v28.8b}, [x4], #1  // ar_coeffs_uv[1]
+        ld1r            {v29.8b}, [x4]      // ar_coeffs_uv[2]
+        add             x4,  x4,  #2
+
+        mov             w1,  #3
+        ld1r            {v30.8b}, [x4]      // ar_coeffs_u4[4]
+        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
+        bl              generate_grain_rows_44_neon
+
+        sxtl            v27.8h,  v27.8b
+        sxtl            v28.8h,  v28.8b
+        sxtl            v29.8h,  v29.8b
+        sxtl            v30.8h,  v30.8b
+        set_height      w1,  \type
+1:
+        bl              sum_\type\()_lag1_left_neon  // 8
+        bl              sum_\type\()_lag1_mid_neon   // 16
+        bl              sum_\type\()_lag1_mid_neon   // 24
+        bl              sum_\type\()_lag1_mid_neon   // 32
+        bl              sum_\type\()_lag1_mid_neon   // 40
+        bl              sum_\type\()_lag1_right_neon // 44
+        subs            w1,  w1,  #1
+        increment_y_ptr x19, \type
+        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag2):
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v30.16b}, [x4]     // ar_coeffs_uv[0-12]
+
+        smov            w4,  v30.b[10]
+        smov            w17, v30.b[11]
+
+        mov             w1,  #3
+        bl              generate_grain_rows_44_neon
+
+        set_height      w1,  \type
+1:
+        bl              sum_\type\()_lag2_left_neon  // 8
+        bl              sum_\type\()_lag2_mid_neon   // 16
+        bl              sum_\type\()_lag2_mid_neon   // 24
+        bl              sum_\type\()_lag2_mid_neon   // 32
+        bl              sum_\type\()_lag2_mid_neon   // 40
+        bl              sum_\type\()_lag2_right_neon // 44
+        subs            w1,  w1,  #1
+        increment_y_ptr x19, \type
+        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
+        b.gt            1b
+
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(generate_grain_\type\()_lag3):
+        AARCH64_VALID_JUMP_TARGET
+        ldr             q29,      [x4]      // ar_coeffs_uv[0-15]
+        ldr             q30,      [x4, #16] // ar_coeffs_uv[16-24]
+        stp             d8,  d9,  [sp, #16]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+        stp             d14, d15, [sp, #64]
+        stp             x20, x21, [sp, #80]
+
+        smov            w4,  v30.b[5]
+        smov            w20, v30.b[6]
+        smov            w21, v30.b[7]
+
+        mov             w1,  #3
+        bl              generate_grain_rows_44_neon
+
+        set_height      w1,  \type
+1:
+        bl              sum_\type\()_lag3_left_neon  // 8
+        bl              sum_\type\()_lag3_mid_neon   // 16
+        bl              sum_\type\()_lag3_mid_neon   // 24
+        bl              sum_\type\()_lag3_mid_neon   // 32
+        bl              sum_\type\()_lag3_mid_neon   // 40
+        bl              sum_\type\()_lag3_right_neon // 44
+        subs            w1,  w1,  #1
+        increment_y_ptr x19, \type
+        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
+        b.gt            1b
+
+        ldp             x20, x21, [sp, #80]
+        ldp             d14, d15, [sp, #64]
+        ldp             d12, d13, [sp, #48]
+        ldp             d10, d11, [sp, #32]
+        ldp             d8,  d9,  [sp, #16]
+        ldp             x30, x19, [sp], #96
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(gen_grain_\type\()_tbl):
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, off
+        umov            w14, \src1[0]
+        umov            w15, \src2[1]
+        umov            w16, \src1[2]
+        add             x14, x14, x3
+        umov            w17, \src2[3]
+        add             x15, x15, x3
+        ld1             {\dst1}[0+\off], [x14]
+        umov            w14, \src1[4]
+        add             x16, x16, x3
+        ld1             {\dst2}[1+\off], [x15]
+        umov            w15, \src2[5]
+        add             x17, x17, x3
+        ld1             {\dst1}[2+\off], [x16]
+        umov            w16, \src1[6]
+        add             x14, x14, x3
+        ld1             {\dst2}[3+\off], [x17]
+        umov            w17, \src2[7]
+        add             x15, x15, x3
+        ld1             {\dst1}[4+\off], [x14]
+        add             x16, x16, x3
+        ld1             {\dst2}[5+\off], [x15]
+        add             x17, x17, x3
+        ld1             {\dst1}[6+\off], [x16]
+        ld1             {\dst2}[7+\off], [x17]
+.endm
+
+.macro gather dst1, dst2, src1, src2, src3, src4
+        gather_interleaved \dst1, \dst2, \src1, \src3, 0
+        gather_interleaved \dst2, \dst1, \src3, \src1, 0
+        gather_interleaved \dst1, \dst2, \src2, \src4, 8
+        gather_interleaved \dst2, \dst1, \src4, \src2, 8
+.endm
+
+function gather32_neon
+        gather          v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
+        ret
+endfunc
+
+function gather16_neon
+        gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
+        gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
+        ins             v6.d[1], v7.d[0]
+        ret
+endfunc
+
+const overlap_coeffs_0, align=4
+        .short 27, 17, 0,  0
+        .short 17, 27, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+        .short 23, 0,  0,  0
+        .short 22, 32, 32, 32
+endconst
+
+.macro calc_offset offx, offy, src, sx, sy
+        and             \offy, \src,  #0xF     // randval & 0xF
+        lsr             \offx, \src,  #4       // randval >> 4
+.if \sy == 0
+        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+        add             \offx, \offx, \offx    // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+        madd            \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+        add             \dst, \dst, \offx, uxtw #1 // grain_lut += offx
+.endm
+
+// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
+//                                 const ptrdiff_t stride,
+//                                 const uint8_t scaling[SCALING_SIZE],
+//                                 const int scaling_shift,
+//                                 const entry grain_lut[][GRAIN_WIDTH],
+//                                 const int offsets[][2],
+//                                 const int h, const ptrdiff_t clip,
+//                                 const ptrdiff_t type,
+//                                 const int bitdepth_max);
+function fgy_32x32_16bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30, [sp, #-80]!
+        stp             d8,  d9,  [sp, #16]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+        str             d14,      [sp, #64]
+        eor             w4,  w4,  #15          // 15 - scaling_shift
+        ldr             w11, [x6, #8]          // offsets[1][0]
+        ldr             w13, [x6, #4]          // offsets[0][1]
+        ldr             w15, [x6, #12]         // offsets[1][1]
+        ldr             w10, [sp, #96]         // bitdepth_max
+        ldr             w6,  [x6]              // offsets[0][0]
+        dup             v26.8h,  w10           // bitdepth_max
+        clz             w10, w10
+        ldr             w8,  [sp, #80]         // clip
+        sub             w10, w10, #24          // -bitdepth_min_8
+        mov             x9,  #GRAIN_WIDTH*2    // grain_lut stride
+        neg             w10, w10               // bitdepth_min_8
+
+        dup             v29.8h,  w4            // 15 - scaling_shift
+        dup             v27.8h,  w10           // bitdepth_min_8
+
+        movrel          x16, overlap_coeffs_0
+
+        cbz             w8,  1f
+        // clip
+        movi            v30.8h,  #16
+        movi            v31.8h,  #235
+        sshl            v30.8h,  v30.8h,  v27.8h
+        sshl            v31.8h,  v31.8h,  v27.8h
+        b               2f
+1:
+        // no clip
+        movi            v30.8h,  #0
+        mov             v31.16b, v26.16b       // bitdepth_max
+2:
+
+        ushr            v26.8h,  v26.8h,  #1   // grain_max
+        not             v25.16b, v26.16b       // grain_min
+
+        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs
+
+        add             x5,  x5,  #18          // grain_lut += 9
+        add             x5,  x5,  x9,  lsl #3  // grain_lut += 8 * grain_stride
+        add             x5,  x5,  x9           // grain_lut += grain_stride
+
+        calc_offset     w11, w12, w11, 0,  0
+        calc_offset     w13, w14, w13, 0,  0
+        calc_offset     w15, w16, w15, 0,  0
+        calc_offset     w6,  w10, w6,  0,  0
+
+        add_offset      x12, w11, x12, x5,  x9
+        add_offset      x14, w13, x14, x5,  x9
+        add_offset      x16, w15, x16, x5,  x9
+        add_offset      x5,  w6,  x10, x5,  x9
+
+        ldr             w11, [sp, #88]         // type
+        adr             x13, L(fgy_loop_tbl)
+
+        add             x4,  x12, #32*2        // grain_lut += FG_BLOCK_SIZE * bx
+        add             x6,  x14, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+
+        tst             w11, #1
+        ldrh            w11, [x13, w11, uxtw #1]
+
+        add             x8,  x16, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             x8,  x8,  #32*2        // grain_lut += FG_BLOCK_SIZE * bx
+
+        sub             x11, x13, w11, uxtw
+
+        b.eq            1f
+        // y overlap
+        dup             v8.8h,   v27.h[0]
+        dup             v9.8h,   v27.h[1]
+        mov             w10, w7                // backup actual h
+        mov             w7,  #2
+1:
+        br              x11
+endfunc
+
+function fgy_loop_neon
+.macro fgy ox, oy
+L(loop_\ox\oy):
+        AARCH64_VALID_JUMP_TARGET
+1:
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x1],  x2 // src
+.if \ox
+        ld1             {v20.4h},                         [x4],  x9 // grain_lut old
+.endif
+.if \oy
+        ld1             {v21.8h, v22.8h, v23.8h, v24.8h}, [x6],  x9 // grain_lut top
+.endif
+.if \ox && \oy
+        ld1             {v14.4h},                         [x8],  x9 // grain_lut top old
+.endif
+        mvni            v4.8h,   #0xf0, lsl #8 // 0x0fff
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5],  x9 // grain_lut
+
+        // Make sure that uninitialized pixels out of range past the right
+        // edge are in range; their actual values shouldn't matter.
+        and             v0.16b,  v0.16b,  v4.16b
+        and             v1.16b,  v1.16b,  v4.16b
+        and             v2.16b,  v2.16b,  v4.16b
+        and             v3.16b,  v3.16b,  v4.16b
+        bl              gather32_neon
+
+.if \ox
+        smull           v20.4s,  v20.4h,  v27.4h
+        smlal           v20.4s,  v16.4h,  v28.4h
+.endif
+
+.if \oy
+.if \ox
+        smull           v14.4s,  v14.4h,  v27.4h
+        smlal           v14.4s,  v21.4h,  v28.4h
+        sqrshrn         v20.4h,  v20.4s,  #5
+        sqrshrn         v14.4h,  v14.4s,  #5
+        smin            v20.4h,  v20.4h,  v26.4h
+        smin            v14.4h,  v14.4h,  v26.4h
+        smax            v20.4h,  v20.4h,  v25.4h
+        smax            v14.4h,  v14.4h,  v25.4h
+.endif
+
+.if \ox
+        smull           v10.4s,  v20.4h,  v9.4h
+.else
+        smull           v10.4s,  v16.4h,  v9.4h
+.endif
+        smull2          v11.4s,  v16.8h,  v9.8h
+        smull           v12.4s,  v17.4h,  v9.4h
+        smull2          v13.4s,  v17.8h,  v9.8h
+        smull           v16.4s,  v18.4h,  v9.4h
+        smull2          v17.4s,  v18.8h,  v9.8h
+        smull           v18.4s,  v19.4h,  v9.4h
+        smull2          v19.4s,  v19.8h,  v9.8h
+.if \ox
+        smlal           v10.4s,  v14.4h,  v8.4h
+.else
+        smlal           v10.4s,  v21.4h,  v8.4h
+.endif
+        smlal2          v11.4s,  v21.8h,  v8.8h
+        smlal           v12.4s,  v22.4h,  v8.4h
+        smlal2          v13.4s,  v22.8h,  v8.8h
+        smlal           v16.4s,  v23.4h,  v8.4h
+        smlal2          v17.4s,  v23.8h,  v8.8h
+        smlal           v18.4s,  v24.4h,  v8.4h
+        smlal2          v19.4s,  v24.8h,  v8.8h
+        sqrshrn         v10.4h,  v10.4s,  #5
+        sqrshrn2        v10.8h,  v11.4s,  #5
+        sqrshrn         v11.4h,  v12.4s,  #5
+        sqrshrn2        v11.8h,  v13.4s,  #5
+        sqrshrn         v12.4h,  v16.4s,  #5
+        sqrshrn2        v12.8h,  v17.4s,  #5
+        sqrshrn         v13.4h,  v18.4s,  #5
+        sqrshrn2        v13.8h,  v19.4s,  #5
+        smin            v16.8h,  v10.8h,  v26.8h
+        smin            v17.8h,  v11.8h,  v26.8h
+        smin            v18.8h,  v12.8h,  v26.8h
+        smin            v19.8h,  v13.8h,  v26.8h
+        smax            v16.8h,  v16.8h,  v25.8h
+        smax            v17.8h,  v17.8h,  v25.8h
+        smax            v18.8h,  v18.8h,  v25.8h
+        smax            v19.8h,  v19.8h,  v25.8h
+.endif
+
+        uxtl            v4.8h,   v6.8b            // scaling
+.if \ox && !\oy
+        sqrshrn         v20.4h,  v20.4s,  #5
+.endif
+        uxtl2           v5.8h,   v6.16b
+.if \ox && !\oy
+        smin            v20.4h,  v20.4h,  v26.4h
+.endif
+        uxtl            v6.8h,   v7.8b
+.if \ox && !\oy
+        smax            v20.4h,  v20.4h,  v25.4h
+.endif
+        uxtl2           v7.8h,   v7.16b
+.if \ox && !\oy
+        ins             v16.d[0], v20.d[0]
+.endif
+        ushl            v4.8h,   v4.8h,   v29.8h  // scaling << (15 - scaling_shift)
+        ushl            v5.8h,   v5.8h,   v29.8h
+        ushl            v6.8h,   v6.8h,   v29.8h
+        ushl            v7.8h,   v7.8h,   v29.8h
+
+        sqrdmulh        v20.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift) * grain, 15)
+        sqrdmulh        v21.8h,  v17.8h,  v5.8h
+        sqrdmulh        v22.8h,  v18.8h,  v6.8h
+        sqrdmulh        v23.8h,  v19.8h,  v7.8h
+
+        usqadd          v0.8h,   v20.8h           // *src + noise
+        usqadd          v1.8h,   v21.8h
+        usqadd          v2.8h,   v22.8h
+        usqadd          v3.8h,   v23.8h
+
+        umax            v0.8h,   v0.8h,   v30.8h
+        umax            v1.8h,   v1.8h,   v30.8h
+        umax            v2.8h,   v2.8h,   v30.8h
+        umax            v3.8h,   v3.8h,   v30.8h
+        umin            v0.8h,   v0.8h,   v31.8h
+        umin            v1.8h,   v1.8h,   v31.8h
+        umin            v2.8h,   v2.8h,   v31.8h
+        umin            v3.8h,   v3.8h,   v31.8h
+
+        subs            w7,  w7,  #1
+.if \oy
+        dup             v8.8h,   v28.h[0]
+        dup             v9.8h,   v28.h[1]
+.endif
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x2 // dst
+        b.gt            1b
+
+.if \oy
+        cmp             w10, #2
+        sub             w7,  w10, #2           // restore actual remaining h
+        b.gt            L(loop_\ox\()0)
+.endif
+        ldr             d14,      [sp, #64]
+        ldp             d12, d13, [sp, #48]
+        ldp             d10, d11, [sp, #32]
+        ldp             d8,  d9,  [sp, #16]
+        ldr             x30, [sp], #80
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+.endm
+
+        fgy             0, 0
+        fgy             0, 1
+        fgy             1, 0
+        fgy             1, 1
+
+L(fgy_loop_tbl):
+        .hword L(fgy_loop_tbl) - L(loop_00)
+        .hword L(fgy_loop_tbl) - L(loop_01)
+        .hword L(fgy_loop_tbl) - L(loop_10)
+        .hword L(fgy_loop_tbl) - L(loop_11)
+endfunc
+
+// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
+//                                      const pixel *const src,
+//                                      const ptrdiff_t stride,
+//                                      const uint8_t scaling[SCALING_SIZE],
+//                                      const Dav1dFilmGrainData *const data,
+//                                      const entry grain_lut[][GRAIN_WIDTH],
+//                                      const pixel *const luma_row,
+//                                      const ptrdiff_t luma_stride,
+//                                      const int offsets[][2],
+//                                      const ptrdiff_t h, const ptrdiff_t uv,
+//                                      const ptrdiff_t is_id,
+//                                      const ptrdiff_t type,
+//                                      const int bitdepth_max);
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_16bpc_neon, export=1
+        AARCH64_SIGN_LINK_REGISTER
+        str             x30,      [sp, #-80]!
+        stp             d8,  d9,  [sp, #16]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+        stp             d14, d15, [sp, #64]
+
+        ldp             x8,  x9,  [sp, #80]    // offsets, h
+        ldp             x10, x11, [sp, #96]    // uv, is_id
+        ldr             w16,      [sp, #120]   // bitdepth_max
+
+        ldr             w13, [x4, #FGD_SCALING_SHIFT]
+        ldr             w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+        dup             v23.8h,  w16           // bitdepth_max
+        clz             w16, w16
+        eor             w13, w13, #15          // 15 - scaling_shift
+        sub             w16, w16, #24          // -bitdepth_min_8
+
+        // !csfl
+        add             x10, x4,  x10, lsl #2  // + 4*uv
+        add             x14, x10, #FGD_UV_LUMA_MULT
+        add             x15, x10, #FGD_UV_MULT
+        add             x10, x10, #FGD_UV_OFFSET
+        neg             w16, w16               // bitdepth_min_8
+        ld1r            {v8.8h},  [x14]        // uv_luma_mult
+        ld1r            {v24.8h}, [x10]        // uv_offset
+        ld1r            {v9.8h},  [x15]        // uv_mult
+
+        dup             v29.8h,  w13           // 15 - scaling_shift
+        dup             v27.8h,  w16           // bitdepth_min_8
+
+        cbz             w12, 1f
+        // clip
+        movi            v30.8h,  #16
+        movi            v31.8h,  #240
+        sshl            v30.8h,  v30.8h,  v27.8h
+        sshl            v31.8h,  v31.8h,  v27.8h
+        cbz             w11, 2f
+        // is_id
+        movi            v31.8h,  #235
+        sshl            v31.8h,  v31.8h,  v27.8h
+        b               2f
+1:
+        // no clip
+        movi            v30.8h,  #0
+        mov             v31.16b, v23.16b       // bitdepth_max
+2:
+
+        ushr            v15.8h,  v23.8h,  #1   // grain_max
+        sshl            v24.8h,  v24.8h,  v27.8h // uv_offset << bitdepth_min_8
+        not             v14.16b, v15.16b       // grain_min
+
+        ldr             w12, [x8, #8]          // offsets[1][0]
+        ldr             w14, [x8, #4]          // offsets[0][1]
+        ldr             w16, [x8, #12]         // offsets[1][1]
+        ldr             w8,  [x8]              // offsets[0][0]
+
+        mov             x10, #GRAIN_WIDTH*2    // grain_lut stride
+
+        add             x5,  x5,  #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
+.if \sy
+        add             x5,  x5,  x10, lsl #2  // grain_lut += 4 * grain_stride
+        add             x5,  x5,  x10, lsl #1  // grain_lut += 2 * grain_stride
+.else
+        add             x5,  x5,  x10, lsl #3  // grain_lut += 8 * grain_stride
+        add             x5,  x5,  x10          // grain_lut += grain_stride
+.endif
+
+        calc_offset     w12, w13, w12, \sx, \sy
+        calc_offset     w14, w15, w14, \sx, \sy
+        calc_offset     w16, w17, w16, \sx, \sy
+        calc_offset     w8,  w11, w8,  \sx, \sy
+
+        add_offset      x13, w12, x13, x5,  x10
+        add_offset      x15, w14, x15, x5,  x10
+        add_offset      x17, w16, x17, x5,  x10
+        add_offset      x5,  w8,  x11, x5,  x10
+
+        add             x4,  x13, #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx
+        add             x8,  x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             x11, x11, #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx
+
+        ldr             w13, [sp, #112]        // type
+
+        movrel          x16, overlap_coeffs_\sx
+        adr             x14, L(fguv_loop_sx\sx\()_tbl)
+
+        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs
+        tst             w13, #1
+        ldrh            w13, [x14, w13, uxtw #1]
+
+        b.eq            1f
+        // y overlap
+        sub             w12, w9,  #(2 >> \sy)  // backup remaining h
+        mov             w9,  #(2 >> \sy)
+
+1:
+        sub             x13, x14, w13, uxtw
+
+.if \sy
+        movi            v25.8h,  #23
+        movi            v26.8h,  #22
+.else
+        movi            v25.8h,  #27
+        movi            v26.8h,  #17
+.endif
+
+.if \sy
+        add             x7,  x7,  x7           // luma_stride *= 2
+.endif
+
+        br              x13
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+        AARCH64_VALID_JUMP_TARGET
+1:
+.if \ox
+        ld1             {v4.4h}, [x4],  x10  // grain_lut old
+.endif
+.if \oy
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x8],  x10 // grain_lut top
+.endif
+.if \ox && \oy
+        ld1             {v5.4h}, [x11], x10  // grain_lut top old
+.endif
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5],  x10 // grain_lut
+
+.if \ox
+        smull           v4.4s,   v4.4h,   v27.4h
+        smlal           v4.4s,   v16.4h,  v28.4h
+.endif
+
+.if \oy
+.if \ox
+        smull           v5.4s,   v5.4h,   v27.4h
+        smlal           v5.4s,   v0.4h,   v28.4h
+        sqrshrn         v4.4h,   v4.4s,   #5
+        sqrshrn         v5.4h,   v5.4s,   #5
+        smin            v4.4h,   v4.4h,   v15.4h
+        smin            v5.4h,   v5.4h,   v15.4h
+        smax            v4.4h,   v4.4h,   v14.4h
+        smax            v5.4h,   v5.4h,   v14.4h
+        ins             v16.d[0], v4.d[0]
+        ins             v0.d[0],  v5.d[0]
+.endif
+
+        smull           v6.4s,   v16.4h,  v26.4h
+        smull2          v7.4s,   v16.8h,  v26.8h
+        smull           v10.4s,  v17.4h,  v26.4h
+        smull2          v11.4s,  v17.8h,  v26.8h
+        smull           v16.4s,  v18.4h,  v26.4h
+        smull2          v17.4s,  v18.8h,  v26.8h
+        smull           v18.4s,  v19.4h,  v26.4h
+        smull2          v19.4s,  v19.8h,  v26.8h
+        smlal           v6.4s,   v0.4h,   v25.4h
+        smlal2          v7.4s,   v0.8h,   v25.8h
+        smlal           v10.4s,  v1.4h,   v25.4h
+        smlal2          v11.4s,  v1.8h,   v25.8h
+        smlal           v16.4s,  v2.4h,   v25.4h
+        smlal2          v17.4s,  v2.8h,   v25.8h
+        smlal           v18.4s,  v3.4h,   v25.4h
+        smlal2          v19.4s,  v3.8h,   v25.8h
+        sqrshrn         v6.4h,   v6.4s,   #5
+        sqrshrn2        v6.8h,   v7.4s,   #5
+        sqrshrn         v7.4h,   v10.4s,  #5
+        sqrshrn2        v7.8h,   v11.4s,  #5
+        sqrshrn         v10.4h,  v16.4s,  #5
+        sqrshrn2        v10.8h,  v17.4s,  #5
+        sqrshrn         v11.4h,  v18.4s,  #5
+        sqrshrn2        v11.8h,  v19.4s,  #5
+.endif
+
+.if \ox && !\oy
+        sqrshrn         v4.4h,   v4.4s,   #5
+        smin            v4.4h,   v4.4h,   v15.4h
+.endif
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x6],  x7 // luma
+.if \oy
+        smin            v16.8h,  v6.8h,   v15.8h
+        smin            v17.8h,  v7.8h,   v15.8h
+        smin            v18.8h,  v10.8h,  v15.8h
+        smin            v19.8h,  v11.8h,  v15.8h
+        smax            v16.8h,  v16.8h,  v14.8h
+        smax            v17.8h,  v17.8h,  v14.8h
+        smax            v18.8h,  v18.8h,  v14.8h
+        smax            v19.8h,  v19.8h,  v14.8h
+.endif
+
+.if \ox && !\oy
+        smax            v4.4h,   v4.4h,   v14.4h
+.endif
+        ld1             {v10.8h, v11.8h, v12.8h, v13.8h}, [x1],  x2 // src
+.if \ox && !\oy
+        ins             v16.d[0], v4.d[0]
+.endif
+
+.if !\csfl
+        smull           v4.4s,   v0.4h,   v8.4h
+        smull2          v5.4s,   v0.8h,   v8.8h
+        smull           v6.4s,   v1.4h,   v8.4h
+        smull2          v7.4s,   v1.8h,   v8.8h
+        smull           v0.4s,   v2.4h,   v8.4h
+        smull2          v1.4s,   v2.8h,   v8.8h
+        smull           v2.4s,   v3.4h,   v8.4h
+        smull2          v3.4s,   v3.8h,   v8.8h
+        smlal           v4.4s,   v10.4h,  v9.4h
+        smlal2          v5.4s,   v10.8h,  v9.8h
+        smlal           v6.4s,   v11.4h,  v9.4h
+        smlal2          v7.4s,   v11.8h,  v9.8h
+        smlal           v0.4s,   v12.4h,  v9.4h
+        smlal2          v1.4s,   v12.8h,  v9.8h
+        smlal           v2.4s,   v13.4h,  v9.4h
+        smlal2          v3.4s,   v13.8h,  v9.8h
+        shrn            v4.4h,   v4.4s,   #6
+        shrn2           v4.8h,   v5.4s,   #6
+        shrn            v5.4h,   v6.4s,   #6
+        shrn2           v5.8h,   v7.4s,   #6
+        shrn            v6.4h,   v0.4s,   #6
+        shrn2           v6.8h,   v1.4s,   #6
+        shrn            v7.4h,   v2.4s,   #6
+        shrn2           v7.8h,   v3.4s,   #6
+        add             v0.8h,   v4.8h,   v24.8h
+        add             v1.8h,   v5.8h,   v24.8h
+        add             v2.8h,   v6.8h,   v24.8h
+        add             v3.8h,   v7.8h,   v24.8h
+        movi            v20.8h,  #0
+        smin            v0.8h,   v0.8h,   v23.8h
+        smin            v1.8h,   v1.8h,   v23.8h
+        smin            v2.8h,   v2.8h,   v23.8h
+        smin            v3.8h,   v3.8h,   v23.8h
+        smax            v0.8h,   v0.8h,   v20.8h
+        smax            v1.8h,   v1.8h,   v20.8h
+        smax            v2.8h,   v2.8h,   v20.8h
+        smax            v3.8h,   v3.8h,   v20.8h
+.else
+        // Make sure that uninitialized pixels out of range past the right
+        // edge are in range; their actual values shouldn't matter.
+        and             v0.16b,  v0.16b,  v23.16b
+        and             v1.16b,  v1.16b,  v23.16b
+        and             v2.16b,  v2.16b,  v23.16b
+        and             v3.16b,  v3.16b,  v23.16b
+.endif
+
+        bl              gather32_neon
+
+        uxtl            v4.8h,   v6.8b            // scaling
+        uxtl2           v5.8h,   v6.16b
+        uxtl            v6.8h,   v7.8b
+        uxtl2           v7.8h,   v7.16b
+
+        ushl            v4.8h,   v4.8h,   v29.8h  // scaling << (15 - scaling_shift)
+        ushl            v5.8h,   v5.8h,   v29.8h
+        ushl            v6.8h,   v6.8h,   v29.8h
+        ushl            v7.8h,   v7.8h,   v29.8h
+
+        sqrdmulh        v16.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift) * grain, 15)
+        sqrdmulh        v17.8h,  v17.8h,  v5.8h
+        sqrdmulh        v18.8h,  v18.8h,  v6.8h
+        sqrdmulh        v19.8h,  v19.8h,  v7.8h
+
+        usqadd          v10.8h,  v16.8h           // *src + noise
+        usqadd          v11.8h,  v17.8h
+        usqadd          v12.8h,  v18.8h
+        usqadd          v13.8h,  v19.8h
+
+        umax            v0.8h,   v10.8h,  v30.8h
+        umax            v1.8h,   v11.8h,  v30.8h
+        umax            v2.8h,   v12.8h,  v30.8h
+        umax            v3.8h,   v13.8h,  v30.8h
+        umin            v0.8h,   v0.8h,   v31.8h
+        umin            v1.8h,   v1.8h,   v31.8h
+        umin            v2.8h,   v2.8h,   v31.8h
+        umin            v3.8h,   v3.8h,   v31.8h
+
+        subs            w9,  w9,  #1
+.if \oy
+        dup             v25.8h,  v28.h[0]
+        dup             v26.8h,  v28.h[1]
+.endif
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x2 // dst
+        b.gt            1b
+
+.if \oy
+        cmp             w12, #0
+        mov             w9,  w12               // restore actual remaining h
+        b.gt            L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
+.endif
+        b               9f
+.endm
+        fguv_loop_sx0   0, 0, 0
+        fguv_loop_sx0   0, 0, 1
+        fguv_loop_sx0   0, 1, 0
+        fguv_loop_sx0   0, 1, 1
+        fguv_loop_sx0   1, 0, 0
+        fguv_loop_sx0   1, 0, 1
+        fguv_loop_sx0   1, 1, 0
+        fguv_loop_sx0   1, 1, 1
+
+9:
+        ldp             d14, d15, [sp, #64]
+        ldp             d12, d13, [sp, #48]
+        ldp             d10, d11, [sp, #32]
+        ldp             d8,  d9,  [sp, #16]
+        ldr             x30,      [sp], #80
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(fguv_loop_sx0_tbl):
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
+        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
+endfunc
+
+function fguv_loop_sx1_neon
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+        AARCH64_VALID_JUMP_TARGET
+1:
+.if \ox
+        ld1             {v18.4h}, [x4],  x10  // grain_lut old
+.endif
+.if \oy
+        ld1             {v20.8h, v21.8h},  [x8],  x10 // grain_lut top
+.endif
+.if \ox && \oy
+        ld1             {v19.4h}, [x11], x10  // grain_lut top old
+.endif
+        ld1             {v16.8h, v17.8h}, [x5],  x10 // grain_lut
+
+.if \ox
+        smull           v18.4s,  v18.4h,  v27.4h
+        smlal           v18.4s,  v16.4h,  v28.4h
+.endif
+
+.if \oy
+.if \ox
+        smull           v19.4s,  v19.4h,  v27.4h
+        smlal           v19.4s,  v20.4h,  v28.4h
+        sqrshrn         v18.4h,  v18.4s,  #5
+        sqrshrn         v19.4h,  v19.4s,  #5
+        smin            v18.4h,  v18.4h,  v15.4h
+        smin            v19.4h,  v19.4h,  v15.4h
+        smax            v18.4h,  v18.4h,  v14.4h
+        smax            v19.4h,  v19.4h,  v14.4h
+        ins             v16.d[0], v18.d[0]
+        ins             v20.d[0], v19.d[0]
+.endif
+
+        smull           v0.4s,   v16.4h,  v26.4h
+        smull2          v1.4s,   v16.8h,  v26.8h
+        smull           v2.4s,   v17.4h,  v26.4h
+        smull2          v3.4s,   v17.8h,  v26.8h
+        smlal           v0.4s,   v20.4h,  v25.4h
+        smlal2          v1.4s,   v20.8h,  v25.8h
+        smlal           v2.4s,   v21.4h,  v25.4h
+        smlal2          v3.4s,   v21.8h,  v25.8h
+        sqrshrn         v16.4h,  v0.4s,   #5
+        sqrshrn2        v16.8h,  v1.4s,   #5
+        sqrshrn         v17.4h,  v2.4s,   #5
+        sqrshrn2        v17.8h,  v3.4s,   #5
+.endif
+
+.if \ox && !\oy
+        sqrshrn         v18.4h,  v18.4s,  #5
+        smin            v18.4h,  v18.4h,  v15.4h
+.endif
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x6],  x7 // luma
+.if \oy
+        smin            v16.8h,  v16.8h,  v15.8h
+        smin            v17.8h,  v17.8h,  v15.8h
+        smax            v16.8h,  v16.8h,  v14.8h
+        smax            v17.8h,  v17.8h,  v14.8h
+.endif
+
+.if \ox && !\oy
+        smax            v18.4h,  v18.4h,  v14.4h
+.endif
+        ld1             {v10.8h, v11.8h},  [x1],  x2 // src
+.if \ox && !\oy
+        ins             v16.d[0], v18.d[0]
+.endif
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v1.8h,   v2.8h,   v3.8h
+        urshr           v0.8h,   v0.8h,   #1
+        urshr           v1.8h,   v1.8h,   #1
+.if !\csfl
+        smull           v2.4s,   v0.4h,   v8.4h
+        smull2          v3.4s,   v0.8h,   v8.8h
+        smull           v0.4s,   v1.4h,   v8.4h
+        smull2          v1.4s,   v1.8h,   v8.8h
+        smlal           v2.4s,   v10.4h,  v9.4h
+        smlal2          v3.4s,   v10.8h,  v9.8h
+        smlal           v0.4s,   v11.4h,  v9.4h
+        smlal2          v1.4s,   v11.8h,  v9.8h
+        shrn            v2.4h,   v2.4s,   #6
+        shrn2           v2.8h,   v3.4s,   #6
+        shrn            v3.4h,   v0.4s,   #6
+        shrn2           v3.8h,   v1.4s,   #6
+        add             v0.8h,   v2.8h,   v24.8h
+        add             v1.8h,   v3.8h,   v24.8h
+        movi            v2.8h,   #0
+        smin            v0.8h,   v0.8h,   v23.8h
+        smin            v1.8h,   v1.8h,   v23.8h
+        smax            v0.8h,   v0.8h,   v2.8h
+        smax            v1.8h,   v1.8h,   v2.8h
+.else
+        // Make sure that uninitialized pixels out of range past the right
+        // edge are in range; their actual values shouldn't matter.
+        and             v0.16b,  v0.16b,  v23.16b
+        and             v1.16b,  v1.16b,  v23.16b
+.endif
+
+        bl              gather16_neon
+
+        uxtl            v4.8h,   v6.8b            // scaling
+        uxtl2           v5.8h,   v6.16b
+
+        ushl            v4.8h,   v4.8h,   v29.8h  // scaling << (15 - scaling_shift)
+        ushl            v5.8h,   v5.8h,   v29.8h
+
+        sqrdmulh        v16.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift) * grain, 15)
+        sqrdmulh        v17.8h,  v17.8h,  v5.8h
+
+        usqadd          v10.8h,  v16.8h           // *src + noise
+        usqadd          v11.8h,  v17.8h
+
+        umax            v0.8h,   v10.8h,  v30.8h
+        umax            v1.8h,   v11.8h,  v30.8h
+        umin            v0.8h,   v0.8h,   v31.8h
+        umin            v1.8h,   v1.8h,   v31.8h
+
+.if \oy
+        mov             v16.16b, v25.16b
+.endif
+        subs            w9,  w9,  #1
+.if \oy
+        mov             v25.16b, v26.16b
+        mov             v26.16b, v16.16b
+.endif
+        st1             {v0.8h, v1.8h},  [x0], x2 // dst
+        b.gt            1b
+
+.if \oy
+        cmp             w12, #0
+        mov             w9,  w12               // restore actual remaining h
+        b.gt            L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+        b               9f
+.endm
+        fguv_loop_sx1   0, 0, 0
+        fguv_loop_sx1   0, 0, 1
+        fguv_loop_sx1   0, 1, 0
+        fguv_loop_sx1   0, 1, 1
+        fguv_loop_sx1   1, 0, 0
+        fguv_loop_sx1   1, 0, 1
+        fguv_loop_sx1   1, 1, 0
+        fguv_loop_sx1   1, 1, 1
+
+9:
+        ldp             d14, d15, [sp, #64]
+        ldp             d12, d13, [sp, #48]
+        ldp             d10, d11, [sp, #32]
+        ldp             d8,  d9,  [sp, #16]
+        ldr             x30,      [sp], #80
+        AARCH64_VALIDATE_LINK_REGISTER
+        ret
+
+L(fguv_loop_sx1_tbl):
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
+        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
+endfunc