1 files changed, 3611 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S
new file mode 100644
index 0000000000..1bfb12ebb3
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/mc16.S
@@ -0,0 +1,3611 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
+
+.macro avg d0, d1, t0, t1, t2, t3
+        ld1             {\t0\().8h,\t1\().8h},  [x2],  32 // first input: 16 halfwords from [x2], x2 += 32
+        ld1             {\t2\().8h,\t3\().8h},  [x3],  32 // second input: 16 halfwords from [x3], x3 += 32
+        sqadd           \t0\().8h,  \t0\().8h,  \t2\().8h // saturating sum of the two inputs
+        sqadd           \t1\().8h,  \t1\().8h,  \t3\().8h
+        smax            \t0\().8h,  \t0\().8h,  v28.8h // lower clamp at v28 = -2*PREP_BIAS - (1 << intermediate_bits)
+        smax            \t1\().8h,  \t1\().8h,  v28.8h // lower clamp at v28 = -2*PREP_BIAS - (1 << intermediate_bits)
+        sqsub           \t0\().8h,  \t0\().8h,  v28.8h // subtract v28 (saturating), result is >= 0 after the clamp
+        sqsub           \t1\().8h,  \t1\().8h,  v28.8h // subtract v28 (saturating), result is >= 0 after the clamp
+        sshl            \d0\().8h,  \t0\().8h,  v29.8h // shift by v29 = -(intermediate_bits+1), i.e. a right shift, output in d0
+        sshl            \d1\().8h,  \t1\().8h,  v29.8h // shift by v29 = -(intermediate_bits+1), i.e. a right shift, output in d1
+.endm
+
+.macro w_avg d0, d1, t0, t1, t2, t3
+        ld1             {\t0\().8h,\t1\().8h},  [x2],  32 // first input: 16 halfwords from [x2], x2 += 32
+        ld1             {\t2\().8h,\t3\().8h},  [x3],  32 // second input: 16 halfwords from [x3], x3 += 32
+        // This difference requires a 17 bit range, and all bits are
+        // significant for the following multiplication.
+        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h // second - first, widened to 32 bit
+        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
+        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
+        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
+        mul             \d0\().4s,  \d0\().4s,  v27.4s // * v27 (bidir_fn loads v27 with the negated w6 for w_avg)
+        mul             \t0\().4s,  \t0\().4s,  v27.4s
+        mul             \d1\().4s,  \d1\().4s,  v27.4s
+        mul             \t1\().4s,  \t1\().4s,  v27.4s
+        sshr            \d0\().4s,  \d0\().4s,  #4 // >> 4
+        sshr            \t0\().4s,  \t0\().4s,  #4
+        sshr            \d1\().4s,  \d1\().4s,  #4
+        sshr            \t1\().4s,  \t1\().4s,  #4
+        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h // + second input (widening add)
+        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
+        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
+        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
+        uzp1            \d0\().8h,  \d0\().8h,  \t0\().8h // Same as xtn, xtn2
+        uzp1            \d1\().8h,  \d1\().8h,  \t1\().8h // Ditto
+        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
+        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
+        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
+        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
+        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
+        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
+        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
+        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
+.endm
+
+.macro mask d0, d1, t0, t1, t2, t3
+        ld1             {v27.16b}, [x6],  16 // m: 16 mask bytes from [x6], x6 += 16
+        ld1             {\t0\().8h,\t1\().8h},  [x2],  32 // first input: 16 halfwords from [x2], x2 += 32
+        neg             v27.16b, v27.16b // -m
+        ld1             {\t2\().8h,\t3\().8h},  [x3],  32 // second input: 16 halfwords from [x3], x3 += 32
+        sxtl            v26.8h,  v27.8b // sign extend -m from 8 to 16 bit
+        sxtl2           v27.8h,  v27.16b
+        sxtl            v24.4s,  v26.4h // then to 32 bit, v24-v27 hold -m for 4 pixels each (scratch registers)
+        sxtl2           v25.4s,  v26.8h
+        sxtl            v26.4s,  v27.4h
+        sxtl2           v27.4s,  v27.8h
+        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h // second - first, widened to 32 bit
+        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
+        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
+        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
+        mul             \d0\().4s,  \d0\().4s,  v24.4s // * -m
+        mul             \t0\().4s,  \t0\().4s,  v25.4s
+        mul             \d1\().4s,  \d1\().4s,  v26.4s
+        mul             \t1\().4s,  \t1\().4s,  v27.4s
+        sshr            \d0\().4s,  \d0\().4s,  #6 // >> 6
+        sshr            \t0\().4s,  \t0\().4s,  #6
+        sshr            \d1\().4s,  \d1\().4s,  #6
+        sshr            \t1\().4s,  \t1\().4s,  #6
+        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h // + second input (widening add)
+        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
+        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
+        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
+        uzp1            \d0\().8h,  \d0\().8h,  \t0\().8h  // Same as xtn, xtn2
+        uzp1            \d1\().8h,  \d1\().8h,  \t1\().8h  // Ditto
+        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
+        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
+        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
+        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
+        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
+        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
+        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
+        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
+.endm
+
+.macro bidir_fn type, bdmax
+function \type\()_16bpc_neon, export=1
+        clz             w4,  w4 // clz(w4), w4 is the width that selects a handler from the jump table
+.ifnc \type, avg
+        dup             v31.8h,  \bdmax // bitdepth_max
+        movi            v30.8h,  #0 // 0, lower clamp used by the w_avg and mask macros
+.endif
+        clz             w7,  \bdmax // clz(bitdepth_max)
+        sub             w7,  w7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
+.ifc \type, avg
+        mov             w9,  #1
+        mov             w8,  #-2*PREP_BIAS
+        lsl             w9,  w9,  w7    // 1 << intermediate_bits
+        add             w7,  w7,  #1    // intermediate_bits + 1
+        sub             w8,  w8,  w9    // -2*PREP_BIAS - (1 << intermediate_bits)
+        neg             w7,  w7         // -(intermediate_bits+1)
+        dup             v28.8h,   w8    // -2*PREP_BIAS - (1 << intermediate_bits)
+        dup             v29.8h,   w7    // -(intermediate_bits+1)
+.else
+        mov             w8,  #PREP_BIAS
+        lsr             w8,  w8,  w7    // PREP_BIAS >> intermediate_bits
+        neg             w7,  w7         // -intermediate_bits
+        dup             v28.8h,  w8     // PREP_BIAS >> intermediate_bits
+        dup             v29.8h,  w7     // -intermediate_bits
+.endif
+.ifc \type, w_avg
+        dup             v27.4s,  w6     // w6 is the multiplier applied to (second - first) in the w_avg macro
+        neg             v27.4s,  v27.4s // negated, since the macro computes second + (((second - first) * v27) >> 4)
+.endif
+        adr             x7,  L(\type\()_tbl) // x7 = jump table base
+        sub             w4,  w4,  #24   // table index = clz(w4) - 24, index 0 is the 128 wide handler
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        ldrh            w4,  [x7, x4, lsl #1] // 16 bit entry = table address - handler address
+        sub             x7,  x7,  w4, uxtw // handler address
+        br              x7              // v4/v5 already hold the first 16 output pixels
+40:
+        AARCH64_VALID_JUMP_TARGET
+        add             x7,  x0,  x1    // x7 = dst + stride, pointer for odd rows
+        lsl             x1,  x1,  #1    // stride *= 2, x0 walks even rows and x7 odd rows
+4:
+        subs            w5,  w5,  #4    // 16 pixels = four rows of 4, w5 counts remaining rows
+        st1             {v4.d}[0],  [x0], x1 // row 0
+        st1             {v4.d}[1],  [x7], x1 // row 1
+        st1             {v5.d}[0],  [x0], x1 // row 2
+        st1             {v5.d}[1],  [x7], x1 // row 3
+        b.le            0f              // no rows left
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               4b
+80:
+        AARCH64_VALID_JUMP_TARGET
+        add             x7,  x0,  x1    // x7 = dst + stride, pointer for odd rows
+        lsl             x1,  x1,  #1    // stride *= 2
+8:
+        st1             {v4.8h},  [x0], x1 // even row, 8 pixels
+        subs            w5,  w5,  #2    // two rows per iteration
+        st1             {v5.8h},  [x7], x1 // odd row, 8 pixels
+        b.le            0f
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               8b
+16:
+        AARCH64_VALID_JUMP_TARGET
+        \type           v6,  v7,  v0,  v1,  v2,  v3
+        st1             {v4.8h, v5.8h}, [x0], x1 // one row of 16 pixels
+        subs            w5,  w5,  #2    // two rows per iteration
+        st1             {v6.8h, v7.8h}, [x0], x1 // next row of 16 pixels
+        b.le            0f
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               16b
+32:
+        AARCH64_VALID_JUMP_TARGET
+        \type           v6,  v7,  v0,  v1,  v2,  v3
+        subs            w5,  w5,  #1    // one row per iteration
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1 // one row of 32 pixels
+        b.le            0f
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               32b
+640:
+        AARCH64_VALID_JUMP_TARGET
+        add             x7,  x0,  #64   // x7 = dst + 64 bytes, i.e. pixel 32 of the same row
+64:
+        \type           v6,  v7,  v0,  v1,  v2,  v3
+        \type           v16, v17, v0,  v1,  v2,  v3
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1 // pixels 0-31 of the row
+        \type           v18, v19, v0,  v1,  v2,  v3
+        subs            w5,  w5,  #1    // one row per iteration
+        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1 // pixels 32-63 of the row
+        b.le            0f
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               64b
+1280:
+        AARCH64_VALID_JUMP_TARGET
+        add             x7,  x0,  #64   // x7 = dst + 64 bytes, i.e. pixel 32 of the same row
+        mov             x8,  #128       // step from the first to the second 64 pixel half of a row
+        sub             x1,  x1,  #128  // stride - 128, step from the second half to the next row
+128:
+        \type           v6,  v7,  v0,  v1,  v2,  v3
+        \type           v16, v17, v0,  v1,  v2,  v3
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x8 // pixels 0-31
+        \type           v18, v19, v0,  v1,  v2,  v3
+        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8 // pixels 32-63
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        \type           v6,  v7,  v0,  v1,  v2,  v3
+        \type           v16, v17, v0,  v1,  v2,  v3
+        subs            w5,  w5,  #1    // one row per iteration
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1 // pixels 64-95, then on to the next row
+        \type           v18, v19, v0,  v1,  v2,  v3
+        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1 // pixels 96-127, then on to the next row
+        b.le            0f
+        \type           v4,  v5,  v0,  v1,  v2,  v3
+        b               128b
+0:
+        ret
+L(\type\()_tbl):
+        .hword L(\type\()_tbl) - 1280b // index 0, width 128
+        .hword L(\type\()_tbl) -  640b // index 1, width 64
+        .hword L(\type\()_tbl) -   32b // index 2, width 32
+        .hword L(\type\()_tbl) -   16b // index 3, width 16
+        .hword L(\type\()_tbl) -   80b // index 4, width 8
+        .hword L(\type\()_tbl) -   40b // index 5, width 4
+endfunc
+.endm
+
+bidir_fn avg, w6
+bidir_fn w_avg, w7
+bidir_fn mask, w7
+
+
+.macro w_mask_fn type
+function w_mask_\type\()_16bpc_neon, export=1
+        ldr             w8,  [sp]     // bitdepth_max, read from the stack
+        clz             w9,  w4       // clz(w4), w4 is the width that selects a handler from the jump table
+        adr             x10, L(w_mask_\type\()_tbl) // x10 = jump table base
+        dup             v31.8h,  w8   // bitdepth_max
+        sub             w9,  w9,  #24 // table index = clz(w4) - 24, index 0 is width 128
+        clz             w8,  w8       // clz(bitdepth_max)
+        ldrh            w9,  [x10,  x9,  lsl #1] // 16 bit entry = table address - handler address
+        sub             x10, x10, w9,  uxtw // handler address
+        sub             w8,  w8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
+        mov             w9,  #PREP_BIAS*64
+        neg             w8,  w8       // -sh
+        mov             w11, #27615   // ((64 + 1 - 38) << mask_sh) - 1 - mask_rnd
+        dup             v30.4s,  w9   // PREP_BIAS*64
+        dup             v29.4s,  w8   // -sh
+        dup             v0.8h,   w11  // 27615
+.if \type == 444
+        movi            v1.16b,  #64  // 64, used to turn 64 - m into m
+.elseif \type == 422
+        dup             v2.8b,   w7   // sign (w7)
+        movi            v3.8b,   #129
+        sub             v3.8b,   v3.8b,   v2.8b // 129 - sign
+.elseif \type == 420
+        dup             v2.8h,   w7   // sign (w7)
+        movi            v3.8h,   #1, lsl #8 // 256
+        sub             v3.8h,   v3.8h,   v2.8h // 256 - sign
+.endif
+        add             x12,  x0,  x1 // x12 = dst + stride, pointer for odd rows
+        lsl             x1,   x1,  #1 // stride *= 2, x0 walks even rows and x12 odd rows
+        br              x10
+4:
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
+        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
+        subs            w5,  w5,  #4              // w5 counts remaining rows
+        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
+        sabd            v21.8h,  v5.8h,   v7.8h
+        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
+        ssubl2          v17.4s,  v6.8h,   v4.8h
+        ssubl           v18.4s,  v7.4h,   v5.4h
+        ssubl2          v19.4s,  v7.8h,   v5.8h
+        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
+        uqsub           v21.8h,  v0.8h,   v21.8h
+        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
+        sshll           v6.4s,   v5.4h,   #6
+        sshll2          v5.4s,   v4.8h,   #6
+        sshll           v4.4s,   v4.4h,   #6
+        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
+        ushr            v21.8h,  v21.8h,  #10
+        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
+        add             v5.4s,   v5.4s,   v30.4s
+        add             v6.4s,   v6.4s,   v30.4s
+        add             v7.4s,   v7.4s,   v30.4s
+        uxtl            v22.4s,  v20.4h           // 64-m widened to 32 bit
+        uxtl2           v23.4s,  v20.8h
+        uxtl            v24.4s,  v21.4h
+        uxtl2           v25.4s,  v21.8h
+        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
+        mla             v5.4s,   v17.4s,  v23.4s
+        mla             v6.4s,   v18.4s,  v24.4s
+        mla             v7.4s,   v19.4s,  v25.4s
+        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+        srshl           v5.4s,   v5.4s,   v29.4s
+        srshl           v6.4s,   v6.4s,   v29.4s
+        srshl           v7.4s,   v7.4s,   v29.4s
+        sqxtun          v4.4h,   v4.4s            // iclip_pixel
+        sqxtun2         v4.8h,   v5.4s
+        sqxtun          v5.4h,   v6.4s
+        sqxtun2         v5.8h,   v7.4s
+        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
+        umin            v5.8h,   v5.8h,   v31.8h
+.if \type == 444
+        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
+        sub             v20.16b, v1.16b,  v20.16b // m
+        st1             {v20.16b}, [x6], #16      // 16 mask bytes to [x6], x6 += 16
+.elseif \type == 422
+        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
+        xtn             v20.8b,  v20.8h
+        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+        st1             {v20.8b}, [x6], #8        // 8 mask bytes to [x6], x6 += 8
+.elseif \type == 420
+        trn1            v24.2d,  v20.2d,  v21.2d  // rows 0 and 2
+        trn2            v25.2d,  v20.2d,  v21.2d  // rows 1 and 3
+        add             v24.8h,  v24.8h,  v25.8h  // (64 - my1) + (64 - my2) (row wise addition)
+        addp            v20.8h,  v24.8h,  v24.8h  // (128 - m) + (128 - n) (column wise addition)
+        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
+        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+        st1             {v20.s}[0], [x6], #4      // 4 mask bytes to [x6], x6 += 4
+.endif
+        st1             {v4.d}[0],  [x0],  x1     // row 0
+        st1             {v4.d}[1],  [x12], x1     // row 1
+        st1             {v5.d}[0],  [x0],  x1     // row 2
+        st1             {v5.d}[1],  [x12], x1     // row 3
+        b.gt            4b                        // flags from the subs above
+        ret
+8:
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
+        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
+        subs            w5,  w5,  #2              // two rows of 8 pixels per iteration
+        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
+        sabd            v21.8h,  v5.8h,   v7.8h
+        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
+        ssubl2          v17.4s,  v6.8h,   v4.8h
+        ssubl           v18.4s,  v7.4h,   v5.4h
+        ssubl2          v19.4s,  v7.8h,   v5.8h
+        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
+        uqsub           v21.8h,  v0.8h,   v21.8h
+        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
+        sshll           v6.4s,   v5.4h,   #6
+        sshll2          v5.4s,   v4.8h,   #6
+        sshll           v4.4s,   v4.4h,   #6
+        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
+        ushr            v21.8h,  v21.8h,  #10
+        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
+        add             v5.4s,   v5.4s,   v30.4s
+        add             v6.4s,   v6.4s,   v30.4s
+        add             v7.4s,   v7.4s,   v30.4s
+        uxtl            v22.4s,  v20.4h           // 64-m widened to 32 bit
+        uxtl2           v23.4s,  v20.8h
+        uxtl            v24.4s,  v21.4h
+        uxtl2           v25.4s,  v21.8h
+        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
+        mla             v5.4s,   v17.4s,  v23.4s
+        mla             v6.4s,   v18.4s,  v24.4s
+        mla             v7.4s,   v19.4s,  v25.4s
+        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+        srshl           v5.4s,   v5.4s,   v29.4s
+        srshl           v6.4s,   v6.4s,   v29.4s
+        srshl           v7.4s,   v7.4s,   v29.4s
+        sqxtun          v4.4h,   v4.4s            // iclip_pixel
+        sqxtun2         v4.8h,   v5.4s
+        sqxtun          v5.4h,   v6.4s
+        sqxtun2         v5.8h,   v7.4s
+        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
+        umin            v5.8h,   v5.8h,   v31.8h
+.if \type == 444
+        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
+        sub             v20.16b, v1.16b,  v20.16b // m
+        st1             {v20.16b}, [x6], #16      // 16 mask bytes to [x6], x6 += 16
+.elseif \type == 422
+        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
+        xtn             v20.8b,  v20.8h
+        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+        st1             {v20.8b}, [x6], #8        // 8 mask bytes to [x6], x6 += 8
+.elseif \type == 420
+        add             v20.8h,  v20.8h,  v21.8h  // (64 - my1) + (64 - my2) (row wise addition)
+        addp            v20.8h,  v20.8h,  v20.8h  // (128 - m) + (128 - n) (column wise addition)
+        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
+        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+        st1             {v20.s}[0], [x6], #4      // 4 mask bytes to [x6], x6 += 4
+.endif
+        st1             {v4.8h}, [x0],  x1        // even row
+        st1             {v5.8h}, [x12], x1        // odd row
+        b.gt            8b                        // flags from the subs above
+        ret
+1280:
+640:
+320:
+160:
+        AARCH64_VALID_JUMP_TARGET
+        mov             w11, w4                   // x11 = w4 zero extended, used for the 422 mask row step
+        sub             x1,  x1,  w4,  uxtw #1    // 2*stride - 2*w4 bytes, from the end of a row to two rows down
+.if \type == 444
+        add             x10, x6,  w4,  uxtw       // x10 = mask pointer for the second row (w4 bytes per row)
+.elseif \type == 422
+        add             x10, x6,  x11, lsr #1     // x10 = mask pointer for the second row (w4/2 bytes per row)
+.endif
+        add             x9,  x3,  w4,  uxtw #1    // x9 = tmp2, second row
+        add             x7,  x2,  w4,  uxtw #1    // x7 = tmp1, second row
+161:
+        mov             w8,  w4                   // w8 = pixels left in the current pair of rows
+16:
+        ld1             {v4.8h,   v5.8h},  [x2], #32 // tmp1
+        ld1             {v16.8h,  v17.8h}, [x3], #32 // tmp2
+        ld1             {v6.8h,   v7.8h},  [x7], #32 // tmp1, second row
+        ld1             {v18.8h,  v19.8h}, [x9], #32 // tmp2, second row
+        subs            w8,  w8,  #16             // 16 pixels of each of the two rows per iteration
+        sabd            v20.8h,  v4.8h,   v16.8h  // abs(tmp1 - tmp2)
+        sabd            v21.8h,  v5.8h,   v17.8h
+        ssubl           v22.4s,  v16.4h,  v4.4h   // tmp2 - tmp1 (requires 17 bit)
+        ssubl2          v23.4s,  v16.8h,  v4.8h
+        ssubl           v24.4s,  v17.4h,  v5.4h
+        ssubl2          v25.4s,  v17.8h,  v5.8h
+        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
+        uqsub           v21.8h,  v0.8h,   v21.8h
+        sshll2          v27.4s,  v5.8h,   #6      // tmp1 << 6
+        sshll           v26.4s,  v5.4h,   #6
+        sshll2          v5.4s,   v4.8h,   #6
+        sshll           v4.4s,   v4.4h,   #6
+        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
+        ushr            v21.8h,  v21.8h,  #10
+        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
+        add             v5.4s,   v5.4s,   v30.4s
+        add             v26.4s,  v26.4s,  v30.4s
+        add             v27.4s,  v27.4s,  v30.4s
+        uxtl            v16.4s,  v20.4h           // 64-m widened to 32 bit
+        uxtl2           v17.4s,  v20.8h
+        uxtl            v28.4s,  v21.4h
+        mla             v4.4s,   v22.4s,  v16.4s  // (tmp2-tmp1)*(64-m)
+        uxtl2           v16.4s,  v21.8h           // v16 is reused once the mla above has read it
+        mla             v5.4s,   v23.4s,  v17.4s
+        mla             v26.4s,  v24.4s,  v28.4s
+        mla             v27.4s,  v25.4s,  v16.4s
+        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+        srshl           v5.4s,   v5.4s,   v29.4s
+        srshl           v26.4s,  v26.4s,  v29.4s
+        srshl           v27.4s,  v27.4s,  v29.4s
+        sqxtun          v4.4h,   v4.4s            // iclip_pixel
+        sqxtun2         v4.8h,   v5.4s
+        sqxtun          v5.4h,   v26.4s
+        sqxtun2         v5.8h,   v27.4s
+
+        // Start of other half
+        sabd            v22.8h,  v6.8h,   v18.8h  // abs(tmp1 - tmp2)
+        sabd            v23.8h,  v7.8h,   v19.8h
+
+        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
+        umin            v5.8h,   v5.8h,   v31.8h
+
+        ssubl           v16.4s,  v18.4h,  v6.4h   // tmp2 - tmp1 (requires 17 bit)
+        ssubl2          v17.4s,  v18.8h,  v6.8h
+        ssubl           v18.4s,  v19.4h,  v7.4h
+        ssubl2          v19.4s,  v19.8h,  v7.8h
+        uqsub           v22.8h,  v0.8h,   v22.8h  // 27615 - abs()
+        uqsub           v23.8h,  v0.8h,   v23.8h
+        sshll           v24.4s,  v6.4h,   #6      // tmp1 << 6
+        sshll2          v25.4s,  v6.8h,   #6
+        sshll           v26.4s,  v7.4h,   #6
+        sshll2          v27.4s,  v7.8h,   #6
+        ushr            v22.8h,  v22.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
+        ushr            v23.8h,  v23.8h,  #10
+        add             v24.4s,  v24.4s,  v30.4s  // += PREP_BIAS*64
+        add             v25.4s,  v25.4s,  v30.4s
+        add             v26.4s,  v26.4s,  v30.4s
+        add             v27.4s,  v27.4s,  v30.4s
+        uxtl            v6.4s,   v22.4h           // 64-m widened to 32 bit
+        uxtl2           v7.4s,   v22.8h
+        uxtl            v28.4s,  v23.4h
+        mla             v24.4s,  v16.4s,  v6.4s   // (tmp2-tmp1)*(64-m)
+        uxtl2           v6.4s,   v23.8h           // v6 is reused once the mla above has read it
+        mla             v25.4s,  v17.4s,  v7.4s
+        mla             v26.4s,  v18.4s,  v28.4s
+        mla             v27.4s,  v19.4s,  v6.4s
+        srshl           v24.4s,  v24.4s,  v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+        srshl           v25.4s,  v25.4s,  v29.4s
+        srshl           v26.4s,  v26.4s,  v29.4s
+        srshl           v27.4s,  v27.4s,  v29.4s
+        sqxtun          v6.4h,   v24.4s           // iclip_pixel
+        sqxtun2         v6.8h,   v25.4s
+        sqxtun          v7.4h,   v26.4s
+        sqxtun2         v7.8h,   v27.4s
+        umin            v6.8h,   v6.8h,   v31.8h  // iclip_pixel
+        umin            v7.8h,   v7.8h,   v31.8h
+.if \type == 444
+        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
+        uzp1            v21.16b, v22.16b, v23.16b
+        sub             v20.16b, v1.16b,  v20.16b // m
+        sub             v21.16b, v1.16b,  v21.16b
+        st1             {v20.16b}, [x6],  #16     // mask, first row
+        st1             {v21.16b}, [x10], #16     // mask, second row
+.elseif \type == 422
+        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
+        addp            v21.8h,  v22.8h,  v23.8h
+        xtn             v20.8b,  v20.8h
+        xtn             v21.8b,  v21.8h
+        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+        uhsub           v21.8b,  v3.8b,   v21.8b
+        st1             {v20.8b}, [x6],  #8       // mask, first row
+        st1             {v21.8b}, [x10], #8       // mask, second row
+.elseif \type == 420
+        add             v20.8h,  v20.8h,  v22.8h  // (64 - my1) + (64 - my2) (row wise addition)
+        add             v21.8h,  v21.8h,  v23.8h
+        addp            v20.8h,  v20.8h,  v21.8h  // (128 - m) + (128 - n) (column wise addition)
+        sub             v20.8h,  v3.8h,   v20.8h  // (256 - sign) - ((128 - m) + (128 - n))
+        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+        st1             {v20.8b}, [x6], #8        // 8 mask bytes cover both rows
+.endif
+        st1             {v4.8h, v5.8h}, [x0],  #32 // dst, first row
+        st1             {v6.8h, v7.8h}, [x12], #32 // dst, second row
+        b.gt            16b                       // flags from the subs on w8
+        subs            w5,  w5,  #2              // two rows finished
+        add             x2,  x2,  w4,  uxtw #1    // skip the row that was read through x7
+        add             x3,  x3,  w4,  uxtw #1    // skip the row that was read through x9
+        add             x7,  x7,  w4,  uxtw #1    // keep x7 one row ahead of x2
+        add             x9,  x9,  w4,  uxtw #1    // keep x9 one row ahead of x3
+.if \type == 444
+        add             x6,  x6,  w4,  uxtw       // skip the mask row written through x10
+        add             x10, x10, w4,  uxtw
+.elseif \type == 422
+        add             x6,  x6,  x11, lsr #1     // skip the mask row written through x10
+        add             x10, x10, x11, lsr #1
+.endif
+        add             x0,  x0,  x1              // x1 = 2*stride - 2*w4, start of the row two down
+        add             x12, x12, x1
+        b.gt            161b                      // flags from the subs on w5, the adds do not set flags
+        ret
+L(w_mask_\type\()_tbl):
+        .hword L(w_mask_\type\()_tbl) - 1280b // index 0, width 128
+        .hword L(w_mask_\type\()_tbl) -  640b // index 1, width 64
+        .hword L(w_mask_\type\()_tbl) -  320b // index 2, width 32
+        .hword L(w_mask_\type\()_tbl) -  160b // index 3, width 16
+        .hword L(w_mask_\type\()_tbl) -    8b // index 4, width 8
+        .hword L(w_mask_\type\()_tbl) -    4b // index 5, width 4
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
+
+function blend_16bpc_neon, export=1
+        adr             x6,  L(blend_tbl)         // x6 = jump table base
+        clz             w3,  w3                   // clz(w3), w3 is the width that selects a handler
+        sub             w3,  w3,  #26             // table index = clz(w3) - 26, index 0 is width 32
+        ldrh            w3,  [x6,  x3,  lsl #1]   // 16 bit entry = table address - handler address
+        sub             x6,  x6,  w3,  uxtw       // handler address
+        add             x8,  x0,  x1              // x8 = dst + stride, pointer for odd rows
+        br              x6
+40:
+        AARCH64_VALID_JUMP_TARGET
+        lsl             x1,  x1,  #1              // stride *= 2, x0 walks even rows and x8 odd rows
+4:
+        ld1             {v2.8b},   [x5], #8       // m: 8 mask bytes from [x5] (two rows of 4)
+        ld1             {v1.8h},   [x2], #16      // b: two rows of 4 pixels from [x2]
+        ld1             {v0.d}[0], [x0]           // a: even row of dst
+        neg             v2.8b,   v2.8b            // -m
+        subs            w4,  w4,  #2              // two rows per iteration, w4 counts remaining rows
+        ld1             {v0.d}[1], [x8]           // a: odd row of dst
+        sxtl            v2.8h,   v2.8b            // sign extend -m to 16 bit
+        shl             v2.8h,   v2.8h,   #9      // -m << 9
+        sub             v1.8h,   v0.8h,   v1.8h   // a - b
+        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
+        add             v0.8h,   v0.8h,   v1.8h   // a + (((a-b)*-m + 32) >> 6)
+        st1             {v0.d}[0], [x0], x1       // write back even row
+        st1             {v0.d}[1], [x8], x1       // write back odd row
+        b.gt            4b                        // flags from the subs above
+        ret
+80:
+        AARCH64_VALID_JUMP_TARGET
+        lsl             x1,  x1,  #1              // stride *= 2
+8:
+        ld1             {v4.16b},       [x5], #16 // m: 16 mask bytes (two rows of 8)
+        ld1             {v2.8h, v3.8h}, [x2], #32 // b: two rows of 8 pixels
+        neg             v5.16b,  v4.16b           // -m
+        ld1             {v0.8h},   [x0]           // a: even row of dst
+        ld1             {v1.8h},   [x8]           // a: odd row of dst
+        sxtl            v4.8h,   v5.8b            // sign extend -m to 16 bit
+        sxtl2           v5.8h,   v5.16b
+        shl             v4.8h,   v4.8h,   #9      // -m << 9
+        shl             v5.8h,   v5.8h,   #9
+        sub             v2.8h,   v0.8h,   v2.8h   // a - b
+        sub             v3.8h,   v1.8h,   v3.8h
+        subs            w4,  w4,  #2              // two rows per iteration
+        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v3.8h,   v3.8h,   v5.8h
+        add             v0.8h,   v0.8h,   v2.8h   // a + (((a-b)*-m + 32) >> 6)
+        add             v1.8h,   v1.8h,   v3.8h
+        st1             {v0.8h}, [x0], x1         // write back even row
+        st1             {v1.8h}, [x8], x1         // write back odd row
+        b.gt            8b
+        ret
+160:
+        AARCH64_VALID_JUMP_TARGET
+        lsl             x1,  x1,  #1              // stride *= 2
+16:
+        ld1             {v16.16b, v17.16b},           [x5], #32 // m: 32 mask bytes (two rows of 16)
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 // b: two rows of 16 pixels
+        subs            w4,  w4,  #2              // two rows per iteration
+        neg             v18.16b, v16.16b          // -m
+        neg             v19.16b, v17.16b
+        ld1             {v0.8h, v1.8h}, [x0]      // a: even row of dst
+        sxtl            v16.8h,  v18.8b           // sign extend -m to 16 bit
+        sxtl2           v17.8h,  v18.16b
+        sxtl            v18.8h,  v19.8b
+        sxtl2           v19.8h,  v19.16b
+        ld1             {v2.8h, v3.8h}, [x8]      // a: odd row of dst
+        shl             v16.8h,  v16.8h,  #9      // -m << 9
+        shl             v17.8h,  v17.8h,  #9
+        shl             v18.8h,  v18.8h,  #9
+        shl             v19.8h,  v19.8h,  #9
+        sub             v4.8h,   v0.8h,   v4.8h   // a - b
+        sub             v5.8h,   v1.8h,   v5.8h
+        sub             v6.8h,   v2.8h,   v6.8h
+        sub             v7.8h,   v3.8h,   v7.8h
+        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v5.8h,   v5.8h,   v17.8h
+        sqrdmulh        v6.8h,   v6.8h,   v18.8h
+        sqrdmulh        v7.8h,   v7.8h,   v19.8h
+        add             v0.8h,   v0.8h,   v4.8h   // a + (((a-b)*-m + 32) >> 6)
+        add             v1.8h,   v1.8h,   v5.8h
+        add             v2.8h,   v2.8h,   v6.8h
+        add             v3.8h,   v3.8h,   v7.8h
+        st1             {v0.8h, v1.8h}, [x0], x1  // write back even row
+        st1             {v2.8h, v3.8h}, [x8], x1  // write back odd row
+        b.gt            16b
+        ret
+32:
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v16.16b, v17.16b},           [x5], #32 // m: 32 mask bytes (one row of 32)
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 // b: one row of 32 pixels
+        subs            w4,  w4,  #1              // one row per iteration, x8 is not used here
+        neg             v18.16b, v16.16b          // -m
+        neg             v19.16b, v17.16b
+        sxtl            v16.8h,  v18.8b           // sign extend -m to 16 bit
+        sxtl2           v17.8h,  v18.16b
+        sxtl            v18.8h,  v19.8b
+        sxtl2           v19.8h,  v19.16b
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] // a: one row of dst
+        shl             v16.8h,  v16.8h,  #9      // -m << 9
+        shl             v17.8h,  v17.8h,  #9
+        shl             v18.8h,  v18.8h,  #9
+        shl             v19.8h,  v19.8h,  #9
+        sub             v4.8h,   v0.8h,   v4.8h   // a - b
+        sub             v5.8h,   v1.8h,   v5.8h
+        sub             v6.8h,   v2.8h,   v6.8h
+        sub             v7.8h,   v3.8h,   v7.8h
+        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v5.8h,   v5.8h,   v17.8h
+        sqrdmulh        v6.8h,   v6.8h,   v18.8h
+        sqrdmulh        v7.8h,   v7.8h,   v19.8h
+        add             v0.8h,   v0.8h,   v4.8h   // a + (((a-b)*-m + 32) >> 6)
+        add             v1.8h,   v1.8h,   v5.8h
+        add             v2.8h,   v2.8h,   v6.8h
+        add             v3.8h,   v3.8h,   v7.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 // write back the row, x1 is the unscaled stride here
+        b.gt            32b
+        ret
+L(blend_tbl):
+        .hword L(blend_tbl) -  32b // index 0, width 32
+        .hword L(blend_tbl) - 160b // index 1, width 16
+        .hword L(blend_tbl) -  80b // index 2, width 8
+        .hword L(blend_tbl) -  40b // index 3, width 4
+endfunc
+
+function blend_h_16bpc_neon, export=1 // x0 = dst, x1 = dst byte stride, x2 = tmp, w3 = w, w4 = h; blends tmp into dst with one obmc mask per row
+        adr             x6,  L(blend_h_tbl)
+        movrel          x5,  X(obmc_masks)
+        add             x5,  x5,  w4,  uxtw       // x5 = obmc_masks + h
+        sub             w4,  w4,  w4,  lsr #2     // h -= h/4, only that many rows are processed
+        clz             w7,  w3
+        add             x8,  x0,  x1              // x8 = dst row 1
+        lsl             x1,  x1,  #1              // stride *= 2, rows are handled in pairs
+        sub             w7,  w7,  #24             // table index = clz(w) - 24
+        ldrh            w7,  [x6,  x7,  lsl #1]
+        sub             x6,  x6,  w7, uxtw        // entries hold table address minus target
+        br              x6
+2: // w == 2, two rows per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ld2r            {v2.8b, v3.8b}, [x5], #2  // v2 = mask of row 0, v3 = mask of row 1
+        ld1             {v1.4h},        [x2], #8  // tmp: 2 rows of 2 px
+        ext             v2.8b,   v2.8b,   v3.8b,   #6 // v2 = m0, m0, m1, m1, ...
+        subs            w4,  w4,  #2              // flags are consumed by the b.gt at the loop end
+        neg             v2.8b,   v2.8b            // -m
+        ld1             {v0.s}[0], [x0]           // dst row 0
+        ld1             {v0.s}[1], [x8]           // dst row 1
+        sxtl            v2.8h,   v2.8b
+        shl             v2.4h,   v2.4h,   #9      // -m << 9
+        sub             v1.4h,   v0.4h,   v1.4h   // a - b
+        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
+        add             v0.4h,   v0.4h,   v1.4h   // a + (((a-b)*-m + 32) >> 6)
+        st1             {v0.s}[0], [x0], x1
+        st1             {v0.s}[1], [x8], x1
+        b.gt            2b
+        ret
+4: // w == 4, two rows per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ld2r            {v2.8b, v3.8b}, [x5], #2  // v2 = mask of row 0, v3 = mask of row 1
+        ld1             {v1.8h},        [x2], #16 // tmp: 2 rows of 4 px
+        ext             v2.8b,   v2.8b,   v3.8b,   #4 // v2 = 4 x m0, 4 x m1
+        subs            w4,  w4,  #2
+        neg             v2.8b,   v2.8b            // -m
+        ld1             {v0.d}[0],   [x0]         // dst row 0
+        ld1             {v0.d}[1],   [x8]         // dst row 1
+        sxtl            v2.8h,   v2.8b
+        shl             v2.8h,   v2.8h,   #9      // -m << 9
+        sub             v1.8h,   v0.8h,   v1.8h   // a - b
+        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
+        add             v0.8h,   v0.8h,   v1.8h
+        st1             {v0.d}[0], [x0], x1
+        st1             {v0.d}[1], [x8], x1
+        b.gt            4b
+        ret
+8: // w == 8, two rows per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ld2r            {v4.8b, v5.8b}, [x5], #2  // v4 = mask of row 0, v5 = mask of row 1
+        ld1             {v2.8h, v3.8h}, [x2], #32 // tmp rows 0 and 1
+        neg             v4.8b,   v4.8b            // -m
+        neg             v5.8b,   v5.8b
+        ld1             {v0.8h}, [x0]             // dst row 0
+        subs            w4,  w4,  #2
+        sxtl            v4.8h,   v4.8b
+        sxtl            v5.8h,   v5.8b
+        ld1             {v1.8h}, [x8]             // dst row 1
+        shl             v4.8h,   v4.8h,   #9      // -m << 9
+        shl             v5.8h,   v5.8h,   #9
+        sub             v2.8h,   v0.8h,   v2.8h   // a - b
+        sub             v3.8h,   v1.8h,   v3.8h
+        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v3.8h,   v3.8h,   v5.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v1.8h,   v1.8h,   v3.8h
+        st1             {v0.8h}, [x0], x1
+        st1             {v1.8h}, [x8], x1
+        b.gt            8b
+        ret
+16: // w == 16, two rows per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ld2r            {v16.8b, v17.8b}, [x5], #2 // v16 = mask of row 0, v17 = mask of row 1
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 // tmp: v4-v5 = row 0, v6-v7 = row 1
+        neg             v16.8b,  v16.8b           // -m
+        neg             v17.8b,  v17.8b
+        ld1             {v0.8h, v1.8h},  [x0]     // dst row 0
+        ld1             {v2.8h, v3.8h},  [x8]     // dst row 1
+        subs            w4,  w4,  #2
+        sxtl            v16.8h,  v16.8b
+        sxtl            v17.8h,  v17.8b
+        shl             v16.8h,  v16.8h,  #9      // -m << 9
+        shl             v17.8h,  v17.8h,  #9
+        sub             v4.8h,   v0.8h,   v4.8h   // a - b
+        sub             v5.8h,   v1.8h,   v5.8h
+        sub             v6.8h,   v2.8h,   v6.8h
+        sub             v7.8h,   v3.8h,   v7.8h
+        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v5.8h,   v5.8h,   v16.8h
+        sqrdmulh        v6.8h,   v6.8h,   v17.8h
+        sqrdmulh        v7.8h,   v7.8h,   v17.8h
+        add             v0.8h,   v0.8h,   v4.8h
+        add             v1.8h,   v1.8h,   v5.8h
+        add             v2.8h,   v2.8h,   v6.8h
+        add             v3.8h,   v3.8h,   v7.8h
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v2.8h, v3.8h}, [x8], x1
+        b.gt            16b
+        ret
+1280:
+640:
+320: // w == 32, 64 or 128 share one path that walks each row in chunks of 32 px
+        AARCH64_VALID_JUMP_TARGET
+        sub             x1,  x1,  w3,  uxtw #1    // stride -= w*2 bytes, the inner loop already advances dst by one row
+        add             x7,  x2,  w3,  uxtw #1    // x7 = tmp row 1
+321: // outer loop, one pair of rows per iteration
+        ld2r            {v24.8b, v25.8b}, [x5], #2 // v24 = mask of row 0, v25 = mask of row 1
+        mov             w6,  w3                   // w6 = pixels left in this pair of rows
+        neg             v24.8b,  v24.8b           // -m
+        neg             v25.8b,  v25.8b
+        sxtl            v24.8h,  v24.8b
+        sxtl            v25.8h,  v25.8b
+        shl             v24.8h,  v24.8h,  #9      // -m << 9
+        shl             v25.8h,  v25.8h,  #9
+32: // inner loop, 32 px of both rows per iteration
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 // tmp row 0
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0]      // dst row 0
+        subs            w6,  w6,  #32
+        sub             v16.8h,  v0.8h,   v16.8h  // a - b
+        sub             v17.8h,  v1.8h,   v17.8h
+        sub             v18.8h,  v2.8h,   v18.8h
+        sub             v19.8h,  v3.8h,   v19.8h
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 // tmp row 1
+        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8]      // dst row 1
+        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v17.8h,  v17.8h,  v24.8h
+        sqrdmulh        v18.8h,  v18.8h,  v24.8h
+        sqrdmulh        v19.8h,  v19.8h,  v24.8h
+        sub             v20.8h,  v4.8h,   v20.8h  // a - b
+        sub             v21.8h,  v5.8h,   v21.8h
+        sub             v22.8h,  v6.8h,   v22.8h
+        sub             v23.8h,  v7.8h,   v23.8h
+        add             v0.8h,   v0.8h,   v16.8h
+        add             v1.8h,   v1.8h,   v17.8h
+        add             v2.8h,   v2.8h,   v18.8h
+        add             v3.8h,   v3.8h,   v19.8h
+        sqrdmulh        v20.8h,  v20.8h,  v25.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v21.8h,  v21.8h,  v25.8h
+        sqrdmulh        v22.8h,  v22.8h,  v25.8h
+        sqrdmulh        v23.8h,  v23.8h,  v25.8h
+        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
+        add             v4.8h,   v4.8h,   v20.8h
+        add             v5.8h,   v5.8h,   v21.8h
+        add             v6.8h,   v6.8h,   v22.8h
+        add             v7.8h,   v7.8h,   v23.8h
+        st1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8], #64
+        b.gt            32b
+        subs            w4,  w4,  #2              // two more rows done
+        add             x0,  x0,  x1              // next pair of dst rows
+        add             x8,  x8,  x1
+        add             x2,  x2,  w3,  uxtw #1    // skip the tmp row that was read through x7
+        add             x7,  x7,  w3,  uxtw #1    // skip the tmp row that was read through x2
+        b.gt            321b
+        ret
+L(blend_h_tbl): // indexed by clz(w) - 24
+        .hword L(blend_h_tbl) - 1280b // w = 128
+        .hword L(blend_h_tbl) -  640b // w = 64
+        .hword L(blend_h_tbl) -  320b // w = 32
+        .hword L(blend_h_tbl) -   16b // w = 16
+        .hword L(blend_h_tbl) -    8b // w = 8
+        .hword L(blend_h_tbl) -    4b // w = 4
+        .hword L(blend_h_tbl) -    2b // w = 2
+endfunc
+
+function blend_v_16bpc_neon, export=1 // x0 = dst, x1 = dst byte stride, x2 = tmp, w3 = w, w4 = h; blends tmp into dst with one obmc mask per column
+        adr             x6,  L(blend_v_tbl)
+        movrel          x5,  X(obmc_masks)
+        add             x5,  x5,  w3,  uxtw       // x5 = obmc_masks + w
+        clz             w3,  w3
+        add             x8,  x0,  x1              // x8 = dst row 1
+        lsl             x1,  x1,  #1              // stride *= 2, rows are handled in pairs
+        sub             w3,  w3,  #26             // table index = clz(w) - 26
+        ldrh            w3,  [x6,  x3,  lsl #1]
+        sub             x6,  x6,  w3,  uxtw       // entries hold table address minus target
+        br              x6
+20: // w == 2, only column 0 of each row is written back
+        AARCH64_VALID_JUMP_TARGET
+        ld1r            {v2.8b}, [x5]             // mask of column 0 in every lane
+        neg             v2.8b,   v2.8b            // -m
+        sxtl            v2.8h,   v2.8b
+        shl             v2.4h,   v2.4h,   #9      // -m << 9
+2:
+        ld1             {v1.s}[0], [x2], #4       // tmp row 0 (2 px), lane 1 is replaced below
+        ld1             {v0.h}[0], [x0]           // dst row 0, column 0
+        subs            w4,  w4,  #2              // flags are consumed by the b.gt at the loop end
+        ld1             {v1.h}[1], [x2]           // tmp row 1, column 0
+        ld1             {v0.h}[1], [x8]           // dst row 1, column 0
+        add             x2,  x2,  #4              // step over tmp row 1
+        sub             v1.4h,   v0.4h,   v1.4h   // a - b
+        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
+        add             v0.4h,   v0.4h,   v1.4h   // a + (((a-b)*-m + 32) >> 6)
+        st1             {v0.h}[0], [x0],  x1
+        st1             {v0.h}[1], [x8],  x1
+        b.gt            2b
+        ret
+40: // w == 4, 3 px of each row are written back
+        AARCH64_VALID_JUMP_TARGET
+        ld1r            {v2.2s}, [x5]             // 4 masks, repeated for the second row
+        sub             x1,  x1,  #4              // the first store of each row already advances 4 bytes
+        neg             v2.8b,   v2.8b            // -m
+        sxtl            v2.8h,   v2.8b
+        shl             v2.8h,   v2.8h,   #9      // -m << 9
+4:
+        ld1             {v1.8h},   [x2], #16      // tmp: 2 rows of 4 px
+        ld1             {v0.d}[0], [x0]           // dst row 0
+        ld1             {v0.d}[1], [x8]           // dst row 1
+        subs            w4,  w4,  #2
+        sub             v1.8h,   v0.8h,   v1.8h   // a - b
+        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
+        add             v0.8h,   v0.8h,   v1.8h
+        st1             {v0.s}[0], [x0], #4       // columns 0-1
+        st1             {v0.s}[2], [x8], #4
+        st1             {v0.h}[2], [x0], x1       // column 2
+        st1             {v0.h}[6], [x8], x1
+        b.gt            4b
+        ret
+80: // w == 8, 6 px of each row are written back
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v4.8b}, [x5]             // 8 masks
+        sub             x1,  x1,  #8              // the first store of each row already advances 8 bytes
+        neg             v4.8b,   v4.8b            // -m
+        sxtl            v4.8h,   v4.8b
+        shl             v4.8h,   v4.8h,   #9      // -m << 9
+8:
+        ld1             {v2.8h, v3.8h}, [x2], #32 // tmp rows 0 and 1
+        ld1             {v0.8h}, [x0]             // dst row 0
+        ld1             {v1.8h}, [x8]             // dst row 1
+        subs            w4,  w4,  #2
+        sub             v2.8h,   v0.8h,   v2.8h   // a - b
+        sub             v3.8h,   v1.8h,   v3.8h
+        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v3.8h,   v3.8h,   v4.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v1.8h,   v1.8h,   v3.8h
+        st1             {v0.d}[0], [x0], #8       // columns 0-3
+        st1             {v1.d}[0], [x8], #8
+        st1             {v0.s}[2], [x0], x1       // columns 4-5
+        st1             {v1.s}[2], [x8], x1
+        b.gt            8b
+        ret
+160: // w == 16, 12 px of each row are written back
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v16.16b}, [x5]           // 16 masks
+        sub             x1,  x1,  #16             // the first store of each row already advances 16 bytes
+        neg             v17.16b, v16.16b          // -m
+        sxtl            v16.8h,  v17.8b
+        sxtl2           v17.8h,  v17.16b
+        shl             v16.8h,  v16.8h,  #9      // -m << 9
+        shl             v17.4h,  v17.4h,  #9      // only columns 8-11 are needed
+16:
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 // tmp: v4-v5 = row 0, v6-v7 = row 1
+        ld1             {v0.8h, v1.8h}, [x0]      // dst row 0
+        subs            w4,  w4,  #2
+        ld1             {v2.8h, v3.8h}, [x8]      // dst row 1
+        sub             v4.8h,   v0.8h,   v4.8h   // a - b
+        sub             v5.4h,   v1.4h,   v5.4h
+        sub             v6.8h,   v2.8h,   v6.8h
+        sub             v7.4h,   v3.4h,   v7.4h
+        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v5.4h,   v5.4h,   v17.4h
+        sqrdmulh        v6.8h,   v6.8h,   v16.8h
+        sqrdmulh        v7.4h,   v7.4h,   v17.4h
+        add             v0.8h,   v0.8h,   v4.8h
+        add             v1.4h,   v1.4h,   v5.4h
+        add             v2.8h,   v2.8h,   v6.8h
+        add             v3.4h,   v3.4h,   v7.4h
+        st1             {v0.8h}, [x0], #16        // columns 0-7
+        st1             {v2.8h}, [x8], #16
+        st1             {v1.4h}, [x0], x1         // columns 8-11
+        st1             {v3.4h}, [x8], x1
+        b.gt            16b
+        ret
+320: // w == 32, 24 px of each row are written back
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v24.16b, v25.16b},  [x5] // 32 masks, the first 24 are used
+        neg             v26.16b, v24.16b          // -m
+        neg             v27.8b,  v25.8b
+        sxtl            v24.8h,  v26.8b
+        sxtl2           v25.8h,  v26.16b
+        sxtl            v26.8h,  v27.8b
+        shl             v24.8h,  v24.8h,  #9      // -m << 9
+        shl             v25.8h,  v25.8h,  #9
+        shl             v26.8h,  v26.8h,  #9
+32:
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 // tmp row 0, v19 is unused
+        ld1             {v0.8h, v1.8h, v2.8h}, [x0]                 // dst row 0, columns 0-23
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64 // tmp row 1, v23 is unused
+        ld1             {v4.8h, v5.8h, v6.8h}, [x8]                 // dst row 1, columns 0-23
+        subs            w4,  w4,  #2
+        sub             v16.8h,  v0.8h,   v16.8h  // a - b
+        sub             v17.8h,  v1.8h,   v17.8h
+        sub             v18.8h,  v2.8h,   v18.8h
+        sub             v20.8h,  v4.8h,   v20.8h
+        sub             v21.8h,  v5.8h,   v21.8h
+        sub             v22.8h,  v6.8h,   v22.8h
+        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
+        sqrdmulh        v17.8h,  v17.8h,  v25.8h
+        sqrdmulh        v18.8h,  v18.8h,  v26.8h
+        sqrdmulh        v20.8h,  v20.8h,  v24.8h
+        sqrdmulh        v21.8h,  v21.8h,  v25.8h
+        sqrdmulh        v22.8h,  v22.8h,  v26.8h
+        add             v0.8h,   v0.8h,   v16.8h
+        add             v1.8h,   v1.8h,   v17.8h
+        add             v2.8h,   v2.8h,   v18.8h
+        add             v4.8h,   v4.8h,   v20.8h
+        add             v5.8h,   v5.8h,   v21.8h
+        add             v6.8h,   v6.8h,   v22.8h
+        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
+        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
+        b.gt            32b
+        ret
+L(blend_v_tbl): // indexed by clz(w) - 26
+        .hword L(blend_v_tbl) - 320b // w = 32
+        .hword L(blend_v_tbl) - 160b // w = 16
+        .hword L(blend_v_tbl) -  80b // w = 8
+        .hword L(blend_v_tbl) -  40b // w = 4
+        .hword L(blend_v_tbl) -  20b // w = 2
+endfunc
+
+
+// Unfiltered copy of w x h pixels. This has got the same signature as the put_8tap functions,
+// and assumes that x9 is set to (clz(w)-24). Uses x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w5 = h.
+function put_neon
+        adr             x10, L(put_tbl)
+        ldrh            w9, [x10, x9, lsl #1]
+        sub             x10, x10, w9, uxtw        // entries hold table address minus target
+        br              x10
+
+2: // w == 2, two rows per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v0.s}[0], [x2], x3
+        ld1             {v1.s}[0], [x2], x3
+        subs            w5,  w5,  #2              // flags are consumed by the b.gt at the loop end
+        st1             {v0.s}[0], [x0], x1
+        st1             {v1.s}[0], [x0], x1
+        b.gt            2b
+        ret
+4: // w == 4, two rows per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v0.4h}, [x2], x3
+        ld1             {v1.4h}, [x2], x3
+        subs            w5,  w5,  #2
+        st1             {v0.4h}, [x0], x1
+        st1             {v1.4h}, [x0], x1
+        b.gt            4b
+        ret
+80: // w == 8, even and odd rows use separate pointers
+        AARCH64_VALID_JUMP_TARGET
+        add             x8,  x0,  x1              // x8 = dst row 1
+        lsl             x1,  x1,  #1
+        add             x9,  x2,  x3              // x9 = src row 1
+        lsl             x3,  x3,  #1
+8:
+        ld1             {v0.8h}, [x2], x3
+        ld1             {v1.8h}, [x9], x3
+        subs            w5,  w5,  #2
+        st1             {v0.8h}, [x0], x1
+        st1             {v1.8h}, [x8], x1
+        b.gt            8b
+        ret
+16: // w == 16, one row (32 bytes) per iteration through general purpose registers
+        AARCH64_VALID_JUMP_TARGET
+        ldp             x6,  x7,  [x2]
+        ldp             x8,  x9,  [x2, #16]
+        stp             x6,  x7,  [x0]
+        subs            w5,  w5,  #1
+        stp             x8,  x9,  [x0, #16]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.gt            16b
+        ret
+32: // w == 32, one row (64 bytes) per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ldp             x6,  x7,  [x2]
+        ldp             x8,  x9,  [x2, #16]
+        stp             x6,  x7,  [x0]
+        ldp             x10, x11, [x2, #32]
+        stp             x8,  x9,  [x0, #16]
+        subs            w5,  w5,  #1
+        ldp             x12, x13, [x2, #48]
+        stp             x10, x11, [x0, #32]
+        stp             x12, x13, [x0, #48]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.gt            32b
+        ret
+64: // w == 64, one row (128 bytes) per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ldp             q0,  q1,  [x2]
+        ldp             q2,  q3,  [x2, #32]
+        stp             q0,  q1,  [x0]
+        ldp             q4,  q5,  [x2, #64]
+        stp             q2,  q3,  [x0, #32]
+        ldp             q6,  q7,  [x2, #96]
+        subs            w5,  w5,  #1
+        stp             q4,  q5,  [x0, #64]
+        stp             q6,  q7,  [x0, #96]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.gt            64b
+        ret
+128: // w == 128, one row (256 bytes) per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ldp             q0,  q1,  [x2]
+        ldp             q2,  q3,  [x2, #32]
+        stp             q0,  q1,  [x0]
+        ldp             q4,  q5,  [x2, #64]
+        stp             q2,  q3,  [x0, #32]
+        ldp             q6,  q7,  [x2, #96]
+        subs            w5,  w5,  #1
+        stp             q4,  q5,  [x0, #64]
+        ldp             q16, q17, [x2, #128]
+        stp             q6,  q7,  [x0, #96]
+        ldp             q18, q19, [x2, #160]
+        stp             q16, q17, [x0, #128]
+        ldp             q20, q21, [x2, #192]
+        stp             q18, q19, [x0, #160]
+        ldp             q22, q23, [x2, #224]
+        stp             q20, q21, [x0, #192]
+        stp             q22, q23, [x0, #224]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.gt            128b
+        ret
+
+L(put_tbl): // indexed by clz(w) - 24
+        .hword L(put_tbl) - 128b // w = 128
+        .hword L(put_tbl) -  64b // w = 64
+        .hword L(put_tbl) -  32b // w = 32
+        .hword L(put_tbl) -  16b // w = 16
+        .hword L(put_tbl) -  80b // w = 8
+        .hword L(put_tbl) -   4b // w = 4
+        .hword L(put_tbl) -   2b // w = 2
+endfunc
+
+
+// Unfiltered prep: out = (src << intermediate_bits) - PREP_BIAS. This has got the same signature as the prep_8tap functions,
+// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
+// x8 to w*2. Uses x0 = output (rows stored back to back), x1 = src, x2 = src stride, w4 = h.
+function prep_neon
+        adr             x10, L(prep_tbl)
+        ldrh            w9, [x10, x9, lsl #1]
+        dup             v31.8h,  w7   // intermediate_bits
+        movi            v30.8h,  #(PREP_BIAS >> 8), lsl #8 // v30 = PREP_BIAS in every lane
+        sub             x10, x10, w9, uxtw        // entries hold table address minus target
+        br              x10
+
+40: // w == 4, even and odd rows use separate pointers
+        AARCH64_VALID_JUMP_TARGET
+        add             x9,  x1,  x2              // x9 = src row 1
+        lsl             x2,  x2,  #1
+4:
+        ld1             {v0.d}[0], [x1], x2       // row 0
+        ld1             {v0.d}[1], [x9], x2       // row 1
+        subs            w4,  w4,  #2              // flags are consumed by the b.gt at the loop end
+        sshl            v0.8h,   v0.8h,   v31.8h  // src << intermediate_bits
+        sub             v0.8h,   v0.8h,   v30.8h  // - PREP_BIAS
+        st1             {v0.8h}, [x0], #16
+        b.gt            4b
+        ret
+80: // w == 8, even and odd rows use separate pointers
+        AARCH64_VALID_JUMP_TARGET
+        add             x9,  x1,  x2              // x9 = src row 1
+        lsl             x2,  x2,  #1
+8:
+        ld1             {v0.8h}, [x1], x2
+        ld1             {v1.8h}, [x9], x2
+        subs            w4,  w4,  #2
+        sshl            v0.8h,   v0.8h,   v31.8h
+        sshl            v1.8h,   v1.8h,   v31.8h
+        sub             v0.8h,   v0.8h,   v30.8h
+        sub             v1.8h,   v1.8h,   v30.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        b.gt            8b
+        ret
+16: // w == 16, two rows per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ldp             q0,  q1,  [x1]            // row 0
+        add             x1,  x1,  x2
+        sshl            v0.8h,   v0.8h,   v31.8h
+        ldp             q2,  q3,  [x1]            // row 1
+        add             x1,  x1,  x2
+        subs            w4,  w4,  #2
+        sshl            v1.8h,   v1.8h,   v31.8h
+        sshl            v2.8h,   v2.8h,   v31.8h
+        sshl            v3.8h,   v3.8h,   v31.8h
+        sub             v0.8h,   v0.8h,   v30.8h
+        sub             v1.8h,   v1.8h,   v30.8h
+        sub             v2.8h,   v2.8h,   v30.8h
+        sub             v3.8h,   v3.8h,   v30.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        b.gt            16b
+        ret
+32: // w == 32, one row per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ldp             q0,  q1,  [x1]
+        sshl            v0.8h,   v0.8h,   v31.8h
+        ldp             q2,  q3,  [x1, #32]
+        add             x1,  x1,  x2
+        sshl            v1.8h,   v1.8h,   v31.8h
+        sshl            v2.8h,   v2.8h,   v31.8h
+        sshl            v3.8h,   v3.8h,   v31.8h
+        subs            w4,  w4,  #1
+        sub             v0.8h,   v0.8h,   v30.8h
+        sub             v1.8h,   v1.8h,   v30.8h
+        sub             v2.8h,   v2.8h,   v30.8h
+        sub             v3.8h,   v3.8h,   v30.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        b.gt            32b
+        ret
+64: // w == 64, one row per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ldp             q0,  q1,  [x1]
+        subs            w4,  w4,  #1
+        sshl            v0.8h,   v0.8h,   v31.8h
+        ldp             q2,  q3,  [x1, #32]
+        sshl            v1.8h,   v1.8h,   v31.8h
+        ldp             q4,  q5,  [x1, #64]
+        sshl            v2.8h,   v2.8h,   v31.8h
+        sshl            v3.8h,   v3.8h,   v31.8h
+        ldp             q6,  q7,  [x1, #96]
+        add             x1,  x1,  x2
+        sshl            v4.8h,   v4.8h,   v31.8h
+        sshl            v5.8h,   v5.8h,   v31.8h
+        sshl            v6.8h,   v6.8h,   v31.8h
+        sshl            v7.8h,   v7.8h,   v31.8h
+        sub             v0.8h,   v0.8h,   v30.8h
+        sub             v1.8h,   v1.8h,   v30.8h
+        sub             v2.8h,   v2.8h,   v30.8h
+        sub             v3.8h,   v3.8h,   v30.8h
+        stp             q0,  q1,  [x0]
+        sub             v4.8h,   v4.8h,   v30.8h
+        sub             v5.8h,   v5.8h,   v30.8h
+        stp             q2,  q3,  [x0, #32]
+        sub             v6.8h,   v6.8h,   v30.8h
+        sub             v7.8h,   v7.8h,   v30.8h
+        stp             q4,  q5,  [x0, #64]
+        stp             q6,  q7,  [x0, #96]
+        add             x0,  x0,  x8              // advance the output by one row, x8 = w*2 bytes
+        b.gt            64b
+        ret
+128: // w == 128, one row per iteration
+        AARCH64_VALID_JUMP_TARGET
+        ldp             q0,  q1,  [x1]
+        subs            w4,  w4,  #1
+        sshl            v0.8h,   v0.8h,   v31.8h
+        ldp             q2,  q3,  [x1, #32]
+        sshl            v1.8h,   v1.8h,   v31.8h
+        ldp             q4,  q5,  [x1, #64]
+        sshl            v2.8h,   v2.8h,   v31.8h
+        sshl            v3.8h,   v3.8h,   v31.8h
+        ldp             q6,  q7,  [x1, #96]
+        sshl            v4.8h,   v4.8h,   v31.8h
+        sshl            v5.8h,   v5.8h,   v31.8h
+        ldp             q16, q17, [x1, #128]
+        sshl            v6.8h,   v6.8h,   v31.8h
+        sshl            v7.8h,   v7.8h,   v31.8h
+        ldp             q18, q19, [x1, #160]
+        sshl            v16.8h,  v16.8h,  v31.8h
+        sshl            v17.8h,  v17.8h,  v31.8h
+        ldp             q20, q21, [x1, #192]
+        sshl            v18.8h,  v18.8h,  v31.8h
+        sshl            v19.8h,  v19.8h,  v31.8h
+        ldp             q22, q23, [x1, #224]
+        add             x1,  x1,  x2
+        sshl            v20.8h,  v20.8h,  v31.8h
+        sshl            v21.8h,  v21.8h,  v31.8h
+        sshl            v22.8h,  v22.8h,  v31.8h
+        sshl            v23.8h,  v23.8h,  v31.8h
+        sub             v0.8h,   v0.8h,   v30.8h
+        sub             v1.8h,   v1.8h,   v30.8h
+        sub             v2.8h,   v2.8h,   v30.8h
+        sub             v3.8h,   v3.8h,   v30.8h
+        stp             q0,  q1,  [x0]
+        sub             v4.8h,   v4.8h,   v30.8h
+        sub             v5.8h,   v5.8h,   v30.8h
+        stp             q2,  q3,  [x0, #32]
+        sub             v6.8h,   v6.8h,   v30.8h
+        sub             v7.8h,   v7.8h,   v30.8h
+        stp             q4,  q5,  [x0, #64]
+        sub             v16.8h,  v16.8h,  v30.8h
+        sub             v17.8h,  v17.8h,  v30.8h
+        stp             q6,  q7,  [x0, #96]
+        sub             v18.8h,  v18.8h,  v30.8h
+        sub             v19.8h,  v19.8h,  v30.8h
+        stp             q16, q17, [x0, #128]
+        sub             v20.8h,  v20.8h,  v30.8h
+        sub             v21.8h,  v21.8h,  v30.8h
+        stp             q18, q19, [x0, #160]
+        sub             v22.8h,  v22.8h,  v30.8h
+        sub             v23.8h,  v23.8h,  v30.8h
+        stp             q20, q21, [x0, #192]
+        stp             q22, q23, [x0, #224]
+        add             x0,  x0,  x8              // advance the output by one row, x8 = w*2 bytes
+        b.gt            128b
+        ret
+
+L(prep_tbl): // indexed by clz(w) - 24
+        .hword L(prep_tbl) - 128b // w = 128
+        .hword L(prep_tbl) -  64b // w = 64
+        .hword L(prep_tbl) -  32b // w = 32
+        .hword L(prep_tbl) -  16b // w = 16
+        .hword L(prep_tbl) -  80b // w = 8
+        .hword L(prep_tbl) -  40b // w = 4
+endfunc
+
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 // load lane 0 (element size wd) of d0..d6, alternating between pointers s0 and s1
+        ld1             {\d0\wd}[0], [\s0], \strd // each pointer is post-incremented by strd
+        ld1             {\d1\wd}[0], [\s1], \strd
+.ifnb \d2
+        ld1             {\d2\wd}[0], [\s0], \strd // d2 and d3 are optional, but only as a pair
+        ld1             {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4
+        ld1             {\d4\wd}[0], [\s0], \strd // d4, d5 and d6 are each optional
+.endif
+.ifnb \d5
+        ld1             {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+        ld1             {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 // load whole registers d0..d6 (arrangement wd), alternating between pointers s0 and s1
+        ld1             {\d0\wd}, [\s0], \strd // each pointer is post-incremented by strd
+        ld1             {\d1\wd}, [\s1], \strd
+.ifnb \d2
+        ld1             {\d2\wd}, [\s0], \strd // d2 and d3 are optional, but only as a pair
+        ld1             {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+        ld1             {\d4\wd}, [\s0], \strd // d4, d5 and d6 are each optional
+.endif
+.ifnb \d5
+        ld1             {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+        ld1             {\d6\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5 // load up to three register pairs (arrangement wd), alternating between pointers s0 and s1
+        ld1             {\d0\wd, \d1\wd}, [\s0], \strd // each pointer is post-incremented by strd
+.ifnb \d2
+        ld1             {\d2\wd, \d3\wd}, [\s1], \strd // optional second pair
+.endif
+.ifnb \d4
+        ld1             {\d4\wd, \d5\wd}, [\s0], \strd // optional third pair
+.endif
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_slice with 32 bit lanes
+        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_reg with 4 halfwords per register
+        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_reg with 8 halfwords per register
+        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5 // load_regpair with 8 halfwords per register, 16 per pair
+        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
+.endm
+.macro interleave_1 wd, r0, r1, r2, r3, r4 // trn1 each register with its successor, in place; the last register given is only read
+        trn1            \r0\wd, \r0\wd, \r1\wd // r1 is still unmodified here, it is rewritten on the next line
+        trn1            \r1\wd, \r1\wd, \r2\wd
+.ifnb \r3
+        trn1            \r2\wd, \r2\wd, \r3\wd // r3 and r4 are optional, but only as a pair
+        trn1            \r3\wd, \r3\wd, \r4\wd
+.endif
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4 // interleave_1 on 2s lanes: each register becomes its own lane 0 followed by lane 0 of the next
+        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro umin_h c, wd, r0, r1, r2, r3 // in-place unsigned min of r0..r3 against register c (arrangement wd)
+        umin            \r0\wd,  \r0\wd,  \c\wd
+.ifnb \r1
+        umin            \r1\wd,  \r1\wd,  \c\wd // r1 is optional
+.endif
+.ifnb \r2
+        umin            \r2\wd,  \r2\wd,  \c\wd // r2 and r3 are optional, but only as a pair
+        umin            \r3\wd,  \r3\wd,  \c\wd
+.endif
+.endm
+.macro sub_h c, wd, r0, r1, r2, r3 // in-place subtraction of register c from r0..r3 (arrangement wd)
+        sub             \r0\wd,  \r0\wd,  \c\wd
+.ifnb \r1
+        sub             \r1\wd,  \r1\wd,  \c\wd // r1 is optional
+.endif
+.ifnb \r2
+        sub             \r2\wd,  \r2\wd,  \c\wd // r2 and r3 are optional, but only as a pair
+        sub             \r3\wd,  \r3\wd,  \c\wd
+.endif
+.endm
+.macro smull_smlal_4 d, s0, s1, s2, s3 // d.4s = 4-tap sum over the low 4 halfwords of s0..s3, coefficients taken from v0.h[0..3]
+        smull           \d\().4s,  \s0\().4h,  v0.h[0] // the first product initialises the accumulator
+        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
+        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
+        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
+.endm
+.macro smull2_smlal2_4 d, s0, s1, s2, s3 // d.4s = 4-tap sum over the high 4 halfwords of s0..s3, coefficients taken from v0.h[0..3]
+        smull2          \d\().4s,  \s0\().8h,  v0.h[0] // the first product initialises the accumulator
+        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
+        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
+        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
+.endm
+.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 // d.4s = 8-tap sum over the low 4 halfwords of s0..s7, coefficients taken from v0.h[0..7]
+        smull           \d\().4s,  \s0\().4h,  v0.h[0] // the first product initialises the accumulator
+        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
+        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
+        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
+        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
+        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
+        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
+        smlal           \d\().4s,  \s7\().4h,  v0.h[7]
+.endm
+.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7 // d.4s = 8-tap sum over the high 4 halfwords of s0..s7, coefficients taken from v0.h[0..7]
+        smull2          \d\().4s,  \s0\().8h,  v0.h[0] // the first product initialises the accumulator
+        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
+        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
+        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
+        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
+        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
+        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
+        smlal2          \d\().4s,  \s7\().8h,  v0.h[7]
+.endm
+.macro sqrshrun_h shift, r0, r1, r2, r3 // rounding right shift by an immediate with unsigned saturating narrow from 32 to 16 bit, in place
+        sqrshrun        \r0\().4h, \r0\().4s,  #\shift // low half of r0 from r0
+.ifnb \r1
+        sqrshrun2       \r0\().8h, \r1\().4s,  #\shift // high half of r0 from r1
+.endif
+.ifnb \r2
+        sqrshrun        \r2\().4h, \r2\().4s,  #\shift // r2 and r3 are optional as a pair, result in r2
+        sqrshrun2       \r2\().8h, \r3\().4s,  #\shift
+.endif
+.endm
+.macro xtn_h r0, r1, r2, r3 // truncating narrow from 32 to 16 bit: r0 = r0 and r1 combined, r2 = r2 and r3 combined
+        uzp1            \r0\().8h,  \r0\().8h,  \r1\().8h // Same as xtn, xtn2
+.ifnb \r2
+        uzp1            \r2\().8h,  \r2\().8h,  \r3\().8h // Ditto
+.endif
+.endm
+.macro srshl_s shift, r0, r1, r2, r3 // in-place rounding shift of the 32 bit lanes of r0..r3 by the signed per-lane counts in register shift
+        srshl           \r0\().4s,  \r0\().4s,  \shift\().4s // negative counts shift right
+        srshl           \r1\().4s,  \r1\().4s,  \shift\().4s
+.ifnb \r2
+        srshl           \r2\().4s,  \r2\().4s,  \shift\().4s // r2 and r3 are optional, but only as a pair
+        srshl           \r3\().4s,  \r3\().4s,  \shift\().4s
+.endif
+.endm
+.macro st_s strd, reg, lanes // store 32 bit lanes of reg alternately through x0 and x9, post-incrementing each by strd
+        st1             {\reg\().s}[0], [x0], \strd
+        st1             {\reg\().s}[1], [x9], \strd
+.if \lanes > 2
+        st1             {\reg\().s}[2], [x0], \strd // lanes 2 and 3 are only stored when lanes > 2
+        st1             {\reg\().s}[3], [x9], \strd
+.endif
+.endm
+.macro st_d strd, r0, r1 // store the 64 bit halves of r0 (and r1) alternately through x0 and x9, post-incrementing each by strd
+        st1             {\r0\().d}[0], [x0], \strd
+        st1             {\r0\().d}[1], [x9], \strd
+.ifnb \r1
+        st1             {\r1\().d}[0], [x0], \strd // r1 is optional
+        st1             {\r1\().d}[1], [x9], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, r0, r1, r2, r3 // scale the 32 bit sums in r0..r3 down to 16 bit and store them as 4 pixel rows via st_d
+.ifc \type, put
+        sqrshrun_h      6,   \r0, \r1, \r2, \r3  // put: rounding shift by 6 and narrow, results in r0 and r2
+        umin_h          v31, .8h, \r0, \r2       // clamp to v31 (set to bitdepth_max by the 8tap entry code)
+.else
+        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
+        xtn_h           \r0, \r1, \r2, \r3
+        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
+.endif
+        st_d            \strd, \r0, \r2          // each 64 bit half is one row, alternating between x0 and x9
+.endm
+.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 // store 2, 4 or 8 whole registers (arrangement wd) alternately through x0 and x9
+        st1             {\r0\wd}, [x0], \strd // each pointer is post-incremented by strd
+        st1             {\r1\wd}, [x9], \strd
+.ifnb \r2
+        st1             {\r2\wd}, [x0], \strd // r2 and r3 are optional, but only as a pair
+        st1             {\r3\wd}, [x9], \strd
+.endif
+.ifnb \r4
+        st1             {\r4\wd}, [x0], \strd // r4..r7 are optional, but only all four together
+        st1             {\r5\wd}, [x9], \strd
+        st1             {\r6\wd}, [x0], \strd
+        st1             {\r7\wd}, [x9], \strd
+.endif
+.endm
+.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7 // st_reg with 8 halfwords per register
+        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro shift_store_8 type, strd, r0, r1, r2, r3 // scale the 32 bit sums in r0..r3 down to 16 bit and store them as two 8 pixel rows via st_8h
+.ifc \type, put
+        sqrshrun_h      6,   \r0, \r1, \r2, \r3  // put: rounding shift by 6 and narrow, results in r0 and r2
+        umin_h          v31, .8h, \r0, \r2       // clamp to v31 (set to bitdepth_max by the 8tap entry code)
+.else
+        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
+        xtn_h           \r0, \r1, \r2, \r3
+        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
+.endif
+        st_8h           \strd, \r0, \r2          // r0 goes through x0, r2 through x9
+.endm
+.macro shift_store_16 type, strd, dst, r0, r1, r2, r3 // scale the 32 bit sums in r0..r3 down to 16 pixels and store them as one row through dst
+.ifc \type, put
+        sqrshrun_h      6,   \r0, \r1, \r2, \r3  // put: rounding shift by 6 and narrow, results in r0 and r2
+        umin            \r0\().8h, \r0\().8h, v31.8h // clamp to v31 (set to bitdepth_max by the 8tap entry code)
+        umin            \r1\().8h, \r2\().8h, v31.8h // second half moves from r2 into r1 so the store can use a register pair
+.else
+        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
+        xtn_h           \r0, \r1, \r2, \r3
+        sub             \r0\().8h, \r0\().8h, v29.8h // v29 is used as PREP_BIAS by the sibling shift_store macros
+        sub             \r1\().8h, \r2\().8h, v29.8h // second half moves from r2 into r1 so the store can use a register pair
+.endif
+        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
+.endm
+
+// make_8tap_fn: emit one exported entry stub named
+// <op>_8tap_<type>_16bpc_neon. The stub loads the horizontal filter type
+// constant into w9 and the vertical one into w10, then tail-branches to
+// the shared <op>_8tap_neon implementation.
+//   op      - function family prefix (filter_fn passes its type argument)
+//   type    - filter combination name used in the symbol
+//   type_h  - constant for the horizontal filter (REGULAR/SMOOTH/SHARP)
+//   type_v  - constant for the vertical filter
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_16bpc_neon, export=1
+        mov             w9,  \type_h
+        mov             w10, \type_v
+        b               \op\()_8tap_neon
+endfunc
+.endm
+
+// Filter type constants passed to the 8tap code in w9/w10. Each packs two
+// row offsets into the mc_subpel_filters table, both multiples of 15:
+//   bits 7 and up - offset used when the filtered dimension is above 4
+//   bits 0-6      - offset used when the filtered dimension is 4 or less
+// The subpel position is added to both fields before the table lookup.
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH  ((1*15<<7)|4*15)
+#define SHARP   ((2*15<<7)|3*15)
+
+// filter_fn: emits the nine exported 8tap entry stubs for one function
+// family (type is compared against put and prep below) followed by the
+// shared implementation. All parameters after type are register names.
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
+make_8tap_fn \type, regular,        REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
+make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
+make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
+make_8tap_fn \type, sharp,          SHARP,   SHARP
+make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
+make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
+
+// Shared implementation. Entered from the stubs with w9 = horizontal and
+// w10 = vertical filter type constant.
+function \type\()_8tap_neon
+.ifc \bdmax, w8
+        // When bdmax is assigned to w8 the value is fetched from the stack.
+        ldr             w8,  [sp]
+.endif
+        // Replicate each subpel position into three 7-bit fields, then add
+        // the type constant so the two low fields become table row indices.
+        // The top field keeps the plain position for the zero tests below.
+        mov             w11,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
+        mul             \mx,  \mx, w11
+        mul             \my,  \my, w11
+        add             \mx,  \mx, w9  // mx, 8tap_h, 4tap_h
+        add             \my,  \my, w10 // my, 8tap_v, 4tap_v
+.ifc \type, prep
+        // prep derives its output stride from the width: d_strd = 2 * w.
+        uxtw            \d_strd, \w
+        lsl             \d_strd, \d_strd, #1
+.endif
+
+        dup             v31.8h,  \bdmax        // bitdepth_max
+        clz             \bdmax,  \bdmax
+        // w9 becomes the jump table index clz(w) - 24: 0 for w = 128 up to
+        // 6 for w = 2, matching the order of the tables below.
+        clz             w9,  \w
+        sub             \bdmax,  \bdmax,  #18  // intermediate_bits = clz(bitdepth_max) - 18
+        mov             w12, #6
+        // Nonzero top field of mx means a horizontal filter is required.
+        // None of the instructions up to the b.ne below modify the flags.
+        tst             \mx, #(0x7f << 14)
+        sub             w9,  w9,  #24
+        add             w13, w12, \bdmax       // 6 + intermediate_bits
+        sub             w12, w12, \bdmax       // 6 - intermediate_bits
+        // x11 = table base minus one 8-byte row; presumably because the
+        // subpel positions that reach the lookup start at 1.
+        movrel          x11, X(mc_subpel_filters), -8
+        b.ne            L(\type\()_8tap_h)
+        tst             \my, #(0x7f << 14)
+        b.ne            L(\type\()_8tap_v)
+        // Neither direction needs filtering: hand over to the routine
+        // named <type>_neon, which is defined outside this macro.
+        b               \type\()_neon
+
+// Horizontal filtering is needed. Select the filter row, divert to the
+// combined hv path when the vertical position is nonzero as well, and
+// otherwise jump to the width specific horizontal-only routine.
+L(\type\()_8tap_h):
+        cmp             \w,   #4
+        ubfx            w10,  \mx, #7, #7      // row index for w > 4
+        and             \mx,  \mx, #0x7f       // row index for w <= 4
+        b.le            4f
+        mov             \mx,  w10
+4:
+        tst             \my,  #(0x7f << 14)
+        // xmx = address of the selected 8-byte filter row. The add leaves
+        // the flags of the tst intact for the branch below.
+        add             \xmx, x11, \mx, uxtw #3
+        b.ne            L(\type\()_8tap_hv)
+
+        // Fetch the 16-bit offset for this width (index in x9) and set up
+        // the shift and bias constants while the load is in flight.
+        adr             x10, L(\type\()_8tap_h_tbl)
+        dup             v30.4s,  w12           // 6 - intermediate_bits
+        ldrh            w9,  [x10, x9, lsl #1]
+        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
+.ifc \type, put
+        dup             v29.8h,  \bdmax        // intermediate_bits
+.else
+        movi            v28.8h,  #(PREP_BIAS >> 8), lsl #8
+.endif
+        // Table entries are distances from the table back to the target.
+        sub             x10, x10, w9, uxtw
+.ifc \type, put
+        neg             v29.8h,  v29.8h        // -intermediate_bits
+.endif
+        br              x10
+
+// Horizontal-only filter for width 2. The body exists for put only and
+// handles two rows per iteration with the middle four filter taps.
+// Expects v30 = -(6-intermediate_bits), v29 = -intermediate_bits and
+// v31 = bitdepth_max.
+20:     // 2xN h
+        AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+        // Skip the first two coefficients and load the next four bytes.
+        add             \xmx,  \xmx,  #2
+        ld1             {v0.s}[0], [\xmx]
+        // Step one 16-bit pixel to the left of the output position.
+        sub             \src,  \src,  #2
+        // ds2/sr2 address the second row; strides are doubled because
+        // each pointer advances two rows per iteration.
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h,   v0.8b
+2:
+        ld1             {v4.8h},  [\src], \s_strd
+        ld1             {v6.8h},  [\sr2], \s_strd
+        ext             v5.16b,  v4.16b,  v4.16b,  #2
+        ext             v7.16b,  v6.16b,  v6.16b,  #2
+        subs            \h,  \h,  #2
+        // Pair up pixels of both rows so that one 4-lane multiply chain
+        // yields two outputs for the first row and two for the second.
+        trn1            v3.2s,   v4.2s,   v6.2s
+        trn2            v6.2s,   v4.2s,   v6.2s
+        trn1            v4.2s,   v5.2s,   v7.2s
+        trn2            v7.2s,   v5.2s,   v7.2s
+        smull           v3.4s,   v3.4h,   v0.h[0]
+        smlal           v3.4s,   v4.4h,   v0.h[1]
+        smlal           v3.4s,   v6.4h,   v0.h[2]
+        smlal           v3.4s,   v7.4h,   v0.h[3]
+        srshl           v3.4s,   v3.4s,   v30.4s // -(6-intermediate_bits)
+        sqxtun          v3.4h,   v3.4s
+        srshl           v3.4h,   v3.4h,   v29.4h // -intermediate_bits
+        umin            v3.4h,   v3.4h,   v31.4h
+        st1             {v3.s}[0], [\dst], \d_strd
+        st1             {v3.s}[1], [\ds2], \d_strd
+        // Flags still come from the subs above.
+        b.gt            2b
+        ret
+.endif
+
+// Horizontal-only filter for width 4, two rows per iteration, using the
+// middle four filter taps. Expects v30 = -(6-intermediate_bits) plus, for
+// put, v29 = -intermediate_bits and v31 = bitdepth_max, or for prep,
+// v28 = PREP_BIAS.
+40:     // 4xN h
+        AARCH64_VALID_JUMP_TARGET
+        // Skip the first two coefficients and load the next four bytes.
+        add             \xmx,  \xmx,  #2
+        ld1             {v0.s}[0], [\xmx]
+        // Step one 16-bit pixel to the left of the output position.
+        sub             \src,  \src,  #2
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h,   v0.8b
+4:
+        ld1             {v16.8h}, [\src], \s_strd
+        ld1             {v20.8h}, [\sr2], \s_strd
+        // Build the inputs shifted by one, two and three pixels.
+        ext             v17.16b, v16.16b, v16.16b, #2
+        ext             v18.16b, v16.16b, v16.16b, #4
+        ext             v19.16b, v16.16b, v16.16b, #6
+        ext             v21.16b, v20.16b, v20.16b, #2
+        ext             v22.16b, v20.16b, v20.16b, #4
+        ext             v23.16b, v20.16b, v20.16b, #6
+        subs            \h,  \h,  #2
+        smull           v16.4s,  v16.4h,  v0.h[0]
+        smlal           v16.4s,  v17.4h,  v0.h[1]
+        smlal           v16.4s,  v18.4h,  v0.h[2]
+        smlal           v16.4s,  v19.4h,  v0.h[3]
+        smull           v20.4s,  v20.4h,  v0.h[0]
+        smlal           v20.4s,  v21.4h,  v0.h[1]
+        smlal           v20.4s,  v22.4h,  v0.h[2]
+        smlal           v20.4s,  v23.4h,  v0.h[3]
+        srshl           v16.4s,  v16.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v20.4s,  v20.4s,  v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+        sqxtun          v16.4h,  v16.4s
+        sqxtun2         v16.8h,  v20.4s
+        srshl           v16.8h,  v16.8h,  v29.8h // -intermediate_bits
+        umin            v16.8h,  v16.8h,  v31.8h
+.else
+        uzp1            v16.8h,  v16.8h,  v20.8h // Same as xtn, xtn2
+        sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
+.endif
+        // Low half is the first row, high half the second row.
+        st1             {v16.d}[0], [\dst], \d_strd
+        st1             {v16.d}[1], [\ds2], \d_strd
+        // Flags still come from the subs above.
+        b.gt            4b
+        ret
+
+// Horizontal-only filter for widths of 8 and above, all eight taps.
+// Two rows are processed per outer iteration, eight pixels per inner
+// iteration. mx is reused as the remaining-width counter.
+80:
+160:
+320:
+640:
+1280:   // 8xN, 16xN, 32xN, ... h
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v0.8b}, [\xmx]
+        // Step three 16-bit pixels to the left of the output position.
+        sub             \src,  \src,  #6
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h,   v0.8b
+
+        // Each row reads 2*w + 16 bytes in total (32 up front, 16 for every
+        // further block of eight), so turn the doubled source stride into
+        // the remaining distance to the row after next.
+        sub             \s_strd,  \s_strd,  \w, uxtw #1
+        sub             \s_strd,  \s_strd,  #16
+.ifc \type, put
+        // Same adjustment for the destination, which writes 2*w bytes per
+        // row. For prep d_strd is left at the value set on function entry.
+        lsl             \d_strd,  \d_strd,  #1
+        sub             \d_strd,  \d_strd,  \w, uxtw #1
+.endif
+81:
+        ld1             {v16.8h, v17.8h},  [\src], #32
+        ld1             {v20.8h, v21.8h},  [\sr2], #32
+        mov             \mx, \w
+
+8:
+        smull           v18.4s,  v16.4h,  v0.h[0]
+        smull2          v19.4s,  v16.8h,  v0.h[0]
+        smull           v22.4s,  v20.4h,  v0.h[0]
+        smull2          v23.4s,  v20.8h,  v0.h[0]
+.irpc i, 1234567
+        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
+        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
+        smlal           v18.4s,  v24.4h,  v0.h[\i]
+        smlal2          v19.4s,  v24.8h,  v0.h[\i]
+        smlal           v22.4s,  v25.4h,  v0.h[\i]
+        smlal2          v23.4s,  v25.8h,  v0.h[\i]
+.endr
+        subs            \mx, \mx, #8
+        srshl           v18.4s,  v18.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v19.4s,  v19.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v22.4s,  v22.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v23.4s,  v23.4s,  v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+        sqxtun          v18.4h,  v18.4s
+        sqxtun2         v18.8h,  v19.4s
+        sqxtun          v22.4h,  v22.4s
+        sqxtun2         v22.8h,  v23.4s
+        srshl           v18.8h,  v18.8h,  v29.8h // -intermediate_bits
+        srshl           v22.8h,  v22.8h,  v29.8h // -intermediate_bits
+        umin            v18.8h,  v18.8h,  v31.8h
+        umin            v22.8h,  v22.8h,  v31.8h
+.else
+        uzp1            v18.8h,  v18.8h,  v19.8h // Same as xtn, xtn2
+        uzp1            v22.8h,  v22.8h,  v23.8h // Ditto
+        sub             v18.8h,  v18.8h,  v28.8h // PREP_BIAS
+        sub             v22.8h,  v22.8h,  v28.8h // PREP_BIAS
+.endif
+        st1             {v18.8h}, [\dst], #16
+        st1             {v22.8h}, [\ds2], #16
+        // Flags come from the subs on mx above: row pair finished?
+        b.le            9f
+
+        // Slide the window: the upper register becomes the lower one and
+        // eight new pixels are loaded per row.
+        mov             v16.16b, v17.16b
+        mov             v20.16b, v21.16b
+        ld1             {v17.8h}, [\src], #16
+        ld1             {v21.8h}, [\sr2], #16
+        b               8b
+
+9:
+        // Advance all four pointers to the next pair of rows.
+        add             \dst,  \dst,  \d_strd
+        add             \ds2,  \ds2,  \d_strd
+        add             \src,  \src,  \s_strd
+        add             \sr2,  \sr2,  \s_strd
+
+        subs            \h,  \h,  #2
+        b.gt            81b
+        ret
+
+// Jump table for the horizontal-only path. Entry i is the distance from
+// the table back to the routine for index i = clz(w) - 24, so the widest
+// block comes first. The dispatcher subtracts the entry from the table
+// address.
+L(\type\()_8tap_h_tbl):
+        .hword L(\type\()_8tap_h_tbl) - 1280b
+        .hword L(\type\()_8tap_h_tbl) -  640b
+        .hword L(\type\()_8tap_h_tbl) -  320b
+        .hword L(\type\()_8tap_h_tbl) -  160b
+        .hword L(\type\()_8tap_h_tbl) -   80b
+        .hword L(\type\()_8tap_h_tbl) -   40b
+        .hword L(\type\()_8tap_h_tbl) -   20b
+        .hword 0
+
+
+// Vertical-only filtering. Select the filter row depending on the height
+// and jump to the width specific routine. The flags of the cmp against 4
+// are not modified before the br, and the routines branch on them.
+L(\type\()_8tap_v):
+        cmp             \h,  #4
+        ubfx            w10, \my, #7, #7       // row index for h > 4
+        and             \my, \my, #0x7f        // row index for h <= 4
+        b.le            4f
+        mov             \my, w10
+4:
+        // xmy = address of the selected 8-byte filter row.
+        add             \xmy, x11, \my, uxtw #3
+
+.ifc \type, prep
+        dup             v30.4s,  w12           // 6 - intermediate_bits
+        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
+.endif
+        adr             x10, L(\type\()_8tap_v_tbl)
+        ldrh            w9,  [x10, x9, lsl #1]
+.ifc \type, prep
+        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
+.endif
+        // Table entries are distances from the table back to the target.
+        sub             x10, x10, w9, uxtw
+        br              x10
+
+// Vertical-only filter for width 2; the body exists for put only.
+// Heights up to 4 use the middle four taps, larger heights all eight.
+// Rows are fetched alternately through src and sr2, both with doubled
+// stride. load_s, interleave_1_s, smull_smlal_*, sqrshrun_h, umin_h and
+// st_s are helper macros defined earlier in the file.
+20:     // 2xN v
+        AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+        // Flags from the cmp of h against 4 in the dispatcher.
+        b.gt            28f
+
+        // Flags of this compare are consumed by the b.gt 24f further down.
+        cmp             \h,  #2
+        add             \xmy, \xmy, #2
+        ld1             {v0.s}[0], [\xmy]
+        // Start one row above the output row.
+        sub             \src,  \src,  \s_strd
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        sxtl            v0.8h,   v0.8b
+
+        // 2x2 v
+        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+        interleave_1_s  v1,  v2,  v3,  v4,  v5
+        b.gt            24f
+        smull_smlal_4   v6,  v1,  v2,  v3,  v4
+        sqrshrun_h      6,   v6
+        umin_h          v31, .8h, v6
+        st_s            \d_strd, v6, 2
+        ret
+
+24:     // 2x4 v
+        load_s          \sr2, \src, \s_strd, v6, v7
+        interleave_1_s  v5,  v6,  v7
+        smull_smlal_4   v16, v1,  v2,  v3,  v4
+        smull_smlal_4   v17, v3,  v4,  v5,  v6
+        sqrshrun_h      6,   v16, v17
+        umin_h          v31, .8h, v16
+        st_s            \d_strd, v16, 4
+        ret
+
+28:     // 2x6, 2x8, 2x12, 2x16 v
+        ld1             {v0.8b}, [\xmy]
+        // sr2 starts two rows and src three rows above the output row.
+        sub             \sr2,  \src,  \s_strd, lsl #1
+        add             \ds2,  \dst,  \d_strd
+        sub             \src,  \sr2,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        sxtl            v0.8h,   v0.8b
+
+        load_s          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
+        interleave_1_s  v1,  v2,  v3,  v4,  v5
+        interleave_1_s  v5,  v6,  v7
+216:
+        // Four output rows per iteration.
+        subs            \h,  \h,  #4
+        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
+        interleave_1_s  v7,  v16, v17, v18, v19
+        smull_smlal_8   v24, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
+        smull_smlal_8   v25, v3,  v4,  v5,  v6,  v7,  v16, v17, v18
+        sqrshrun_h      6,   v24, v25
+        umin_h          v31, .8h, v24
+        st_s            \d_strd, v24, 4
+        b.le            0f
+        // Exactly two rows left are handled by the tail at 26. The moves
+        // below do not touch the flags of this compare.
+        cmp             \h,  #2
+        mov             v1.16b,  v5.16b
+        mov             v2.16b,  v6.16b
+        mov             v3.16b,  v7.16b
+        mov             v4.16b,  v16.16b
+        mov             v5.16b,  v17.16b
+        mov             v6.16b,  v18.16b
+        mov             v7.16b,  v19.16b
+        b.eq            26f
+        b               216b
+26:
+        // Tail: the final two rows.
+        load_s          \sr2, \src, \s_strd, v16, v17
+        interleave_1_s  v7,  v16, v17
+        smull_smlal_8   v24, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
+        sqrshrun_h      6,   v24
+        umin_h          v31, .4h, v24
+        st_s            \d_strd, v24, 2
+0:
+        ret
+.endif
+
+// Vertical-only filter for width 4. Heights up to 4 use the middle four
+// taps, larger heights all eight. Output goes through shift_store_4,
+// which stores via the hardcoded pointers x0 and x9, so dst and ds2 are
+// presumably bound to those registers.
+40:
+        AARCH64_VALID_JUMP_TARGET
+        // Flags from the cmp of h against 4 in the dispatcher.
+        b.gt            480f
+
+        // 4x2, 4x4 v
+        // Flags of this compare are consumed by the b.le 0f below; this
+        // relies on the helper macros in between leaving them untouched.
+        cmp             \h,  #2
+        add             \xmy, \xmy, #2
+        ld1             {v0.s}[0], [\xmy]
+        // Start one row above the output row.
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+
+        load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+        smull_smlal_4   v6,  v1,  v2,  v3,  v4
+        smull_smlal_4   v7,  v2,  v3,  v4,  v5
+        shift_store_4   \type, \d_strd, v6, v7
+        b.le            0f
+        // h == 4: two more input rows give output rows three and four.
+        load_4h         \sr2, \src, \s_strd, v6, v7
+        smull_smlal_4   v1,  v3,  v4,  v5,  v6
+        smull_smlal_4   v2,  v4,  v5,  v6,  v7
+        shift_store_4   \type, \d_strd, v1, v2
+0:
+        ret
+
+480:    // 4x6, 4x8, 4x12, 4x16 v
+        ld1             {v0.8b}, [\xmy]
+        // sr2 starts two rows and src three rows above the output row.
+        sub             \sr2, \src, \s_strd, lsl #1
+        add             \ds2, \dst, \d_strd
+        sub             \src, \sr2, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+
+        load_4h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+
+48:
+        // Four output rows per iteration.
+        subs            \h,  \h,  #4
+        load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
+        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
+        smull_smlal_8   v2,  v17, v18, v19, v20, v21, v22, v23, v24
+        smull_smlal_8   v3,  v18, v19, v20, v21, v22, v23, v24, v25
+        smull_smlal_8   v4,  v19, v20, v21, v22, v23, v24, v25, v26
+        shift_store_4   \type, \d_strd, v1, v2, v3, v4
+        b.le            0f
+        // Exactly two rows left are handled by the tail at 46. The moves
+        // below do not touch the flags of this compare.
+        cmp             \h,  #2
+        mov             v16.8b,  v20.8b
+        mov             v17.8b,  v21.8b
+        mov             v18.8b,  v22.8b
+        mov             v19.8b,  v23.8b
+        mov             v20.8b,  v24.8b
+        mov             v21.8b,  v25.8b
+        mov             v22.8b,  v26.8b
+        b.eq            46f
+        b               48b
+46:
+        // Tail: the final two rows.
+        load_4h         \sr2, \src, \s_strd, v23, v24
+        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
+        smull_smlal_8   v2,  v17, v18, v19, v20, v21, v22, v23, v24
+        shift_store_4   \type, \d_strd, v1, v2
+0:
+        ret
+
+// Vertical-only filter for width 8. Heights up to 4 are handled here
+// with the middle four taps; larger heights go to the strip loop at 880.
+80:
+        AARCH64_VALID_JUMP_TARGET
+        // Flags from the cmp of h against 4 in the dispatcher.
+        b.gt            880f
+
+        // 8x2, 8x4 v
+        // Flags of this compare are consumed by the b.le 0f below; this
+        // relies on the helper macros in between leaving them untouched.
+        cmp             \h,  #2
+        add             \xmy, \xmy, #2
+        ld1             {v0.s}[0], [\xmy]
+        // Start one row above the output row.
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+
+        load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+        // Low and high halves of each row are accumulated separately.
+        smull_smlal_4   v16, v1,  v2,  v3,  v4
+        smull2_smlal2_4 v17, v1,  v2,  v3,  v4
+        smull_smlal_4   v18, v2,  v3,  v4,  v5
+        smull2_smlal2_4 v19, v2,  v3,  v4,  v5
+        shift_store_8   \type, \d_strd, v16, v17, v18, v19
+        b.le            0f
+        // h == 4: two more input rows give output rows three and four.
+        load_8h         \sr2, \src, \s_strd, v6, v7
+        smull_smlal_4   v16, v3,  v4,  v5,  v6
+        smull2_smlal2_4 v17, v3,  v4,  v5,  v6
+        smull_smlal_4   v18, v4,  v5,  v6,  v7
+        smull2_smlal2_4 v19, v4,  v5,  v6,  v7
+        shift_store_8   \type, \d_strd, v16, v17, v18, v19
+0:
+        ret
+
+// Vertical-only filter with all eight taps for every remaining size.
+// The block is processed in strips of eight pixels; inside a strip the
+// loop is unrolled to produce two times two rows. my keeps a copy of h
+// so that the height can be restored for the next strip.
+880:    // 8x6, 8x8, 8x16, 8x32 v
+1680:   // 16x8, 16x16, ...
+320:    // 32x8, 32x16, ...
+640:
+1280:
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v0.8b}, [\xmy]
+        // Start three rows above the output row.
+        sub             \src, \src, \s_strd
+        sub             \src, \src, \s_strd, lsl #1
+        sxtl            v0.8h,   v0.8b
+        mov             \my,  \h
+168:
+        // Per strip setup; strides are doubled here and halved again at 9.
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        load_8h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+
+88:
+        subs            \h,  \h,  #2
+        load_8h         \sr2, \src, \s_strd, v23, v24
+        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
+        smull2_smlal2_8 v2,  v16, v17, v18, v19, v20, v21, v22, v23
+        smull_smlal_8   v3,  v17, v18, v19, v20, v21, v22, v23, v24
+        smull2_smlal2_8 v4,  v17, v18, v19, v20, v21, v22, v23, v24
+        shift_store_8   \type, \d_strd, v1, v2, v3, v4
+        b.le            9f
+        subs            \h,  \h,  #2
+        load_8h         \sr2, \src, \s_strd, v25, v26
+        smull_smlal_8   v1,  v18, v19, v20, v21, v22, v23, v24, v25
+        smull2_smlal2_8 v2,  v18, v19, v20, v21, v22, v23, v24, v25
+        smull_smlal_8   v3,  v19, v20, v21, v22, v23, v24, v25, v26
+        smull2_smlal2_8 v4,  v19, v20, v21, v22, v23, v24, v25, v26
+        shift_store_8   \type, \d_strd, v1, v2, v3, v4
+        b.le            9f
+        // Keep the last seven input rows for the next four output rows.
+        mov             v16.16b, v20.16b
+        mov             v17.16b, v21.16b
+        mov             v18.16b, v22.16b
+        mov             v19.16b, v23.16b
+        mov             v20.16b, v24.16b
+        mov             v21.16b, v25.16b
+        mov             v22.16b, v26.16b
+        b               88b
+9:
+        subs            \w,  \w,  #8
+        b.le            0f
+        asr             \s_strd, \s_strd, #1
+        asr             \d_strd, \d_strd, #1
+        // Rewind to the top of the block: xmy is used as the saved height
+        // here, which assumes my and xmy name the same register. src goes
+        // back by that many rows plus eight, dst by that many rows.
+        msub            \src, \s_strd, \xmy, \src
+        msub            \dst, \d_strd, \xmy, \dst
+        sub             \src, \src, \s_strd, lsl #3
+        mov             \h,  \my
+        // Step eight 16-bit pixels to the right.
+        add             \src, \src, #16
+        add             \dst, \dst, #16
+        b               168b
+0:
+        ret
+
+// Vertical-only filter for width 16. Heights up to 4 are handled here
+// with the middle four taps, one output row per iteration; larger
+// heights use the strip loop at 1680. Each row occupies a register pair.
+160:
+        AARCH64_VALID_JUMP_TARGET
+        // Flags from the cmp of h against 4 in the dispatcher.
+        b.gt            1680b
+
+        // 16x2, 16x4 v
+        add             \xmy, \xmy, #2
+        ld1             {v0.s}[0], [\xmy]
+        // Start one row above the output row.
+        sub             \src, \src, \s_strd
+        sxtl            v0.8h,   v0.8b
+
+        // A single source pointer is used, so src is passed twice.
+        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
+16:
+        load_16h        \src, \src, \s_strd, v22, v23
+        subs            \h,  \h,  #1
+        smull_smlal_4   v1,  v16, v18, v20, v22
+        smull2_smlal2_4 v2,  v16, v18, v20, v22
+        smull_smlal_4   v3,  v17, v19, v21, v23
+        smull2_smlal2_4 v4,  v17, v19, v21, v23
+        // The destination is given as x0 directly rather than by its
+        // macro parameter name.
+        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
+        b.le            0f
+        // Drop the oldest row and shift the remaining three up.
+        mov             v16.16b, v18.16b
+        mov             v17.16b, v19.16b
+        mov             v18.16b, v20.16b
+        mov             v19.16b, v21.16b
+        mov             v20.16b, v22.16b
+        mov             v21.16b, v23.16b
+        b               16b
+0:
+        ret
+
+// Jump table for the vertical-only path. Entry i is the distance from
+// the table back to the routine for index i = clz(w) - 24, so the widest
+// block comes first. The dispatcher subtracts the entry from the table
+// address.
+L(\type\()_8tap_v_tbl):
+        .hword L(\type\()_8tap_v_tbl) - 1280b
+        .hword L(\type\()_8tap_v_tbl) -  640b
+        .hword L(\type\()_8tap_v_tbl) -  320b
+        .hword L(\type\()_8tap_v_tbl) -  160b
+        .hword L(\type\()_8tap_v_tbl) -   80b
+        .hword L(\type\()_8tap_v_tbl) -   40b
+        .hword L(\type\()_8tap_v_tbl) -   20b
+        .hword 0
+
+// Combined horizontal and vertical filtering. xmx was already set by the
+// horizontal dispatcher; select the vertical filter row here and jump to
+// the width specific routine. The flags of the cmp against 4 are not
+// modified before the br, and the routines branch on them.
+L(\type\()_8tap_hv):
+        cmp             \h,  #4
+        ubfx            w10, \my, #7, #7       // row index for h > 4
+        and             \my, \my, #0x7f        // row index for h <= 4
+        b.le            4f
+        mov             \my,  w10
+4:
+        // xmy = address of the selected 8-byte filter row.
+        add             \xmy, x11, \my, uxtw #3
+
+        adr             x10, L(\type\()_8tap_hv_tbl)
+        dup             v30.4s,  w12           // 6 - intermediate_bits
+        ldrh            w9,  [x10, x9, lsl #1]
+        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
+.ifc \type, put
+        dup             v29.4s,  w13           // 6 + intermediate_bits
+.else
+        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
+.endif
+        // Table entries are distances from the table back to the target.
+        sub             x10, x10, w9, uxtw
+.ifc \type, put
+        neg             v29.4s,  v29.4s        // -(6+intermediate_bits)
+.endif
+        br              x10
+
+// Combined filter for width 2; the body exists for put only. The
+// horizontal pass always uses the middle four taps, the vertical pass
+// four taps for heights up to 4 and eight taps otherwise. Rows after the
+// first are filtered horizontally two at a time by the 8tap_filter_2
+// helper, which returns them in v24. The helper is called with bl, so
+// the return address of this function is kept in x15.
+20:
+        AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+        add             \xmx,  \xmx,  #2
+        ld1             {v0.s}[0],  [\xmx]
+        // Flags from the cmp of h against 4 in the dispatcher.
+        b.gt            280f
+        add             \xmy,  \xmy,  #2
+        ld1             {v1.s}[0],  [\xmy]
+
+        // 2x2, 2x4 hv
+        // sr2 is one pixel left of the output, src additionally one row up.
+        sub             \sr2, \src, #2
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        mov             x15, x30
+
+        // First row alone: multiply lane by lane with the four taps for
+        // the input at offsets 0 and 1, then reduce with pairwise adds.
+        ld1             {v27.8h}, [\src], \s_strd
+        ext             v28.16b, v27.16b, v27.16b, #2
+        smull           v27.4s,  v27.4h,  v0.4h
+        smull           v28.4s,  v28.4h,  v0.4h
+        addp            v27.4s,  v27.4s,  v28.4s
+        addp            v16.4s,  v27.4s,  v27.4s
+        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
+        bl              L(\type\()_8tap_filter_2)
+        // The intermediates from the horizontal pass fit in 16 bit without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out of order cores
+        // (at the cost of a smaller slowdown on in-order cores such as A53).
+        xtn             v16.4h,  v16.4s
+
+        // v16 = rows 0 and 1, v17 = rows 1 and 2.
+        trn1            v16.2s,  v16.2s,  v24.2s
+        mov             v17.8b,  v24.8b
+
+2:
+        bl              L(\type\()_8tap_filter_2)
+
+        // v18 = last row of the previous pair followed by the first new row.
+        ext             v18.8b,  v17.8b,  v24.8b,  #4
+        smull           v2.4s,   v16.4h,  v1.h[0]
+        smlal           v2.4s,   v17.4h,  v1.h[1]
+        smlal           v2.4s,   v18.4h,  v1.h[2]
+        smlal           v2.4s,   v24.4h,  v1.h[3]
+
+        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
+        sqxtun          v2.4h,   v2.4s
+        umin            v2.4h,   v2.4h,   v31.4h
+        subs            \h,  \h,  #2
+        st1             {v2.s}[0], [\dst], \d_strd
+        st1             {v2.s}[1], [\ds2], \d_strd
+        b.le            0f
+        mov             v16.8b,  v18.8b
+        mov             v17.8b,  v24.8b
+        b               2b
+
+280:    // 2x8, 2x16, 2x32 hv
+        ld1             {v1.8b},  [\xmy]
+        // One pixel left; sr2 two rows up and src three rows up.
+        sub             \src, \src, #2
+        sub             \sr2, \src, \s_strd, lsl #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        mov             x15, x30
+
+        ld1             {v27.8h}, [\src], \s_strd
+        ext             v28.16b, v27.16b, v27.16b, #2
+        smull           v27.4s,  v27.4h,  v0.4h
+        smull           v28.4s,  v28.4h,  v0.4h
+        addp            v27.4s,  v27.4s,  v28.4s
+        addp            v16.4s,  v27.4s,  v27.4s
+        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
+        // The intermediates from the horizontal pass fit in 16 bit without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out of order cores
+        // (at the cost of a smaller slowdown on in-order cores such as A53).
+
+        // Prime the window: v16..v21 each hold two consecutive rows.
+        bl              L(\type\()_8tap_filter_2)
+        xtn             v16.4h,  v16.4s
+        trn1            v16.2s,  v16.2s,  v24.2s
+        mov             v17.8b,  v24.8b
+        bl              L(\type\()_8tap_filter_2)
+        ext             v18.8b,  v17.8b,  v24.8b,  #4
+        mov             v19.8b,  v24.8b
+        bl              L(\type\()_8tap_filter_2)
+        ext             v20.8b,  v19.8b,  v24.8b,  #4
+        mov             v21.8b,  v24.8b
+
+28:
+        bl              L(\type\()_8tap_filter_2)
+        ext             v22.8b,  v21.8b,  v24.8b,  #4
+        smull           v3.4s,   v16.4h,  v1.h[0]
+        smlal           v3.4s,   v17.4h,  v1.h[1]
+        smlal           v3.4s,   v18.4h,  v1.h[2]
+        smlal           v3.4s,   v19.4h,  v1.h[3]
+        smlal           v3.4s,   v20.4h,  v1.h[4]
+        smlal           v3.4s,   v21.4h,  v1.h[5]
+        smlal           v3.4s,   v22.4h,  v1.h[6]
+        smlal           v3.4s,   v24.4h,  v1.h[7]
+
+        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
+        sqxtun          v3.4h,   v3.4s
+        umin            v3.4h,   v3.4h,   v31.4h
+        subs            \h,  \h,  #2
+        st1             {v3.s}[0], [\dst], \d_strd
+        st1             {v3.s}[1], [\ds2], \d_strd
+        b.le            0f
+        // Advance the window by two rows.
+        mov             v16.8b,  v18.8b
+        mov             v17.8b,  v19.8b
+        mov             v18.8b,  v20.8b
+        mov             v19.8b,  v21.8b
+        mov             v20.8b,  v22.8b
+        mov             v21.8b,  v24.8b
+        b               28b
+
+0:
+        // x30 was overwritten by the bl calls; return via the saved copy.
+        ret             x15
+
+// Helper for the 2xN hv path, called with bl. Loads one row through sr2
+// and one through src (both post-incremented by s_strd), applies the four
+// horizontal taps in v0.h[0..3] to two pixels of each row and applies a
+// rounding shift by v30.
+// Result: v24.4h = two filtered pixels of the sr2 row followed by two of
+// the src row. Clobbers v25-v28.
+L(\type\()_8tap_filter_2):
+        ld1             {v25.8h},  [\sr2], \s_strd
+        ld1             {v27.8h},  [\src], \s_strd
+        ext             v26.16b, v25.16b, v25.16b, #2
+        ext             v28.16b, v27.16b, v27.16b, #2
+        // Pair up the pixels of both rows so one 4-lane multiply chain
+        // covers both.
+        trn1            v24.2s,  v25.2s,  v27.2s
+        trn2            v27.2s,  v25.2s,  v27.2s
+        trn1            v25.2s,  v26.2s,  v28.2s
+        trn2            v28.2s,  v26.2s,  v28.2s
+        smull           v24.4s,  v24.4h,  v0.h[0]
+        smlal           v24.4s,  v25.4h,  v0.h[1]
+        smlal           v24.4s,  v27.4h,  v0.h[2]
+        smlal           v24.4s,  v28.4h,  v0.h[3]
+        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
+        xtn             v24.4h,  v24.4s
+        ret
+.endif
+
+// Combined filter for width 4. Heights up to 4 are handled here with four
+// taps in both directions; larger heights branch to 480. Rows after the
+// first are filtered horizontally by the 8tap_filter_4 helper (defined
+// further down in the file); the code below consumes its results from
+// v24 and v25. x15 keeps the return address across the bl calls.
+40:
+        AARCH64_VALID_JUMP_TARGET
+        add             \xmx, \xmx, #2
+        ld1             {v0.s}[0],  [\xmx]
+        // Flags from the cmp of h against 4 in the dispatcher.
+        b.gt            480f
+        add             \xmy, \xmy,  #2
+        ld1             {v1.s}[0],  [\xmy]
+        // sr2 is one pixel left of the output, src additionally one row up.
+        sub             \sr2, \src, #2
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        mov             x15, x30
+
+        // 4x2, 4x4 hv
+        // The first row is filtered inline.
+        ld1             {v25.8h}, [\src], \s_strd
+        ext             v26.16b, v25.16b, v25.16b, #2
+        ext             v27.16b, v25.16b, v25.16b, #4
+        ext             v28.16b, v25.16b, v25.16b, #6
+        smull           v25.4s,  v25.4h,  v0.h[0]
+        smlal           v25.4s,  v26.4h,  v0.h[1]
+        smlal           v25.4s,  v27.4h,  v0.h[2]
+        smlal           v25.4s,  v28.4h,  v0.h[3]
+        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
+        // The intermediates from the horizontal pass fit in 16 bit without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out of order cores
+        // (at the cost of a smaller slowdown on in-order cores such as A53).
+        xtn             v16.4h,  v16.4s
+
+        bl              L(\type\()_8tap_filter_4)
+        mov             v17.8b,  v24.8b
+        mov             v18.8b,  v25.8b
+
+4:
+        bl              L(\type\()_8tap_filter_4)
+        // Two output rows from the rows held in v16, v17, v18, v24, v25.
+        smull           v2.4s,   v16.4h,  v1.h[0]
+        smlal           v2.4s,   v17.4h,  v1.h[1]
+        smlal           v2.4s,   v18.4h,  v1.h[2]
+        smlal           v2.4s,   v24.4h,  v1.h[3]
+        smull           v3.4s,   v17.4h,  v1.h[0]
+        smlal           v3.4s,   v18.4h,  v1.h[1]
+        smlal           v3.4s,   v24.4h,  v1.h[2]
+        smlal           v3.4s,   v25.4h,  v1.h[3]
+.ifc \type, put
+        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
+        sqxtun          v2.4h,   v2.4s
+        sqxtun2         v2.8h,   v3.4s
+        umin            v2.8h,   v2.8h,   v31.8h
+.else
+        rshrn           v2.4h,   v2.4s,   #6
+        rshrn2          v2.8h,   v3.4s,   #6
+        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
+.endif
+        subs            \h,  \h,  #2
+
+        st1             {v2.d}[0], [\dst], \d_strd
+        st1             {v2.d}[1], [\ds2], \d_strd
+        b.le            0f
+        // Advance the window by two rows.
+        mov             v16.8b,  v18.8b
+        mov             v17.8b,  v24.8b
+        mov             v18.8b,  v25.8b
+        b               4b
+
+480:    // 4x8, 4x16, 4x32 hv: 4-tap horizontal, 8-tap vertical, two output rows per pass
+        ld1             {v1.8b},  [\xmy]                // all 8 vertical coefficients, one byte each
+        sub             \src, \src, #2                  // step back one 16-bit sample for the horizontal taps
+        sub             \sr2, \src, \s_strd, lsl #1     // sr2 = 2 rows above the block
+        sub             \src, \sr2, \s_strd             // src = 3 rows above, first row of the 8-tap window
+        add             \ds2, \dst, \d_strd             // ds2 = second output row
+        lsl             \s_strd, \s_strd, #1            // src/sr2 each advance two rows per load from here on
+        lsl             \d_strd, \d_strd, #1            // dst/ds2 each advance two rows per store
+        sxtl            v0.8h,   v0.8b                  // widen horizontal coefficients (v0 is not loaded here, presumably set up by the code branching to 480)
+        sxtl            v1.8h,   v1.8b                  // widen vertical coefficients to signed 16 bit
+        mov             x15, x30                        // keep the return address, the bl calls below overwrite x30
+
+        ld1             {v25.8h}, [\src], \s_strd       // topmost row, filtered inline because filter_4 always reads a row pair
+        ext             v26.16b, v25.16b, v25.16b, #2   // row shifted by 1 sample
+        ext             v27.16b, v25.16b, v25.16b, #4   // row shifted by 2 samples
+        ext             v28.16b, v25.16b, v25.16b, #6   // row shifted by 3 samples
+        smull           v25.4s,  v25.4h,  v0.h[0]       // 4 horizontal taps, accumulated in 32 bit
+        smlal           v25.4s,  v26.4h,  v0.h[1]
+        smlal           v25.4s,  v27.4h,  v0.h[2]
+        smlal           v25.4s,  v28.4h,  v0.h[3]
+        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
+        // The intermediates from the horizontal pass fit in 16 bit without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out of order cores
+        // (at the cost of a smaller slowdown on in-order cores such as A53).
+        xtn             v16.4h,  v16.4s                 // v16 = filtered row 0
+
+        bl              L(\type\()_8tap_filter_4)
+        mov             v17.8b,  v24.8b                 // filtered rows 1 and 2
+        mov             v18.8b,  v25.8b
+        bl              L(\type\()_8tap_filter_4)
+        mov             v19.8b,  v24.8b                 // filtered rows 3 and 4
+        mov             v20.8b,  v25.8b
+        bl              L(\type\()_8tap_filter_4)
+        mov             v21.8b,  v24.8b                 // filtered rows 5 and 6
+        mov             v22.8b,  v25.8b
+
+48:     // loop: v16-v22 hold 7 filtered rows, filter_4 supplies the next two in v24/v25
+        bl              L(\type\()_8tap_filter_4)
+        smull           v3.4s,   v16.4h,  v1.h[0]       // first output row: vertical taps over v16-v22, v24
+        smlal           v3.4s,   v17.4h,  v1.h[1]
+        smlal           v3.4s,   v18.4h,  v1.h[2]
+        smlal           v3.4s,   v19.4h,  v1.h[3]
+        smlal           v3.4s,   v20.4h,  v1.h[4]
+        smlal           v3.4s,   v21.4h,  v1.h[5]
+        smlal           v3.4s,   v22.4h,  v1.h[6]
+        smlal           v3.4s,   v24.4h,  v1.h[7]
+        smull           v4.4s,   v17.4h,  v1.h[0]       // second output row: same taps, window one row lower
+        smlal           v4.4s,   v18.4h,  v1.h[1]
+        smlal           v4.4s,   v19.4h,  v1.h[2]
+        smlal           v4.4s,   v20.4h,  v1.h[3]
+        smlal           v4.4s,   v21.4h,  v1.h[4]
+        smlal           v4.4s,   v22.4h,  v1.h[5]
+        smlal           v4.4s,   v24.4h,  v1.h[6]
+        smlal           v4.4s,   v25.4h,  v1.h[7]
+.ifc \type, put
+        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
+        sqxtun          v3.4h,   v3.4s                  // saturate to unsigned 16 bit
+        sqxtun2         v3.8h,   v4.4s
+        umin            v3.8h,   v3.8h,   v31.8h        // clamp to the per-lane maximum in v31 (presumably bitdepth_max, set before this chunk)
+.else
+        rshrn           v3.4h,   v3.4s,   #6            // prep: fixed rounding shift by 6, narrow to 16 bit
+        rshrn2          v3.8h,   v4.4s,   #6
+        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
+.endif
+        subs            \h,  \h,  #2                    // two rows done
+        st1             {v3.d}[0], [\dst], \d_strd      // 4 samples (64 bits) per output row
+        st1             {v3.d}[1], [\ds2], \d_strd
+        b.le            0f
+        mov             v16.8b,  v18.8b                 // slide the 7-row window down by two rows
+        mov             v17.8b,  v19.8b
+        mov             v18.8b,  v20.8b
+        mov             v19.8b,  v21.8b
+        mov             v20.8b,  v22.8b
+        mov             v21.8b,  v24.8b
+        mov             v22.8b,  v25.8b
+        b               48b
+0:      // exit, also the target of the b.le 0f in the 4x2/4x4 loop just above
+        ret             x15                             // return through the saved link register
+
+L(\type\()_8tap_filter_4): // helper: 4-tap horizontal filter of two rows, 4 outputs each, result in v24.4h (row at sr2) and v25.4h (row at src)
+        ld1             {v24.8h}, [\sr2], \s_strd       // 8 samples from the sr2 row, pointer advanced by s_strd
+        ld1             {v25.8h}, [\src], \s_strd       // 8 samples from the src row, pointer advanced by s_strd
+        ext             v26.16b, v24.16b, v24.16b, #2   // sr2 row shifted by 1, 2 and 3 samples (v26-v28 are scratch)
+        ext             v27.16b, v24.16b, v24.16b, #4
+        ext             v28.16b, v24.16b, v24.16b, #6
+        smull           v24.4s,  v24.4h,  v0.h[0]       // taps v0.h[0..3] on the sr2 row, 32-bit accumulation
+        smlal           v24.4s,  v26.4h,  v0.h[1]
+        smlal           v24.4s,  v27.4h,  v0.h[2]
+        smlal           v24.4s,  v28.4h,  v0.h[3]
+        ext             v26.16b, v25.16b, v25.16b, #2   // same shifts for the src row
+        ext             v27.16b, v25.16b, v25.16b, #4
+        ext             v28.16b, v25.16b, v25.16b, #6
+        smull           v25.4s,  v25.4h,  v0.h[0]       // taps v0.h[0..3] on the src row
+        smlal           v25.4s,  v26.4h,  v0.h[1]
+        smlal           v25.4s,  v27.4h,  v0.h[2]
+        smlal           v25.4s,  v28.4h,  v0.h[3]
+        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
+        xtn             v24.4h,  v24.4s                 // narrow both results to 16 bit
+        xtn             v25.4h,  v25.4s
+        ret                                             // returns via x30, callers keep their own return address in x15
+
+80:     // 8xN, 16xN, 32xN hv entry (jump table targets 80, 160 and 320 share this code)
+160:
+320:
+        AARCH64_VALID_JUMP_TARGET
+        b.gt            880f                            // flags are set before this chunk (presumably a height compare): taller blocks use the 8-tap vertical path
+        add             \xmy,  \xmy,  #2                // skip the first two vertical coefficients
+        ld1             {v0.8b},  [\xmx]                // all 8 horizontal coefficients
+        ld1             {v1.s}[0],  [\xmy]              // the 4 middle vertical coefficients
+        sub             \src,  \src,  #6                // step back 3 samples for the 8-tap horizontal window
+        sub             \src,  \src,  \s_strd           // step back 1 row for the 4-tap vertical window
+        sxtl            v0.8h,   v0.8b                  // widen coefficients to signed 16 bit
+        sxtl            v1.8h,   v1.8b
+        mov             x15, x30                        // keep the return address, the bl calls below overwrite x30
+        mov             \my, \h                         // remember the height for the later 8-column strips
+
+164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+        add             \ds2,  \dst,  \d_strd           // second output row
+        add             \sr2,  \src,  \s_strd           // second input row
+        lsl             \d_strd, \d_strd, #1            // both pointer pairs advance two rows per access
+        lsl             \s_strd, \s_strd, #1
+
+        ld1             {v27.8h, v28.8h},  [\src], \s_strd // first row of the strip, 16 samples, filtered inline
+        smull           v24.4s,  v27.4h,  v0.h[0]       // tap 0 for outputs 0-3
+        smull2          v25.4s,  v27.8h,  v0.h[0]       // tap 0 for outputs 4-7
+.irpc i, 1234567
+        ext             v26.16b, v27.16b, v28.16b, #(2*\i) // input shifted by i samples
+        smlal           v24.4s,  v26.4h,  v0.h[\i]
+        smlal2          v25.4s,  v26.8h,  v0.h[\i]
+.endr
+        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
+        // The intermediates from the horizontal pass fit in 16 bit without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out of order cores
+        // (at the cost of a smaller slowdown on in-order cores such as A53),
+        // and conserves register space (no need to clobber v8-v15).
+        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
+
+        bl              L(\type\()_8tap_filter_8)
+        mov             v17.16b, v23.16b                // filtered rows 1 and 2
+        mov             v18.16b, v24.16b
+
+8:      // loop: v16-v18 hold 3 filtered rows, filter_8 supplies the next two in v23/v24
+        smull           v2.4s,   v16.4h,  v1.h[0]       // started before the call, filter_8 leaves v2/v3 untouched
+        smull2          v3.4s,   v16.8h,  v1.h[0]
+        bl              L(\type\()_8tap_filter_8)
+        smull           v4.4s,   v17.4h,  v1.h[0]       // v2/v3 = first output row, v4/v5 = second output row
+        smull2          v5.4s,   v17.8h,  v1.h[0]
+        smlal           v2.4s,   v17.4h,  v1.h[1]
+        smlal2          v3.4s,   v17.8h,  v1.h[1]
+        smlal           v4.4s,   v18.4h,  v1.h[1]
+        smlal2          v5.4s,   v18.8h,  v1.h[1]
+        smlal           v2.4s,   v18.4h,  v1.h[2]
+        smlal2          v3.4s,   v18.8h,  v1.h[2]
+        smlal           v4.4s,   v23.4h,  v1.h[2]
+        smlal2          v5.4s,   v23.8h,  v1.h[2]
+        smlal           v2.4s,   v23.4h,  v1.h[3]
+        smlal2          v3.4s,   v23.8h,  v1.h[3]
+        smlal           v4.4s,   v24.4h,  v1.h[3]
+        smlal2          v5.4s,   v24.8h,  v1.h[3]
+.ifc \type, put
+        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
+        sqxtun          v2.4h,   v2.4s                  // saturate to unsigned 16 bit
+        sqxtun2         v2.8h,   v3.4s
+        sqxtun          v3.4h,   v4.4s
+        sqxtun2         v3.8h,   v5.4s
+        umin            v2.8h,   v2.8h,   v31.8h        // clamp to the per-lane maximum in v31 (presumably bitdepth_max, set before this chunk)
+        umin            v3.8h,   v3.8h,   v31.8h
+.else
+        rshrn           v2.4h,   v2.4s,   #6            // prep: fixed rounding shift by 6, narrow to 16 bit
+        rshrn2          v2.8h,   v3.4s,   #6
+        rshrn           v3.4h,   v4.4s,   #6
+        rshrn2          v3.8h,   v5.4s,   #6
+        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
+        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
+.endif
+        subs            \h,  \h,  #2                    // two rows done
+        st1             {v2.8h}, [\dst], \d_strd        // 8 samples per output row
+        st1             {v3.8h}, [\ds2], \d_strd
+        b.le            9f
+        mov             v16.16b, v18.16b                // slide the 3-row window down by two rows
+        mov             v17.16b, v23.16b
+        mov             v18.16b, v24.16b
+        b               8b
+9:      // one 8-column strip finished
+        subs            \w,  \w,  #8
+        b.le            0f                              // all strips done, 0 is the shared exit after the 880 path
+        asr             \s_strd,  \s_strd,  #1          // undo the stride doubling, 164 doubles again
+        asr             \d_strd,  \d_strd,  #1
+        msub            \src,  \s_strd,  \xmy,  \src    // rewind src by xmy rows (presumably xmy is the 64-bit view of my, which holds the saved height)
+        msub            \dst,  \d_strd,  \xmy,  \dst    // rewind dst by the same number of rows
+        sub             \src,  \src,  \s_strd,  lsl #2  // src also consumed 4 extra rows (first row pair plus one filter_8 call)
+        mov             \h,  \my                        // restore the row counter
+        add             \src,  \src,  #16               // move 8 samples (16 bytes) to the right
+        add             \dst,  \dst,  #16
+        b               164b
+
+880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv: 8-tap horizontal, 8-tap vertical, 8-column strips
+640:
+1280:
+        AARCH64_VALID_JUMP_TARGET
+        ld1             {v0.8b},  [\xmx]                // all 8 horizontal coefficients
+        ld1             {v1.8b},  [\xmy]                // all 8 vertical coefficients
+        sub             \src,  \src,  #6                // step back 3 samples for the horizontal window
+        sub             \src,  \src,  \s_strd           // step back 3 rows in total for the vertical window
+        sub             \src,  \src,  \s_strd, lsl #1
+        sxtl            v0.8h,   v0.8b                  // widen coefficients to signed 16 bit
+        sxtl            v1.8h,   v1.8b
+        mov             x15, x30                        // keep the return address, the bl calls below overwrite x30
+        mov             \my, \h                         // remember the height for the later 8-column strips
+
+168:    // start of one 8-column strip
+        add             \ds2,  \dst,  \d_strd           // second output row
+        add             \sr2,  \src,  \s_strd           // second input row
+        lsl             \d_strd, \d_strd, #1            // both pointer pairs advance two rows per access
+        lsl             \s_strd, \s_strd, #1
+
+        ld1             {v27.8h, v28.8h},  [\src], \s_strd // first row of the strip, 16 samples, filtered inline
+        smull           v24.4s,  v27.4h,  v0.h[0]       // tap 0 for outputs 0-3
+        smull2          v25.4s,  v27.8h,  v0.h[0]       // tap 0 for outputs 4-7
+.irpc i, 1234567
+        ext             v26.16b, v27.16b, v28.16b, #(2*\i) // input shifted by i samples
+        smlal           v24.4s,  v26.4h,  v0.h[\i]
+        smlal2          v25.4s,  v26.8h,  v0.h[\i]
+.endr
+        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
+        // The intermediates from the horizontal pass fit in 16 bit without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out of order cores
+        // (at the cost of a smaller slowdown on in-order cores such as A53),
+        // and conserves register space (no need to clobber v8-v15).
+        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
+
+        bl              L(\type\()_8tap_filter_8)
+        mov             v17.16b, v23.16b                // filtered rows 1 and 2
+        mov             v18.16b, v24.16b
+        bl              L(\type\()_8tap_filter_8)
+        mov             v19.16b, v23.16b                // filtered rows 3 and 4
+        mov             v20.16b, v24.16b
+        bl              L(\type\()_8tap_filter_8)
+        mov             v21.16b, v23.16b                // filtered rows 5 and 6
+        mov             v22.16b, v24.16b
+
+88:     // loop: v16-v22 hold 7 filtered rows, filter_8 supplies the next two in v23/v24
+        smull           v2.4s,   v16.4h,  v1.h[0]       // started before the call, filter_8 leaves v2/v3 untouched
+        smull2          v3.4s,   v16.8h,  v1.h[0]
+        bl              L(\type\()_8tap_filter_8)
+        smull           v4.4s,   v17.4h,  v1.h[0]       // v2/v3 = first output row, v4/v5 = second output row
+        smull2          v5.4s,   v17.8h,  v1.h[0]
+        smlal           v2.4s,   v17.4h,  v1.h[1]
+        smlal2          v3.4s,   v17.8h,  v1.h[1]
+        smlal           v4.4s,   v18.4h,  v1.h[1]
+        smlal2          v5.4s,   v18.8h,  v1.h[1]
+        smlal           v2.4s,   v18.4h,  v1.h[2]
+        smlal2          v3.4s,   v18.8h,  v1.h[2]
+        smlal           v4.4s,   v19.4h,  v1.h[2]
+        smlal2          v5.4s,   v19.8h,  v1.h[2]
+        smlal           v2.4s,   v19.4h,  v1.h[3]
+        smlal2          v3.4s,   v19.8h,  v1.h[3]
+        smlal           v4.4s,   v20.4h,  v1.h[3]
+        smlal2          v5.4s,   v20.8h,  v1.h[3]
+        smlal           v2.4s,   v20.4h,  v1.h[4]
+        smlal2          v3.4s,   v20.8h,  v1.h[4]
+        smlal           v4.4s,   v21.4h,  v1.h[4]
+        smlal2          v5.4s,   v21.8h,  v1.h[4]
+        smlal           v2.4s,   v21.4h,  v1.h[5]
+        smlal2          v3.4s,   v21.8h,  v1.h[5]
+        smlal           v4.4s,   v22.4h,  v1.h[5]
+        smlal2          v5.4s,   v22.8h,  v1.h[5]
+        smlal           v2.4s,   v22.4h,  v1.h[6]
+        smlal2          v3.4s,   v22.8h,  v1.h[6]
+        smlal           v4.4s,   v23.4h,  v1.h[6]
+        smlal2          v5.4s,   v23.8h,  v1.h[6]
+        smlal           v2.4s,   v23.4h,  v1.h[7]
+        smlal2          v3.4s,   v23.8h,  v1.h[7]
+        smlal           v4.4s,   v24.4h,  v1.h[7]
+        smlal2          v5.4s,   v24.8h,  v1.h[7]
+.ifc \type, put
+        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
+        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
+        sqxtun          v2.4h,   v2.4s                  // saturate to unsigned 16 bit
+        sqxtun2         v2.8h,   v3.4s
+        sqxtun          v3.4h,   v4.4s
+        sqxtun2         v3.8h,   v5.4s
+        umin            v2.8h,   v2.8h,   v31.8h        // clamp to the per-lane maximum in v31 (presumably bitdepth_max, set before this chunk)
+        umin            v3.8h,   v3.8h,   v31.8h
+.else
+        rshrn           v2.4h,   v2.4s,   #6            // prep: fixed rounding shift by 6, narrow to 16 bit
+        rshrn2          v2.8h,   v3.4s,   #6
+        rshrn           v3.4h,   v4.4s,   #6
+        rshrn2          v3.8h,   v5.4s,   #6
+        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
+        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
+.endif
+        subs            \h,  \h,  #2                    // two rows done
+        st1             {v2.8h}, [\dst], \d_strd        // 8 samples per output row
+        st1             {v3.8h}, [\ds2], \d_strd
+        b.le            9f
+        mov             v16.16b, v18.16b                // slide the 7-row window down by two rows
+        mov             v17.16b, v19.16b
+        mov             v18.16b, v20.16b
+        mov             v19.16b, v21.16b
+        mov             v20.16b, v22.16b
+        mov             v21.16b, v23.16b
+        mov             v22.16b, v24.16b
+        b               88b
+9:      // one 8-column strip finished
+        subs            \w,  \w,  #8
+        b.le            0f
+        asr             \s_strd,  \s_strd,  #1          // undo the stride doubling, 168 doubles again
+        asr             \d_strd,  \d_strd,  #1
+        msub            \src,  \s_strd,  \xmy,  \src    // rewind src by xmy rows (presumably xmy is the 64-bit view of my, which holds the saved height)
+        msub            \dst,  \d_strd,  \xmy,  \dst    // rewind dst by the same number of rows
+        sub             \src,  \src,  \s_strd,  lsl #3  // src also consumed 8 extra rows (first row pair plus three filter_8 calls)
+        mov             \h,  \my                        // restore the row counter
+        add             \src,  \src,  #16               // move 8 samples (16 bytes) to the right
+        add             \dst,  \dst,  #16
+        b               168b
+0:      // exit shared by the 164 and 168 strip loops
+        ret             x15                             // return through the saved link register
+
+L(\type\()_8tap_filter_8): // helper: 8-tap horizontal filter of two rows, 8 outputs each, result in v23.8h (row at sr2) and v24.8h (row at src)
+        ld1             {v4.8h, v5.8h},  [\sr2], \s_strd // 16 samples from the sr2 row, pointer advanced by s_strd
+        ld1             {v6.8h, v7.8h},  [\src], \s_strd // 16 samples from the src row, pointer advanced by s_strd
+        smull           v25.4s,  v4.4h,   v0.h[0]       // tap 0: v25/v26 accumulate the sr2 row, v27/v28 the src row
+        smull2          v26.4s,  v4.8h,   v0.h[0]
+        smull           v27.4s,  v6.4h,   v0.h[0]
+        smull2          v28.4s,  v6.8h,   v0.h[0]
+.irpc i, 1234567
+        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i) // sr2 row shifted by i samples
+        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i) // src row shifted by i samples
+        smlal           v25.4s,  v23.4h,  v0.h[\i]
+        smlal2          v26.4s,  v23.8h,  v0.h[\i]
+        smlal           v27.4s,  v24.4h,  v0.h[\i]
+        smlal2          v28.4s,  v24.8h,  v0.h[\i]
+.endr
+        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v26.4s,  v26.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v27.4s,  v27.4s,  v30.4s // -(6-intermediate_bits)
+        srshl           v28.4s,  v28.4s,  v30.4s // -(6-intermediate_bits)
+        uzp1            v23.8h,  v25.8h,  v26.8h // Same as xtn, xtn2
+        uzp1            v24.8h,  v27.8h,  v28.8h // Ditto
+        ret                                             // writes v4-v7 and v23-v28 only, callers rely on v2/v3 surviving the call
+
+L(\type\()_8tap_hv_tbl): // jump table for the hv handlers: 16-bit distances from the table back to each handler label
+        .hword L(\type\()_8tap_hv_tbl) - 1280b          // entry 0 is the widest block, the dispatch code is outside this chunk (presumably indexed by clz(w) - 24 like the bilin tables)
+        .hword L(\type\()_8tap_hv_tbl) -  640b
+        .hword L(\type\()_8tap_hv_tbl) -  320b
+        .hword L(\type\()_8tap_hv_tbl) -  160b
+        .hword L(\type\()_8tap_hv_tbl) -   80b
+        .hword L(\type\()_8tap_hv_tbl) -   40b
+        .hword L(\type\()_8tap_hv_tbl) -   20b
+        .hword 0                                        // extra zero halfword after the 7 entries (presumably padding)
+endfunc
+
+// Bilinear put/prep entry: broadcasts the filter weights, derives the shift amounts and the table index, then branches to the h, v, hv or unfiltered path.
+function \type\()_bilin_16bpc_neon, export=1
+.ifc \bdmax, w8
+        ldr             w8,  [sp]                       // in this configuration bdmax is not in a register on entry and is read from the stack
+.endif
+        dup             v1.8h,   \mx                    // v1 = mx, weight of the right-hand sample
+        dup             v3.8h,   \my                    // v3 = my, weight of the lower sample
+        mov             w10, #16
+        sub             w9,  w10, \mx                   // 16 - mx
+        sub             w10, w10, \my                   // 16 - my
+        dup             v0.8h,   w9                     // v0 = 16 - mx, weight of the left-hand sample
+        dup             v2.8h,   w10                    // v2 = 16 - my, weight of the upper sample
+.ifc \type, prep
+        uxtw            \d_strd, \w                     // prep: output stride is derived from the width
+        lsl             \d_strd, \d_strd, #1            // d_strd = w * 2 bytes
+.endif
+
+        clz             \bdmax,   \bdmax       // bitdepth_max
+        clz             w9,  \w                         // start of the jump table index
+        sub             \bdmax,   \bdmax,  #18 // intermediate_bits = clz(bitdepth_max) - 18
+        mov             w11, #4
+        sub             w9,  w9,  #24                   // x9 = clz(w) - 24, used to index the h/v/hv jump tables
+        sub             w11, w11, \bdmax  // 4 - intermediate_bits
+        add             w12, \bdmax, #4   // 4 + intermediate_bits
+        cbnz            \mx, L(\type\()_bilin_h)        // horizontal fraction set: h path (which itself checks my for hv)
+        cbnz            \my, L(\type\()_bilin_v)        // only a vertical fraction: v path
+        b               \type\()_neon                   // no fraction at all: tail call to the function defined outside this chunk (presumably the unfiltered put/prep)
+
+L(\type\()_bilin_h): // horizontal-only bilinear path: set up constants and jump to the width-specific handler
+        cbnz            \my, L(\type\()_bilin_hv)       // both fractions set: use the hv path instead
+
+        adr             x10, L(\type\()_bilin_h_tbl)
+        dup             v31.8h,  w11      // 4 - intermediate_bits
+        ldrh            w9,  [x10, x9, lsl #1]          // 16-bit table entry selected by x9 = clz(w) - 24
+        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
+.ifc \type, put
+        dup             v30.8h,  \bdmax   // intermediate_bits
+.else
+        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8 // v29 = PREP_BIAS in every lane
+.endif
+        sub             x10, x10, w9, uxtw              // handler address = table address - entry
+.ifc \type, put
+        neg             v30.8h,  v30.8h   // -intermediate_bits
+.endif
+        br              x10
+
+20:     // 2xN h (code emitted for put only, for prep the label falls through to the 4xN handler)
+        AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+        add             \ds2,  \dst,  \d_strd           // second output row
+        add             \sr2,  \src,  \s_strd           // second input row
+        lsl             \d_strd,  \d_strd,  #1          // both pointer pairs advance two rows per access
+        lsl             \s_strd,  \s_strd,  #1
+2:
+        ld1             {v4.4h},  [\src], \s_strd       // 4 samples from each of two rows
+        ld1             {v6.4h},  [\sr2], \s_strd
+        ext             v5.8b,   v4.8b,   v4.8b,   #2   // rows shifted by one sample
+        ext             v7.8b,   v6.8b,   v6.8b,   #2
+        trn1            v4.2s,   v4.2s,   v6.2s         // v4 = first 2 samples of row 0, then first 2 samples of row 1
+        trn1            v5.2s,   v5.2s,   v7.2s         // v5 = the same, one sample to the right
+        subs            \h,  \h,  #2
+        mul             v4.4h,   v4.4h,   v0.4h         // left sample * (16 - mx)
+        mla             v4.4h,   v5.4h,   v1.4h         // + right sample * mx
+        urshl           v4.4h,   v4.4h,   v31.4h        // rounding right shift by 4 - intermediate_bits
+        urshl           v4.4h,   v4.4h,   v30.4h        // rounding right shift by intermediate_bits
+        st1             {v4.s}[0], [\dst], \d_strd      // 2 samples (32 bits) per output row
+        st1             {v4.s}[1], [\ds2], \d_strd
+        b.gt            2b
+        ret
+.endif
+
+40:     // 4xN h: two rows per iteration, packed into one 8-lane vector
+        AARCH64_VALID_JUMP_TARGET
+        add             \ds2,  \dst,  \d_strd           // second output row
+        add             \sr2,  \src,  \s_strd           // second input row
+        lsl             \d_strd,  \d_strd,  #1          // both pointer pairs advance two rows per access
+        lsl             \s_strd,  \s_strd,  #1
+4:
+        ld1             {v4.8h}, [\src], \s_strd        // 8 samples from each of two rows
+        ld1             {v6.8h}, [\sr2], \s_strd
+        ext             v5.16b,  v4.16b,  v4.16b,  #2   // rows shifted by one sample
+        ext             v7.16b,  v6.16b,  v6.16b,  #2
+        trn1            v4.2d,   v4.2d,   v6.2d         // v4 = first 4 samples of row 0, then first 4 samples of row 1
+        trn1            v5.2d,   v5.2d,   v7.2d         // v5 = the same, one sample to the right
+        subs            \h,  \h,  #2
+        mul             v4.8h,   v4.8h,   v0.8h         // left sample * (16 - mx)
+        mla             v4.8h,   v5.8h,   v1.8h         // + right sample * mx
+        urshl           v4.8h,   v4.8h,   v31.8h        // rounding right shift by 4 - intermediate_bits
+.ifc \type, put
+        urshl           v4.8h,   v4.8h,   v30.8h        // put: rounding right shift by intermediate_bits
+.else
+        sub             v4.8h,   v4.8h,   v29.8h        // prep: subtract PREP_BIAS
+.endif
+        st1             {v4.d}[0], [\dst], \d_strd      // 4 samples (64 bits) per output row
+        st1             {v4.d}[1], [\ds2], \d_strd
+        b.gt            4b
+        ret
+
+80:     // 8xN h: two rows per iteration, one vector per row
+        AARCH64_VALID_JUMP_TARGET
+        add             \ds2,  \dst,  \d_strd           // second output row
+        add             \sr2,  \src,  \s_strd           // second input row
+        lsl             \d_strd,  \d_strd,  #1          // both pointer pairs advance two rows per access
+        lsl             \s_strd,  \s_strd,  #1
+8:
+        ldr             h5,  [\src, #16]                // 9th sample of each row, loaded before the pointers move
+        ldr             h7,  [\sr2, #16]
+        ld1             {v4.8h}, [\src], \s_strd        // samples 0-7 of each row
+        ld1             {v6.8h}, [\sr2], \s_strd
+        ext             v5.16b,  v4.16b,  v5.16b,  #2   // samples 1-8 of each row
+        ext             v7.16b,  v6.16b,  v7.16b,  #2
+        subs            \h,  \h,  #2
+        mul             v4.8h,   v4.8h,   v0.8h         // left sample * (16 - mx)
+        mla             v4.8h,   v5.8h,   v1.8h         // + right sample * mx
+        mul             v6.8h,   v6.8h,   v0.8h
+        mla             v6.8h,   v7.8h,   v1.8h
+        urshl           v4.8h,   v4.8h,   v31.8h        // rounding right shift by 4 - intermediate_bits
+        urshl           v6.8h,   v6.8h,   v31.8h
+.ifc \type, put
+        urshl           v4.8h,   v4.8h,   v30.8h        // put: rounding right shift by intermediate_bits
+        urshl           v6.8h,   v6.8h,   v30.8h
+.else
+        sub             v4.8h,   v4.8h,   v29.8h        // prep: subtract PREP_BIAS
+        sub             v6.8h,   v6.8h,   v29.8h
+.endif
+        st1             {v4.8h}, [\dst], \d_strd        // 8 samples per output row
+        st1             {v6.8h}, [\ds2], \d_strd
+        b.gt            8b
+        ret
+160:    // wide h handler shared by the 160, 320, 640 and 1280 table entries
+320:
+640:
+1280:   // 16xN, 32xN, ... h
+        AARCH64_VALID_JUMP_TARGET
+        add             \ds2,  \dst,  \d_strd           // second output row
+        add             \sr2,  \src,  \s_strd           // second input row
+        lsl             \s_strd,  \s_strd,  #1          // two rows per outer iteration
+
+        sub             \s_strd,  \s_strd,  \w, uxtw #1 // the inner loop post-increments by 2*w bytes per row ...
+        sub             \s_strd,  \s_strd,  #16         // ... plus 16 bytes for the initial load, so s_strd becomes the remaining gap
+.ifc \type, put
+        lsl             \d_strd,  \d_strd,  #1          // put: same adjustment for the output, 2*w bytes written per row
+        sub             \d_strd,  \d_strd,  \w, uxtw #1
+.endif
+161:    // per row pair
+        ld1             {v16.8h},  [\src], #16          // first 8 samples of each row
+        ld1             {v21.8h},  [\sr2], #16
+        mov             \mx, \w                         // mx is reused as the column counter, its weight is already broadcast in v1
+
+16:     // per 16 columns
+        ld1             {v17.8h, v18.8h},  [\src], #32  // next 16 samples of row 0
+        ld1             {v22.8h, v23.8h},  [\sr2], #32  // next 16 samples of row 1
+        ext             v19.16b, v16.16b, v17.16b, #2   // right-hand neighbours for v16 and v17
+        ext             v20.16b, v17.16b, v18.16b, #2
+        ext             v24.16b, v21.16b, v22.16b, #2   // right-hand neighbours for v21 and v22
+        ext             v25.16b, v22.16b, v23.16b, #2
+        mul             v16.8h,  v16.8h,  v0.8h         // left sample * (16 - mx) + right sample * mx
+        mla             v16.8h,  v19.8h,  v1.8h
+        mul             v17.8h,  v17.8h,  v0.8h
+        mla             v17.8h,  v20.8h,  v1.8h
+        mul             v21.8h,  v21.8h,  v0.8h
+        mla             v21.8h,  v24.8h,  v1.8h
+        mul             v22.8h,  v22.8h,  v0.8h
+        mla             v22.8h,  v25.8h,  v1.8h
+        urshl           v16.8h,  v16.8h,  v31.8h        // rounding right shift by 4 - intermediate_bits
+        urshl           v17.8h,  v17.8h,  v31.8h
+        urshl           v21.8h,  v21.8h,  v31.8h
+        urshl           v22.8h,  v22.8h,  v31.8h
+        subs            \mx, \mx, #16                   // 16 columns done
+.ifc \type, put
+        urshl           v16.8h,  v16.8h,  v30.8h        // put: rounding right shift by intermediate_bits
+        urshl           v17.8h,  v17.8h,  v30.8h
+        urshl           v21.8h,  v21.8h,  v30.8h
+        urshl           v22.8h,  v22.8h,  v30.8h
+.else
+        sub             v16.8h,  v16.8h,  v29.8h        // prep: subtract PREP_BIAS
+        sub             v17.8h,  v17.8h,  v29.8h
+        sub             v21.8h,  v21.8h,  v29.8h
+        sub             v22.8h,  v22.8h,  v29.8h
+.endif
+        st1             {v16.8h, v17.8h}, [\dst], #32   // 16 samples per row
+        st1             {v21.8h, v22.8h}, [\ds2], #32
+        b.le            9f
+
+        mov             v16.16b, v18.16b                // the last loaded vector becomes the left part of the next 16 columns
+        mov             v21.16b, v23.16b
+        b               16b
+
+9:      // end of a row pair: skip to the next two rows
+        add             \dst,  \dst,  \d_strd           // for prep d_strd is still w*2, which skips the row written through ds2
+        add             \ds2,  \ds2,  \d_strd
+        add             \src,  \src,  \s_strd
+        add             \sr2,  \sr2,  \s_strd
+
+        subs            \h,  \h,  #2
+        b.gt            161b
+        ret
+
+L(\type\()_bilin_h_tbl): // jump table for the h handlers: 16-bit distances from the table back to each label, indexed by clz(w) - 24
+        .hword L(\type\()_bilin_h_tbl) - 1280b          // index 0, w = 128
+        .hword L(\type\()_bilin_h_tbl) -  640b
+        .hword L(\type\()_bilin_h_tbl) -  320b
+        .hword L(\type\()_bilin_h_tbl) -  160b
+        .hword L(\type\()_bilin_h_tbl) -   80b
+        .hword L(\type\()_bilin_h_tbl) -   40b
+        .hword L(\type\()_bilin_h_tbl) -   20b          // index 6, w = 2
+        .hword 0                                        // extra zero halfword after the 7 entries (presumably padding)
+
+
+L(\type\()_bilin_v): // vertical-only bilinear path: set up constants and jump to the width-specific handler
+        cmp             \h,  #4                         // NOTE(review): every v handler in this table overwrites the flags before branching on them, so this compare looks unused
+        adr             x10, L(\type\()_bilin_v_tbl)
+.ifc \type, prep
+        dup             v31.8h,  w11      // 4 - intermediate_bits
+.endif
+        ldrh            w9,  [x10, x9, lsl #1]          // 16-bit table entry selected by x9 = clz(w) - 24
+.ifc \type, prep
+        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8 // v29 = PREP_BIAS in every lane
+        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
+.endif
+        sub             x10, x10, w9, uxtw              // handler address = table address - entry
+        br              x10
+
+20:     // 2xN v (code emitted for put only, for prep the label falls through to the 4xN handler)
+        AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+        cmp             \h,  #2                         // flags consumed by the b.gt below, the instructions in between do not touch them
+        add             \ds2,  \dst,  \d_strd           // second output row
+        add             \sr2,  \src,  \s_strd           // second input row
+        lsl             \s_strd,  \s_strd,  #1          // both pointer pairs advance two rows per access
+        lsl             \d_strd,  \d_strd,  #1
+
+        // 2x2 v
+        ld1             {v16.s}[0], [\src], \s_strd     // row 0, 2 samples
+        b.gt            24f                             // more than 2 rows: use the 4-rows-at-a-time loop
+22:     // final two output rows
+        ld1             {v17.s}[0], [\sr2], \s_strd     // row 1
+        ld1             {v18.s}[0], [\src], \s_strd     // row 2
+        trn1            v16.2s,  v16.2s,  v17.2s        // v16 = rows 0, 1 (upper samples)
+        trn1            v17.2s,  v17.2s,  v18.2s        // v17 = rows 1, 2 (lower samples)
+        mul             v4.4h,   v16.4h,  v2.4h         // upper sample * (16 - my)
+        mla             v4.4h,   v17.4h,  v3.4h         // + lower sample * my
+        urshr           v4.8h,   v4.8h,   #4            // rounding shift by 4, only the low 4 lanes carry data
+        st1             {v4.s}[0], [\dst]               // no post-increment, these are the last rows
+        st1             {v4.s}[1], [\ds2]
+        ret
+24:     // 2x4, 2x6, 2x8, ... v
+        ld1             {v17.s}[0], [\sr2], \s_strd     // rows 1-4, v16 already holds row 0
+        ld1             {v18.s}[0], [\src], \s_strd
+        ld1             {v19.s}[0], [\sr2], \s_strd
+        ld1             {v20.s}[0], [\src], \s_strd
+        sub             \h,  \h,  #4                    // four output rows per pass
+        trn1            v16.2s,  v16.2s,  v17.2s        // pair up neighbouring rows
+        trn1            v17.2s,  v17.2s,  v18.2s
+        trn1            v18.2s,  v18.2s,  v19.2s
+        trn1            v19.2s,  v19.2s,  v20.2s
+        trn1            v16.2d,  v16.2d,  v18.2d        // v16 = rows 0, 1, 2, 3 (upper samples)
+        trn1            v17.2d,  v17.2d,  v19.2d        // v17 = rows 1, 2, 3, 4 (lower samples)
+        mul             v4.8h,   v16.8h,  v2.8h         // upper sample * (16 - my)
+        mla             v4.8h,   v17.8h,  v3.8h         // + lower sample * my
+        cmp             \h,  #2                         // decide how to continue, flags are used after the stores
+        urshr           v4.8h,   v4.8h,   #4
+        st1             {v4.s}[0], [\dst], \d_strd      // rows alternate between dst and ds2
+        st1             {v4.s}[1], [\ds2], \d_strd
+        st1             {v4.s}[2], [\dst], \d_strd
+        st1             {v4.s}[3], [\ds2], \d_strd
+        b.lt            0f                              // fewer than 2 rows left: done
+        mov             v16.8b,  v20.8b                 // last loaded row becomes the new row 0
+        b.eq            22b                             // exactly 2 rows left: finish with the 2-row tail
+        b               24b                             // otherwise another 4 rows
+0:
+        ret
+.endif
+
+40:     // 4xN v: two output rows per iteration, packed into one 8-lane vector
+        AARCH64_VALID_JUMP_TARGET
+        add             \ds2,  \dst,  \d_strd           // second output row
+        add             \sr2,  \src,  \s_strd           // second input row
+        lsl             \s_strd,  \s_strd,  #1          // both pointer pairs advance two rows per access
+        lsl             \d_strd,  \d_strd,  #1
+        ld1             {v16.4h}, [\src], \s_strd       // row 0
+4:
+        ld1             {v17.4h}, [\sr2], \s_strd       // row 1
+        ld1             {v18.4h}, [\src], \s_strd       // row 2
+        trn1            v16.2d,  v16.2d,  v17.2d        // v16 = rows 0, 1 (upper samples)
+        trn1            v17.2d,  v17.2d,  v18.2d        // v17 = rows 1, 2 (lower samples)
+        mul             v4.8h,   v16.8h,  v2.8h         // upper sample * (16 - my)
+        mla             v4.8h,   v17.8h,  v3.8h         // + lower sample * my
+        subs            \h,  \h,  #2
+.ifc \type, put
+        urshr           v4.8h,   v4.8h,   #4            // put: rounding shift by 4
+.else
+        urshl           v4.8h,   v4.8h,   v31.8h        // prep: rounding right shift by 4 - intermediate_bits
+        sub             v4.8h,   v4.8h,   v29.8h        // prep: subtract PREP_BIAS
+.endif
+        st1             {v4.d}[0], [\dst], \d_strd      // 4 samples (64 bits) per output row
+        st1             {v4.d}[1], [\ds2], \d_strd
+        b.le            0f
+        mov             v16.8b,  v18.8b                 // last loaded row becomes the new row 0
+        b               4b
+0:
+        ret
+
+80:     // 8xN v: two output rows per iteration, one vector per row
+        AARCH64_VALID_JUMP_TARGET
+        add             \ds2,  \dst,  \d_strd           // second output row
+        add             \sr2,  \src,  \s_strd           // second input row
+        lsl             \s_strd,  \s_strd,  #1          // both pointer pairs advance two rows per access
+        lsl             \d_strd,  \d_strd,  #1
+        ld1             {v16.8h}, [\src], \s_strd       // row 0
+8:
+        ld1             {v17.8h}, [\sr2], \s_strd       // row 1
+        ld1             {v18.8h}, [\src], \s_strd       // row 2
+        mul             v4.8h,   v16.8h,  v2.8h         // output row 0 = row 0 * (16 - my) + row 1 * my
+        mla             v4.8h,   v17.8h,  v3.8h
+        mul             v5.8h,   v17.8h,  v2.8h         // output row 1 = row 1 * (16 - my) + row 2 * my
+        mla             v5.8h,   v18.8h,  v3.8h
+        subs            \h,  \h,  #2
+.ifc \type, put
+        urshr           v4.8h,   v4.8h,   #4            // put: rounding shift by 4
+        urshr           v5.8h,   v5.8h,   #4
+.else
+        urshl           v4.8h,   v4.8h,   v31.8h        // prep: rounding right shift by 4 - intermediate_bits
+        urshl           v5.8h,   v5.8h,   v31.8h
+        sub             v4.8h,   v4.8h,   v29.8h        // prep: subtract PREP_BIAS
+        sub             v5.8h,   v5.8h,   v29.8h
+.endif
+        st1             {v4.8h}, [\dst], \d_strd        // 8 samples per output row
+        st1             {v5.8h}, [\ds2], \d_strd
+        b.le            0f
+        mov             v16.16b, v18.16b                // last loaded row becomes the new row 0
+        b               8b
+0:
+        ret
+
+160:    // 16xN, 32xN, ... v: processed as 16-column strips, two rows per iteration
+320:
+640:
+1280:
+        AARCH64_VALID_JUMP_TARGET
+        mov             \my, \h                         // my is reused to remember the height, its weight is already broadcast in v3
+1:      // start of one 16-column strip
+        add             \ds2, \dst, \d_strd             // second output row
+        add             \sr2, \src, \s_strd             // second input row
+        lsl             \s_strd, \s_strd, #1            // both pointer pairs advance two rows per access
+        lsl             \d_strd, \d_strd, #1
+
+        ld1             {v16.8h, v17.8h}, [\src], \s_strd // row 0, 16 samples
+2:
+        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd // row 1
+        ld1             {v20.8h, v21.8h}, [\src], \s_strd // row 2
+        mul             v4.8h,   v16.8h,  v2.8h         // output row 0 = row 0 * (16 - my) + row 1 * my
+        mla             v4.8h,   v18.8h,  v3.8h
+        mul             v5.8h,   v17.8h,  v2.8h
+        mla             v5.8h,   v19.8h,  v3.8h
+        mul             v6.8h,   v18.8h,  v2.8h         // output row 1 = row 1 * (16 - my) + row 2 * my
+        mla             v6.8h,   v20.8h,  v3.8h
+        mul             v7.8h,   v19.8h,  v2.8h
+        mla             v7.8h,   v21.8h,  v3.8h
+        subs            \h,  \h,  #2
+.ifc \type, put
+        urshr           v4.8h,   v4.8h,   #4            // put: rounding shift by 4
+        urshr           v5.8h,   v5.8h,   #4
+        urshr           v6.8h,   v6.8h,   #4
+        urshr           v7.8h,   v7.8h,   #4
+.else
+        urshl           v4.8h,   v4.8h,   v31.8h        // prep: rounding right shift by 4 - intermediate_bits
+        urshl           v5.8h,   v5.8h,   v31.8h
+        urshl           v6.8h,   v6.8h,   v31.8h
+        urshl           v7.8h,   v7.8h,   v31.8h
+        sub             v4.8h,   v4.8h,   v29.8h        // prep: subtract PREP_BIAS
+        sub             v5.8h,   v5.8h,   v29.8h
+        sub             v6.8h,   v6.8h,   v29.8h
+        sub             v7.8h,   v7.8h,   v29.8h
+.endif
+        st1             {v4.8h, v5.8h}, [\dst], \d_strd // 16 samples per output row
+        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
+        b.le            9f
+        mov             v16.16b, v20.16b                // last loaded row becomes the new row 0
+        mov             v17.16b, v21.16b
+        b               2b
+9:      // one 16-column strip finished
+        subs            \w,  \w,  #16
+        b.le            0f
+        asr             \s_strd, \s_strd, #1            // undo the stride doubling, label 1 doubles again
+        asr             \d_strd, \d_strd, #1
+        msub            \src, \s_strd, \xmy, \src       // rewind src by xmy rows (presumably xmy is the 64-bit view of my, which holds the saved height)
+        msub            \dst, \d_strd, \xmy, \dst       // rewind dst by the same number of rows
+        sub             \src, \src, \s_strd, lsl #1     // src also consumed 2 extra rows for the initial load
+        mov             \h,  \my                        // restore the row counter
+        add             \src, \src, #32                 // move 16 samples (32 bytes) to the right
+        add             \dst, \dst, #32
+        b               1b
+0:
+        ret
+
+L(\type\()_bilin_v_tbl): // jump table for the v handlers: 16-bit distances from the table back to each label, indexed by clz(w) - 24
+        .hword L(\type\()_bilin_v_tbl) - 1280b          // index 0, w = 128
+        .hword L(\type\()_bilin_v_tbl) -  640b
+        .hword L(\type\()_bilin_v_tbl) -  320b
+        .hword L(\type\()_bilin_v_tbl) -  160b
+        .hword L(\type\()_bilin_v_tbl) -   80b
+        .hword L(\type\()_bilin_v_tbl) -   40b
+        .hword L(\type\()_bilin_v_tbl) -   20b          // index 6, w = 2
+        .hword 0                                        // extra zero halfword after the 7 entries (presumably padding)
+
+L(\type\()_bilin_hv): // combined horizontal + vertical bilinear path: set up constants and jump to the width-specific handler
+        adr             x10, L(\type\()_bilin_hv_tbl)
+        dup             v31.8h,  w11      // 4 - intermediate_bits
+        ldrh            w9,  [x10, x9, lsl #1]          // 16-bit table entry selected by x9 = clz(w) - 24
+        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
+.ifc \type, put
+        dup             v30.4s,  w12      // 4 + intermediate_bits
+.else
+        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8 // v29 = PREP_BIAS in every lane
+.endif
+        sub             x10, x10, w9, uxtw              // handler address = table address - entry
+.ifc \type, put
+        neg             v30.4s,  v30.4s   // -(4+intermediate_bits)
+.endif
+        br              x10
+
+20:     // 2xN hv (code emitted for put only, for prep the label falls through to the 4xN handler)
+        AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+        add             \sr2, \src, \s_strd             // second input row
+        add             \ds2, \dst, \d_strd             // second output row
+        lsl             \s_strd, \s_strd, #1            // both pointer pairs advance two rows per access
+        lsl             \d_strd, \d_strd, #1
+
+        ld1             {v20.4h},  [\src], \s_strd      // row 0
+        ext             v21.8b,  v20.8b,  v20.8b,  #2   // row 0 shifted by one sample
+        mul             v16.4h,  v20.4h,  v0.4h         // horizontal filter of row 0
+        mla             v16.4h,  v21.4h,  v1.4h
+        urshl           v16.4h,  v16.4h,  v31.4h        // rounding right shift by 4 - intermediate_bits
+
+2:
+        ld1             {v22.4h},  [\sr2], \s_strd      // rows 1 and 2
+        ld1             {v24.4h},  [\src], \s_strd
+        ext             v23.8b,  v22.8b,  v22.8b,  #2   // both rows shifted by one sample
+        ext             v25.8b,  v24.8b,  v24.8b,  #2
+        trn1            v22.2s,  v22.2s,  v24.2s        // first 2 samples of row 1, then of row 2
+        trn1            v23.2s,  v23.2s,  v25.2s
+        mul             v17.4h,  v22.4h,  v0.4h         // horizontal filter of rows 1 and 2 in one vector
+        mla             v17.4h,  v23.4h,  v1.4h
+        urshl           v17.4h,  v17.4h,  v31.4h
+
+        trn1            v16.2s,  v16.2s,  v17.2s        // v16 = filtered rows 0, 1 (upper), v17 = filtered rows 1, 2 (lower)
+
+        umull           v4.4s,   v16.4h,  v2.4h         // vertical filter, widened to 32 bit
+        umlal           v4.4s,   v17.4h,  v3.4h
+        urshl           v4.4s,   v4.4s,   v30.4s        // rounding right shift by 4 + intermediate_bits
+        xtn             v4.4h,   v4.4s
+        subs            \h,  \h,  #2
+        st1             {v4.s}[0], [\dst], \d_strd      // 2 samples (32 bits) per output row
+        st1             {v4.s}[1], [\ds2], \d_strd
+        b.le            0f
+        trn2            v16.2s,  v17.2s,  v17.2s        // filtered row 2 becomes the new row 0
+        b               2b
+0:
+        ret
+.endif
+
+40:     // 4xN hv
+        AARCH64_VALID_JUMP_TARGET
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        ld1             {v20.8h},  [\src], \s_strd
+        ext             v21.16b, v20.16b, v20.16b, #2
+        mul             v16.4h,  v20.4h,  v0.4h
+        mla             v16.4h,  v21.4h,  v1.4h
+        urshl           v16.4h,  v16.4h,  v31.4h
+
+4:
+        ld1             {v22.8h},  [\sr2], \s_strd
+        ld1             {v24.8h},  [\src], \s_strd
+        ext             v23.16b, v22.16b, v22.16b, #2
+        ext             v25.16b, v24.16b, v24.16b, #2
+        trn1            v22.2d,  v22.2d,  v24.2d
+        trn1            v23.2d,  v23.2d,  v25.2d
+        mul             v17.8h,  v22.8h,  v0.8h
+        mla             v17.8h,  v23.8h,  v1.8h
+        urshl           v17.8h,  v17.8h,  v31.8h
+
+        trn1            v16.2d,  v16.2d,  v17.2d
+
+        umull           v4.4s,   v16.4h,  v2.4h
+        umlal           v4.4s,   v17.4h,  v3.4h
+        umull2          v5.4s,   v16.8h,  v2.8h
+        umlal2          v5.4s,   v17.8h,  v3.8h
+.ifc \type, put
+        urshl           v4.4s,   v4.4s,   v30.4s
+        urshl           v5.4s,   v5.4s,   v30.4s
+        uzp1            v4.8h,   v4.8h,   v5.8h  // Same as xtn, xtn2
+.else
+        rshrn           v4.4h,   v4.4s,   #4
+        rshrn2          v4.8h,   v5.4s,   #4
+        sub             v4.8h,   v4.8h,   v29.8h
+.endif
+        subs            \h,  \h,  #2
+        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.d}[1], [\ds2], \d_strd
+        b.le            0f
+        trn2            v16.2d,  v17.2d,  v17.2d
+        b               4b
+0:
+        ret
+
+80:     // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+        AARCH64_VALID_JUMP_TARGET
+        mov             \my, \h
+
+1:
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        ldr             h21, [\src, #16]
+        ld1             {v20.8h},  [\src], \s_strd
+        ext             v21.16b, v20.16b, v21.16b, #2
+        mul             v16.8h,  v20.8h,  v0.8h
+        mla             v16.8h,  v21.8h,  v1.8h
+        urshl           v16.8h,  v16.8h,  v31.8h
+
+2:
+        ldr             h23, [\sr2, #16]
+        ld1             {v22.8h},  [\sr2], \s_strd
+        ldr             h25, [\src, #16]
+        ld1             {v24.8h},  [\src], \s_strd
+        ext             v23.16b, v22.16b, v23.16b, #2
+        ext             v25.16b, v24.16b, v25.16b, #2
+        mul             v17.8h,  v22.8h,  v0.8h
+        mla             v17.8h,  v23.8h,  v1.8h
+        mul             v18.8h,  v24.8h,  v0.8h
+        mla             v18.8h,  v25.8h,  v1.8h
+        urshl           v17.8h,  v17.8h,  v31.8h
+        urshl           v18.8h,  v18.8h,  v31.8h
+
+        umull           v4.4s,   v16.4h,  v2.4h
+        umlal           v4.4s,   v17.4h,  v3.4h
+        umull2          v5.4s,   v16.8h,  v2.8h
+        umlal2          v5.4s,   v17.8h,  v3.8h
+        umull           v6.4s,   v17.4h,  v2.4h
+        umlal           v6.4s,   v18.4h,  v3.4h
+        umull2          v7.4s,   v17.8h,  v2.8h
+        umlal2          v7.4s,   v18.8h,  v3.8h
+.ifc \type, put
+        urshl           v4.4s,   v4.4s,   v30.4s
+        urshl           v5.4s,   v5.4s,   v30.4s
+        urshl           v6.4s,   v6.4s,   v30.4s
+        urshl           v7.4s,   v7.4s,   v30.4s
+        uzp1            v4.8h,   v4.8h,   v5.8h  // Same as xtn, xtn2
+        uzp1            v5.8h,   v6.8h,   v7.8h  // Ditto
+.else
+        rshrn           v4.4h,   v4.4s,   #4
+        rshrn2          v4.8h,   v5.4s,   #4
+        rshrn           v5.4h,   v6.4s,   #4
+        rshrn2          v5.8h,   v7.4s,   #4
+        sub             v4.8h,   v4.8h,   v29.8h
+        sub             v5.8h,   v5.8h,   v29.8h
+.endif
+        subs            \h,  \h,  #2
+        st1             {v4.8h}, [\dst], \d_strd
+        st1             {v5.8h}, [\ds2], \d_strd
+        b.le            9f
+        mov             v16.16b, v18.16b
+        b               2b
+9:
+        subs            \w,  \w,  #8
+        b.le            0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        msub            \src,  \s_strd,  \xmy,  \src
+        msub            \dst,  \d_strd,  \xmy,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #1
+        mov             \h,  \my
+        add             \src,  \src,  #16
+        add             \dst,  \dst,  #16
+        b               1b
+0:
+        ret
+
+L(\type\()_bilin_hv_tbl):
+        .hword L(\type\()_bilin_hv_tbl) - 1280b
+        .hword L(\type\()_bilin_hv_tbl) -  640b
+        .hword L(\type\()_bilin_hv_tbl) -  320b
+        .hword L(\type\()_bilin_hv_tbl) -  160b
+        .hword L(\type\()_bilin_hv_tbl) -   80b
+        .hword L(\type\()_bilin_hv_tbl) -   40b
+        .hword L(\type\()_bilin_hv_tbl) -   20b
+        .hword 0
+endfunc
+.endm
+// Expand the filter_fn macro twice, for type put and for type prep, each with its own register assignment.
+filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+// Load the filter row at x11 + (src >> 10) * 8 into dst (a d register at every call site here), then src += inc. Clobbers w13.
+.macro load_filter_row dst, src, inc
+        asr             w13, \src, #10              // w13 = signed row index = src >> 10
+        add             \src, \src, \inc            // step the position for the next call
+        ldr             \dst, [x11, w13, sxtw #3]   // index is sign-extended and scaled by 8 bytes
+.endm
+// Horizontal warp pass for one source row: 8 outputs, each with its own 8-tap filter row. Result in v16.4s/v17.4s.
+function warp_filter_horz_neon
+        add             w12, w5,  #512                  // w12 = w5 + 512: bias so the >> 10 in load_filter_row rounds
+
+        ld1             {v16.8h, v17.8h}, [x2], x3      // 16 source pixels, then x2 += x3 (next row)
+
+        load_filter_row d0, w12, w7
+        load_filter_row d1, w12, w7
+        load_filter_row d2, w12, w7
+        sxtl            v0.8h,   v0.8b                  // widen the int8 taps to int16 (same for v1-v7 below)
+        load_filter_row d3, w12, w7
+        sxtl            v1.8h,   v1.8b
+        load_filter_row d4, w12, w7
+        sxtl            v2.8h,   v2.8b
+        load_filter_row d5, w12, w7
+        sxtl            v3.8h,   v3.8b
+        load_filter_row d6, w12, w7
+        sxtl            v4.8h,   v4.8b
+        load_filter_row d7, w12, w7
+        sxtl            v5.8h,   v5.8b
+        ext             v18.16b, v16.16b, v17.16b, #2*1 // source window for output 1 (pixels 1-8)
+        smull           v8.4s,   v16.4h,  v0.4h         // output 0: pixels 0-3 times taps 0-3
+        smull2          v9.4s,   v16.8h,  v0.8h         // output 0: pixels 4-7 times taps 4-7
+        sxtl            v6.8h,   v6.8b
+        ext             v19.16b, v16.16b, v17.16b, #2*2 // window for output 2
+        smull           v10.4s,  v18.4h,  v1.4h         // output 1 products
+        smull2          v11.4s,  v18.8h,  v1.8h
+        sxtl            v7.8h,   v7.8b
+        ext             v20.16b, v16.16b, v17.16b, #2*3 // window for output 3
+        smull           v0.4s,   v19.4h,  v2.4h         // output 2 products; v0/v1 are free, their taps were consumed above
+        smull2          v1.4s,   v19.8h,  v2.8h
+        ext             v21.16b, v16.16b, v17.16b, #2*4 // window for output 4
+        addp            v8.4s,   v8.4s,   v9.4s         // output 0: 8 products -> 4 partial sums
+        smull           v2.4s,   v20.4h,  v3.4h         // output 3 products
+        smull2          v3.4s,   v20.8h,  v3.8h         // reads the taps in v3 before overwriting it
+        ext             v22.16b, v16.16b, v17.16b, #2*5 // window for output 5
+        addp            v9.4s,   v10.4s,  v11.4s        // output 1: 4 partial sums
+        smull           v10.4s,  v21.4h,  v4.4h         // output 4 products
+        smull2          v11.4s,  v21.8h,  v4.8h
+        ext             v23.16b, v16.16b, v17.16b, #2*6 // window for output 6
+        addp            v0.4s,   v0.4s,   v1.4s         // output 2: 4 partial sums
+        smull           v18.4s,  v22.4h,  v5.4h         // output 5 products
+        smull2          v19.4s,  v22.8h,  v5.8h
+        ext             v16.16b, v16.16b, v17.16b, #2*7 // window for output 7; the unshifted row in v16 is no longer needed
+        addp            v1.4s,   v2.4s,   v3.4s         // output 3: 4 partial sums
+        addp            v2.4s,   v10.4s,  v11.4s        // output 4: 4 partial sums
+        smull           v20.4s,  v23.4h,  v6.4h         // output 6 products
+        smull2          v21.4s,  v23.8h,  v6.8h
+        addp            v3.4s,   v18.4s,  v19.4s        // output 5: 4 partial sums
+        smull           v22.4s,  v16.4h,  v7.4h         // output 7 products
+        smull2          v23.4s,  v16.8h,  v7.8h
+        addp            v4.4s,   v20.4s,  v21.4s        // output 6: 4 partial sums
+        addp            v5.4s,   v22.4s,  v23.4s        // output 7: 4 partial sums
+
+        addp            v8.4s,   v8.4s,   v9.4s         // second level: 2 partial sums each for outputs 0, 1
+        addp            v0.4s,   v0.4s,   v1.4s         // outputs 2, 3
+        addp            v2.4s,   v2.4s,   v3.4s         // outputs 4, 5
+        addp            v4.4s,   v4.4s,   v5.4s         // outputs 6, 7
+
+        addp            v16.4s,  v8.4s,   v0.4s         // final sums for outputs 0-3
+        addp            v17.4s,  v2.4s,   v4.4s         // final sums for outputs 4-7
+
+        add             w5,  w5,  w8                    // advance the row start position by w8 for the next call
+
+        srshl           v16.4s,  v16.4s,  v14.4s // -(7 - intermediate_bits), v14 is set up by the caller
+        srshl           v17.4s,  v17.4s,  v14.4s // -(7 - intermediate_bits)
+
+        ret                                             // clobbers w12, w13, v0-v11, v18-v23
+endfunc
+// 8x8 warp: 15 horizontally filtered rows feed an 8-tap vertical filter; the t variant stores int16 minus PREP_BIAS.
+// void dav1d_warp_affine_8x8_16bpc_neon(
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *src, const ptrdiff_t src_stride,
+//         const int16_t *const abcd, int mx, int my,
+//         const int bitdepth_max)
+.macro warp t
+function warp_affine_8x8\t\()_16bpc_neon, export=1
+        stp             d8,  d9,  [sp, #-0x40]!    // spill d8-d15: v8-v11 and v13-v15 get overwritten
+        stp             d10, d11, [sp, #0x10]
+        stp             d12, d13, [sp, #0x20]
+        stp             d14, d15, [sp, #0x30]
+
+.ifb \t
+        dup             v15.8h,  w7        // bitdepth_max
+.else
+        movi            v15.8h,  #(PREP_BIAS >> 8), lsl #8 // v15 = PREP_BIAS in every lane
+.endif
+        clz             w7,  w7
+                                           // intermediate_bits = clz(bitdepth_max) - 18
+.ifb \t
+        sub             w8,  w7,  #11      // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
+.endif
+        sub             w7,  w7,  #25      // -(7 - intermediate_bits)
+.ifb \t
+        neg             w8,  w8            // -(7 + intermediate_bits)
+.endif
+        dup             v14.4s,  w7        // -(7 - intermediate_bits)
+.ifb \t
+        dup             v13.4s,  w8        // -(7 + intermediate_bits)
+.endif
+
+        ldr             x4,  [x4]                  // all four int16 abcd values in one load (little-endian lane order assumed)
+        sbfx            x7,  x4, #0,  #16          // x7 = abcd[0]: horizontal position step per output column
+        sbfx            x8,  x4, #16, #16          // x8 = abcd[1]: horizontal position step per row
+        sbfx            x9,  x4, #32, #16          // x9 = abcd[2]: vertical position step per output column
+        sbfx            x4,  x4, #48, #16          // x4 = abcd[3]: vertical position step per output row
+        mov             w10, #8                    // 8 output rows
+        sub             x2,  x2,  x3, lsl #1       // src -= 2 * src_stride
+        sub             x2,  x2,  x3               // src -= src_stride (3 rows up in total)
+        sub             x2,  x2,  #6               // src -= 3 pixels (6 bytes)
+        movrel          x11, X(mc_warp_filter), 64*8
+        mov             x15, x30                   // keep the return address in x15: bl below overwrites x30
+.ifnb \t
+        lsl             x1,  x1,  #1               // t variant: double the stride (presumably given in int16 units - confirm)
+.endif
+
+        bl              warp_filter_horz_neon      // source rows 0-6 prime the vertical window v24-v30
+        uzp1            v24.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
+        bl              warp_filter_horz_neon
+        uzp1            v25.8h,  v16.8h,  v17.8h // Ditto
+        bl              warp_filter_horz_neon
+        uzp1            v26.8h,  v16.8h,  v17.8h // Ditto
+        bl              warp_filter_horz_neon
+        uzp1            v27.8h,  v16.8h,  v17.8h // Ditto
+        bl              warp_filter_horz_neon
+        uzp1            v28.8h,  v16.8h,  v17.8h // Ditto
+        bl              warp_filter_horz_neon
+        uzp1            v29.8h,  v16.8h,  v17.8h // Ditto
+        bl              warp_filter_horz_neon
+        uzp1            v30.8h,  v16.8h,  v17.8h // Ditto
+
+1:      // one output row per iteration
+        add             w14, w6,  #512             // w14 = vertical position for this row, biased so >> 10 rounds
+        bl              warp_filter_horz_neon      // newest source row completes the 8-row window
+        uzp1            v31.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
+// Load one vertical filter per column; transpose_8x8b_xtl (util.S) presumably transposes and sign-extends so vN holds tap N.
+        load_filter_row d0, w14, w9
+        load_filter_row d1, w14, w9
+        load_filter_row d2, w14, w9
+        load_filter_row d3, w14, w9
+        load_filter_row d4, w14, w9
+        load_filter_row d5, w14, w9
+        load_filter_row d6, w14, w9
+        load_filter_row d7, w14, w9
+        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
+
+        // This ordering of smull/smlal/smull2/smlal2 is highly
+        // beneficial for Cortex A53 here.
+        smull           v16.4s,  v24.4h,  v0.4h    // columns 0-3: accumulate over the 8 rows v24-v31
+        smlal           v16.4s,  v25.4h,  v1.4h
+        smlal           v16.4s,  v26.4h,  v2.4h
+        smlal           v16.4s,  v27.4h,  v3.4h
+        smlal           v16.4s,  v28.4h,  v4.4h
+        smlal           v16.4s,  v29.4h,  v5.4h
+        smlal           v16.4s,  v30.4h,  v6.4h
+        smlal           v16.4s,  v31.4h,  v7.4h
+        smull2          v17.4s,  v24.8h,  v0.8h    // columns 4-7
+        smlal2          v17.4s,  v25.8h,  v1.8h
+        smlal2          v17.4s,  v26.8h,  v2.8h
+        smlal2          v17.4s,  v27.8h,  v3.8h
+        smlal2          v17.4s,  v28.8h,  v4.8h
+        smlal2          v17.4s,  v29.8h,  v5.8h
+        smlal2          v17.4s,  v30.8h,  v6.8h
+        smlal2          v17.4s,  v31.8h,  v7.8h
+
+        mov             v24.16b, v25.16b           // slide the row window up by one; the moves are interleaved with the output math
+        mov             v25.16b, v26.16b
+.ifb \t
+        srshl           v16.4s,  v16.4s,  v13.4s // -(7 + intermediate_bits)
+        srshl           v17.4s,  v17.4s,  v13.4s // -(7 + intermediate_bits)
+.else
+        rshrn           v16.4h,  v16.4s,  #7       // t variant: fixed rounding shift by 7, narrowed to 16 bit
+        rshrn2          v16.8h,  v17.4s,  #7
+.endif
+        mov             v26.16b, v27.16b
+.ifb \t
+        sqxtun          v16.4h,  v16.4s            // saturating narrow to unsigned 16 bit (negative values become 0)
+        sqxtun2         v16.8h,  v17.4s
+.else
+        sub             v16.8h,  v16.8h,  v15.8h // PREP_BIAS
+.endif
+        mov             v27.16b, v28.16b
+        mov             v28.16b, v29.16b
+.ifb \t
+        umin            v16.8h,  v16.8h,  v15.8h // bitdepth_max
+.endif
+        mov             v29.16b, v30.16b
+        mov             v30.16b, v31.16b
+        subs            w10, w10, #1               // row counter; st1 and add below leave the flags intact for b.gt
+        st1             {v16.8h}, [x0], x1         // store 8 outputs, then dst += stride
+
+        add             w6,  w6,  w4               // advance the vertical start position for the next row
+        b.gt            1b
+
+        ldp             d14, d15, [sp, #0x30]      // restore d8-d15
+        ldp             d12, d13, [sp, #0x20]
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d8,  d9,  [sp], 0x40       // and release the 0x40 byte frame
+
+        ret             x15                        // return through the address saved from x30
+endfunc
+.endm
+// Emit warp_affine_8x8_16bpc_neon (clamped pixel output) and warp_affine_8x8t_16bpc_neon (int16 output minus PREP_BIAS).
+warp
+warp t
+// Build a bw x bh block in dst from ref, replicating edge pixels wherever the block lies outside the iw x ih image.
+// void dav1d_emu_edge_16bpc_neon(
+//         const intptr_t bw, const intptr_t bh,
+//         const intptr_t iw, const intptr_t ih,
+//         const intptr_t x, const intptr_t y,
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_16bpc_neon, export=1
+        ldp             x8,  x9,  [sp]         // x8 = ref, x9 = ref_stride: 9th and 10th arguments come from the stack
+
+        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+        // ref += iclip(x, 0, iw - 1)
+        sub             x12, x3,  #1           // ih - 1
+        cmp             x5,  x3
+        sub             x13, x2,  #1           // iw - 1
+        csel            x12, x12, x5,  ge      // min(y, ih - 1)
+        cmp             x4,  x2
+        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+        csel            x13, x13, x4,  ge      // min(x, iw - 1)
+        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
+        add             x8,  x8,  x13, lsl #1  // ref += iclip()
+
+        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+        // top_ext = iclip(-y, 0, bh - 1)
+        add             x10, x5,  x1           // y + bh
+        neg             x5,  x5                // -y
+        sub             x10, x10, x3           // y + bh - ih
+        sub             x12, x1,  #1           // bh - 1
+        cmp             x10, x1
+        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
+        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
+        cmp             x5,  x1
+        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)
+
+        // right_ext = iclip(x + bw - iw, 0, bw - 1)
+        // left_ext = iclip(-x, 0, bw - 1)
+        add             x11, x4,  x0           // x + bw
+        neg             x4,  x4                // -x
+        sub             x11, x11, x2           // x + bw - iw
+        sub             x13, x0,  #1           // bw - 1
+        cmp             x11, x0
+        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
+        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
+        cmp             x4,  x0
+        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)
+
+        // center_h = bh - top_ext - bottom_ext
+        // dst += top_ext * PXSTRIDE(dst_stride)
+        // center_w = bw - left_ext - right_ext
+        sub             x1,  x1,  x5           // bh - top_ext
+        madd            x6,  x5,  x7,  x6      // dst += top_ext * dst_stride (first center row)
+        sub             x2,  x0,  x4           // bw - left_ext
+        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
+        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
+
+        mov             x14, x6                // backup of dst
+// NOTE(review): the loops below store 16 or 32 pixels at a time and can run past bw; presumably dst is padded - confirm.
+.macro v_loop need_left, need_right
+0:      // one center row per iteration
+.if \need_left
+        ld1r            {v0.8h}, [x8]          // broadcast the first pixel of the source row
+        mov             x12, x6                // out = dst
+        mov             x3,  x4                // x3 = left_ext pixels still to fill
+        mov             v1.16b,  v0.16b
+1:
+        subs            x3,  x3,  #16          // 16 pixels per store; any excess is overwritten by the center copy
+        st1             {v0.8h, v1.8h}, [x12], #32
+        b.gt            1b
+.endif
+        mov             x13, x8                // in = start of the source row
+        add             x12, x6,  x4, lsl #1   // out = dst + left_ext
+        mov             x3,  x2                // x3 = center_w pixels still to copy
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64 // copy 32 pixels per iteration
+        subs            x3,  x3,  #32
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
+        b.gt            1b
+.if \need_right
+        add             x3,  x8,  x2, lsl #1   // in + center_w
+        sub             x3,  x3,  #2           // in + center_w - 1
+        add             x12, x6,  x4, lsl #1   // dst + left_ext
+        ld1r            {v0.8h}, [x3]          // broadcast the last valid pixel of the row
+        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
+        mov             x3,  x11               // x3 = right_ext pixels still to fill
+        mov             v1.16b,  v0.16b
+1:
+        subs            x3,  x3,  #16          // 16 pixels per store
+        st1             {v0.8h, v1.8h}, [x12], #32
+        b.gt            1b
+.endif
+
+        subs            x1,  x1,  #1           // center_h--
+        add             x6,  x6,  x7           // dst += dst_stride
+        add             x8,  x8,  x9           // ref += ref_stride
+        b.gt            0b
+.endm
+
+        cbz             x4,  2f                // left_ext == 0: use a variant without the left fill
+        // need_left
+        cbz             x11, 3f                // right_ext == 0
+        // need_left + need_right
+        v_loop          1,   1
+        b               5f
+
+2:
+        // !need_left
+        cbz             x11, 4f                // right_ext == 0
+        // !need_left + need_right
+        v_loop          0,   1
+        b               5f
+
+3:
+        // need_left + !need_right
+        v_loop          1,   0
+        b               5f
+
+4:
+        // !need_left + !need_right
+        v_loop          0,   0
+
+5:      // x6 now points at the first row below the center area
+
+        cbz             x10, 3f                // bottom_ext == 0: nothing to add below
+        // need_bottom
+        sub             x8,  x6,  x7           // ref = dst - stride
+        mov             x4,  x0                // x4 = bw, columns still to process
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64 // 32 pixels of the last center row
+        mov             x3,  x10               // x3 = bottom_ext rows to write
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7  // replicate downwards, one row per store
+        b.gt            2b
+        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
+        subs            x4,  x4,  #32          // bw -= 32
+        add             x6,  x6,  #64          // dst += 32
+        b.gt            1b
+
+3:
+        cbz             x5,  3f                // top_ext == 0: done
+        // need_top
+        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64 // 32 pixels of the first center row
+        mov             x3,  x5                // x3 = top_ext rows to write
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7  // replicate into the rows above the center
+        b.gt            2b
+        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
+        subs            x0,  x0,  #32          // bw -= 32
+        add             x6,  x6,  #64          // dst += 32
+        b.gt            1b
+
+3:
+        ret
+endfunc