From d8bbc7858622b6d9c278469aab701ca0b609cddf Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 15 May 2024 05:35:49 +0200 Subject: Merging upstream version 126.0. Signed-off-by: Daniel Baumann --- third_party/dav1d/src/arm/64/itx.S | 99 ++++----- third_party/dav1d/src/arm/64/itx16.S | 21 +- third_party/dav1d/src/arm/64/mc.S | 411 ++++++++++++++++++++++++++--------- third_party/dav1d/src/arm/64/mc16.S | 373 +++++++++++++++++++++---------- third_party/dav1d/src/arm/64/msac.S | 167 +++++++------- third_party/dav1d/src/arm/64/util.S | 49 +++++ 6 files changed, 770 insertions(+), 350 deletions(-) (limited to 'third_party/dav1d/src/arm/64') diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S index 53490cd677..7063cbde1d 100644 --- a/third_party/dav1d/src/arm/64/itx.S +++ b/third_party/dav1d/src/arm/64/itx.S @@ -879,6 +879,8 @@ function inv_txfm_\variant\()add_8x8_neon .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out + + b L(itx_8x8_epilog) .else blr x4 @@ -890,19 +892,20 @@ function inv_txfm_\variant\()add_8x8_neon srshr v21.8h, v21.8h, #1 srshr v22.8h, v22.8h, #1 srshr v23.8h, v23.8h, #1 -.endif +L(itx_8x8_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 blr x5 load_add_store_8x8 x0, x7 ret x15 +.endif endfunc .endm -def_fn_8x8_base def_fn_8x8_base identity_ +def_fn_8x8_base .macro def_fn_8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 @@ -1390,14 +1393,16 @@ function inv_txfm_horz\suffix\()_16x8_neon .endif .if \identity identity_8x16_shift2 v0.h[0] + b L(horz_16x8_epilog) .else blr x4 -.endif -.if \shift > 0 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h srshr \i, \i, #\shift .endr -.endif +.if \shift == 1 + b L(horz_16x8_epilog) +.else +L(horz_16x8_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 @@ -1406,12 +1411,14 @@ function inv_txfm_horz\suffix\()_16x8_neon .endr ret x14 +.endif +.endif endfunc .endm -def_horz_16 scale=0, identity=0, shift=2 def_horz_16 scale=1, identity=0, shift=1, suffix=_scale def_horz_16 scale=0, identity=1, shift=0, suffix=_identity +def_horz_16 scale=0, identity=0, shift=2 function inv_txfm_add_vert_8x16_neon mov x14, x30 @@ -1512,6 +1519,8 @@ function inv_txfm_\variant\()add_16x4_neon .endr identity_8x16_shift1 v0.h[0] + + b L(itx_16x4_epilog) .else .irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h ld1 {\i}, [x2] @@ -1527,33 +1536,29 @@ function inv_txfm_\variant\()add_16x4_neon .irp i, v16.8h, v17.8h, v18.8h, v19.8h srshr \i, \i, #1 .endr -.endif - transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 - blr x5 - mov x6, x0 - load_add_store_8x4 x6, x7 -.ifc \variant, identity_ - mov v16.16b, v20.16b - mov v17.16b, v21.16b - mov v18.16b, v22.16b - mov v19.16b, v23.16b -.else ins v24.d[1], v28.d[0] ins v25.d[1], v29.d[0] ins v26.d[1], v30.d[0] ins v27.d[1], v31.d[0] - srshr v16.8h, v24.8h, #1 - srshr v17.8h, v25.8h, #1 - srshr v18.8h, v26.8h, #1 - srshr v19.8h, v27.8h, #1 -.endif + srshr v20.8h, v24.8h, #1 + srshr v21.8h, v25.8h, #1 + srshr v22.8h, v26.8h, #1 + srshr v23.8h, v27.8h, #1 + +L(itx_16x4_epilog): transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 blr x5 + mov x6, x0 + load_add_store_8x4 x6, x7 + + transpose_4x8h_mov v20, v21, v22, v23, v2, v3, v4, v5, v16, v17, v18, v19 + blr 
x5 add x6, x0, #8 load_add_store_8x4 x6, x7 ret x15 +.endif endfunc function inv_txfm_\variant\()add_4x16_neon @@ -1605,12 +1610,14 @@ function inv_txfm_\variant\()add_4x16_neon mov w16, #(5793-4096)*8 dup v0.4h, w16 identity_8x4_shift1 v16, v17, v18, v19, v0.h[0] + + b L(itx_4x16_epilog) .else blr x4 .irp i, v16.8h, v17.8h, v18.8h, v19.8h srshr \i, \i, #1 .endr -.endif +L(itx_4x16_epilog): transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 ins v20.d[0], v16.d[1] ins v21.d[0], v17.d[1] @@ -1622,11 +1629,12 @@ function inv_txfm_\variant\()add_4x16_neon load_add_store_4x16 x0, x6 ret x15 +.endif endfunc .endm -def_fn_416_base def_fn_416_base identity_ +def_fn_416_base .macro def_fn_416 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 @@ -1634,11 +1642,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 idct_dc \w, \h, 1 .endif .if \w == 4 +.ifnc \txfm1, identity adr x4, inv_\txfm1\()_8h_x\w\()_neon +.endif adr x5, inv_\txfm2\()_4h_x\h\()_neon mov w13, #\eob_half .else +.ifnc \txfm1, identity adr x4, inv_\txfm1\()_4h_x\w\()_neon +.endif adr x5, inv_\txfm2\()_8h_x\h\()_neon .endif .ifc \txfm1, identity @@ -1690,13 +1702,16 @@ function inv_txfm_\variant\()add_16x8_neon mov w16, #2*(5793-4096)*8 dup v0.4h, w16 identity_8x16_shift1 v0.h[0] + + b L(itx_16x8_epilog) .else blr x4 -.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h srshr \i, \i, #1 .endr -.endif + +L(itx_16x8_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 blr x5 @@ -1704,27 +1719,7 @@ function inv_txfm_\variant\()add_16x8_neon mov x6, x0 load_add_store_8x8 x6, x7 -.ifc \variant, identity_ - mov v16.16b, v24.16b - mov v17.16b, v25.16b - mov v18.16b, v26.16b - mov v19.16b, v27.16b - mov v20.16b, v28.16b - mov v21.16b, v29.16b - mov v22.16b, v30.16b - mov v23.16b, v31.16b -.else - srshr v16.8h, v24.8h, #1 - srshr v17.8h, v25.8h, #1 - srshr v18.8h, v26.8h, #1 - srshr v19.8h, v27.8h, #1 - srshr v20.8h, v28.8h, #1 - srshr v21.8h, v29.8h, #1 - srshr v22.8h, v30.8h, #1 - srshr v23.8h, v31.8h, #1 -.endif - - transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8h_mov v24, v25, v26, v27, v28, v29, v30, v31, v2, v3, v16, v17, v18, v19, v20, v21, v22, v23 blr x5 @@ -1732,6 +1727,7 @@ function inv_txfm_\variant\()add_16x8_neon load_add_store_8x8 x0, x7 ret x15 +.endif endfunc function inv_txfm_\variant\()add_8x16_neon @@ -1790,14 +1786,16 @@ function inv_txfm_\variant\()add_8x16_neon scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out + + b L(itx_8x16_epilog) .else blr x4 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h srshr \i, \i, #1 .endr -.endif +L(itx_8x16_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 blr x5 @@ -1805,18 +1803,21 @@ function inv_txfm_\variant\()add_8x16_neon load_add_store_8x16 x0, x6 ret x15 +.endif endfunc .endm -def_fn_816_base def_fn_816_base identity_ +def_fn_816_base .macro def_fn_816 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif +.ifnc \txfm1, identity adr x4, inv_\txfm1\()_8h_x\w\()_neon +.endif adr x5, inv_\txfm2\()_8h_x\h\()_neon .if \w == 8 mov x13, #\eob_half diff --git 
a/third_party/dav1d/src/arm/64/itx16.S b/third_party/dav1d/src/arm/64/itx16.S index eee3a9636d..31ee9be1b4 100644 --- a/third_party/dav1d/src/arm/64/itx16.S +++ b/third_party/dav1d/src/arm/64/itx16.S @@ -514,13 +514,17 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 b L(itx_4x4_end) endfunc +// HBD inv_txfm_add_4x4_neon deviates from the common pattern with registers +// x0-x4 external parameters +// x5 function pointer to first transform +// x6 function pointer to second transform function inv_txfm_add_4x4_neon movi v30.4s, #0 movi v31.4s, #0 ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] st1 {v30.4s, v31.4s}, [x2], #32 - blr x4 + blr x5 st1 {v30.4s, v31.4s}, [x2], #32 sqxtn v16.4h, v16.4s @@ -529,7 +533,7 @@ function inv_txfm_add_4x4_neon sqxtn v19.4h, v19.4s transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 - blr x5 + blr x6 ld1 {v0.d}[0], [x0], x1 ld1 {v0.d}[1], [x0], x1 @@ -541,7 +545,7 @@ function inv_txfm_add_4x4_neon srshr v18.8h, v18.8h, #4 L(itx_4x4_end): - mvni v31.8h, #0xfc, lsl #8 // 0x3ff + dup v31.8h, w4 sub x0, x0, x1, lsl #2 usqadd v0.8h, v16.8h usqadd v1.8h, v18.8h @@ -579,8 +583,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 b L(itx_4x4_end) 1: .endif - adr x4, inv_\txfm1\()_4s_x4_neon - movrel x5, X(inv_\txfm2\()_4h_x4_neon) + adr x5, inv_\txfm1\()_4s_x4_neon + movrel x6, X(inv_\txfm2\()_4h_x4_neon) b inv_txfm_add_4x4_neon endfunc .endm @@ -1381,6 +1385,10 @@ function inv_txfm_horz\suffix\()_16x4_neon sqrshrn2 v21.8h, v29.4s, #\shift sqrshrn2 v22.8h, v30.4s, #\shift sqrshrn2 v23.8h, v31.4s, #\shift +.if \scale + b L(horz_16x4_epilog) +.else +L(horz_16x4_epilog): transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7 @@ -1389,11 +1397,12 @@ function inv_txfm_horz\suffix\()_16x4_neon .endr ret x14 +.endif endfunc .endm -def_horz_16 scale=0, shift=2 def_horz_16 scale=1, shift=1, suffix=_scale +def_horz_16 scale=0, shift=2 function inv_txfm_add_vert_8x16_neon mov x14, x30 diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S index 9f7b4e7a89..3df0393c3a 100644 --- a/third_party/dav1d/src/arm/64/mc.S +++ b/third_party/dav1d/src/arm/64/mc.S @@ -1154,7 +1154,7 @@ endfunc uxtl \r6\().8h, \r6\().8b .endif .endm -.macro mul_mla_4 d, s0, s1, s2, s3, wd +.macro mul_mla_4tap d, s0, s1, s2, s3, wd mul \d\wd, \s0\wd, v0.h[0] mla \d\wd, \s1\wd, v0.h[1] mla \d\wd, \s2\wd, v0.h[2] @@ -1163,7 +1163,51 @@ endfunc // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
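An orientation note for the mul_mla_*tap macros defined just below: each one computes a dot product of neighbouring samples with the subpel filter taps, and the 6tap variants simply skip the outermost two taps (v0.h[0] and v0.h[7]), which are zero for AV1's regular and smooth filters — that is the whole point of this patch's 6-tap fast paths. A minimal C model of what one output lane computes (illustrative names, not dav1d's API):

    #include <stdint.h>

    /* One lane of a mul_mla_*tap chain: wrapping 16-bit multiply-
     * accumulate, matching NEON mul/mla on .8h lanes. */
    static int16_t mul_mla_ntap(const int16_t *s, const int16_t *taps, int n)
    {
        int16_t d = 0;
        for (int i = 0; i < n; i++)          /* mul for i == 0, mla after */
            d = (int16_t)(d + (int16_t)(s[i] * taps[i]));
        return d;
    }

In these terms the 6-tap case is a call with s + 1, taps + 1 and n = 6, which is exactly how the 6tap macros index v0.h[1]..v0.h[6].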
-.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 +.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 + mul \d0\().4h, \s1\().4h, v0.h[1] + mla \d0\().4h, \s2\().4h, v0.h[2] + mla \d0\().4h, \s3\().4h, v0.h[3] + mla \d0\().4h, \s4\().4h, v0.h[4] + mla \d0\().4h, \s5\().4h, v0.h[5] + mla \d0\().4h, \s6\().4h, v0.h[6] +.endm +.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 + mul \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] +.endm +.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + mul \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mul \d1\().8h, \s2\().8h, v0.h[1] + mla \d1\().8h, \s3\().8h, v0.h[2] + mla \d1\().8h, \s4\().8h, v0.h[3] + mla \d1\().8h, \s5\().8h, v0.h[4] + mla \d1\().8h, \s6\().8h, v0.h[5] + mla \d1\().8h, \s7\().8h, v0.h[6] +.endm +.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 + mul \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mul \d1\().8h, \s3\().8h, v0.h[1] + mla \d1\().8h, \s4\().8h, v0.h[2] + mla \d1\().8h, \s5\().8h, v0.h[3] + mla \d1\().8h, \s6\().8h, v0.h[4] + mla \d1\().8h, \s7\().8h, v0.h[5] + mla \d1\().8h, \s8\().8h, v0.h[6] +.endm +.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().4h, \s0\().4h, v0.h[0] mla \d0\().4h, \s1\().4h, v0.h[1] mla \d0\().4h, \s2\().4h, v0.h[2] @@ -1173,7 +1217,7 @@ endfunc mla \d0\().4h, \s6\().4h, v0.h[6] mla \d0\().4h, \s7\().4h, v0.h[7] .endm -.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 +.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] @@ -1183,7 +1227,7 @@ endfunc mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s7\().8h, v0.h[7] .endm -.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 +.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] @@ -1201,7 +1245,7 @@ endfunc mla \d1\().8h, \s7\().8h, v0.h[6] mla \d1\().8h, \s8\().8h, v0.h[7] .endm -.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 +.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] @@ -1315,11 +1359,11 @@ endfunc .endif .endm -.macro make_8tap_fn op, type, type_h, type_v +.macro make_8tap_fn op, type, type_h, type_v, taps function \op\()_8tap_\type\()_8bpc_neon, export=1 mov x8, \type_h mov x9, \type_v - b \op\()_8tap_neon + b \op\()_\taps\()_neon endfunc .endm @@ -1328,18 +1372,8 @@ endfunc #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) -.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv -make_8tap_fn \type, regular, REGULAR, REGULAR -make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH -make_8tap_fn \type, regular_sharp, REGULAR, SHARP -make_8tap_fn \type, smooth, SMOOTH, SMOOTH -make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR -make_8tap_fn \type, smooth_sharp, 
SMOOTH, SHARP -make_8tap_fn \type, sharp, SHARP, SHARP -make_8tap_fn \type, sharp_regular, SHARP, REGULAR -make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH - -function \type\()_8tap_neon +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps +function \type\()_\taps\()_neon mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, w10 mul \my, \my, w10 @@ -1354,12 +1388,12 @@ function \type\()_8tap_neon tst \mx, #(0x7f << 14) sub w8, w8, #24 movrel x10, X(mc_subpel_filters), -8 - b.ne L(\type\()_8tap_h) + b.ne L(\type\()_\taps\()_h) tst \my, #(0x7f << 14) - b.ne L(\type\()_8tap_v) + b.ne L(\type\()_\taps\()_v) b \type\()_neon -L(\type\()_8tap_h): +L(\type\()_\taps\()_h): cmp \w, #4 ubfx w9, \mx, #7, #7 and \mx, \mx, #0x7f @@ -1368,9 +1402,9 @@ L(\type\()_8tap_h): 4: tst \my, #(0x7f << 14) add \xmx, x10, \mx, uxtw #3 - b.ne L(\type\()_8tap_hv) + b.ne L(\type\()_\taps\()_hv) - adr x9, L(\type\()_8tap_h_tbl) + adr x9, L(\type\()_\taps\()_h_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 @@ -1471,6 +1505,18 @@ L(\type\()_8tap_h): uxtl v20.8h, v20.8b uxtl v21.8h, v21.8b +.ifc \taps, 6tap + ext v19.16b, v16.16b, v17.16b, #2 + ext v23.16b, v20.16b, v21.16b, #2 + mul v18.8h, v19.8h, v0.h[1] + mul v22.8h, v23.8h, v0.h[1] +.irpc i, 23456 + ext v19.16b, v16.16b, v17.16b, #(2*\i) + ext v23.16b, v20.16b, v21.16b, #(2*\i) + mla v18.8h, v19.8h, v0.h[\i] + mla v22.8h, v23.8h, v0.h[\i] +.endr +.else // 8tap mul v18.8h, v16.8h, v0.h[0] mul v22.8h, v20.8h, v0.h[0] .irpc i, 1234567 @@ -1479,6 +1525,7 @@ L(\type\()_8tap_h): mla v18.8h, v19.8h, v0.h[\i] mla v22.8h, v23.8h, v0.h[\i] .endr +.endif subs \h, \h, #2 srshr v18.8h, v18.8h, #2 srshr v22.8h, v22.8h, #2 @@ -1523,6 +1570,26 @@ L(\type\()_8tap_h): uxtl v22.8h, v22.8b 16: +.ifc \taps, 6tap + ext v28.16b, v16.16b, v17.16b, #2 + ext v29.16b, v17.16b, v18.16b, #2 + ext v30.16b, v20.16b, v21.16b, #2 + ext v31.16b, v21.16b, v22.16b, #2 + mul v24.8h, v28.8h, v0.h[1] + mul v25.8h, v29.8h, v0.h[1] + mul v26.8h, v30.8h, v0.h[1] + mul v27.8h, v31.8h, v0.h[1] +.irpc i, 23456 + ext v28.16b, v16.16b, v17.16b, #(2*\i) + ext v29.16b, v17.16b, v18.16b, #(2*\i) + ext v30.16b, v20.16b, v21.16b, #(2*\i) + ext v31.16b, v21.16b, v22.16b, #(2*\i) + mla v24.8h, v28.8h, v0.h[\i] + mla v25.8h, v29.8h, v0.h[\i] + mla v26.8h, v30.8h, v0.h[\i] + mla v27.8h, v31.8h, v0.h[\i] +.endr +.else // 8tap mul v24.8h, v16.8h, v0.h[0] mul v25.8h, v17.8h, v0.h[0] mul v26.8h, v20.8h, v0.h[0] @@ -1537,6 +1604,7 @@ L(\type\()_8tap_h): mla v26.8h, v30.8h, v0.h[\i] mla v27.8h, v31.8h, v0.h[\i] .endr +.endif srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 srshr v26.8h, v26.8h, #2 @@ -1575,18 +1643,18 @@ L(\type\()_8tap_h): b.gt 161b ret -L(\type\()_8tap_h_tbl): - .hword L(\type\()_8tap_h_tbl) - 1280b - .hword L(\type\()_8tap_h_tbl) - 640b - .hword L(\type\()_8tap_h_tbl) - 320b - .hword L(\type\()_8tap_h_tbl) - 160b - .hword L(\type\()_8tap_h_tbl) - 80b - .hword L(\type\()_8tap_h_tbl) - 40b - .hword L(\type\()_8tap_h_tbl) - 20b +L(\type\()_\taps\()_h_tbl): + .hword L(\type\()_\taps\()_h_tbl) - 1280b + .hword L(\type\()_\taps\()_h_tbl) - 640b + .hword L(\type\()_\taps\()_h_tbl) - 320b + .hword L(\type\()_\taps\()_h_tbl) - 160b + .hword L(\type\()_\taps\()_h_tbl) - 80b + .hword L(\type\()_\taps\()_h_tbl) - 40b + .hword L(\type\()_\taps\()_h_tbl) - 20b .hword 0 -L(\type\()_8tap_v): +L(\type\()_\taps\()_v): cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f @@ -1595,7 +1663,7 @@ L(\type\()_8tap_v): 4: add \xmy, x10, \my, uxtw #3 - adr x9, 
L(\type\()_8tap_v_tbl) + adr x9, L(\type\()_\taps\()_v_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 @@ -1620,7 +1688,7 @@ L(\type\()_8tap_v): interleave_1_h v1, v2, v3, v4, v5 b.gt 24f uxtl_b v1, v2, v3, v4 - mul_mla_4 v6, v1, v2, v3, v4, .4h + mul_mla_4tap v6, v1, v2, v3, v4, .4h sqrshrun_b 6, v6 st_h \d_strd, v6, 2 ret @@ -1630,7 +1698,7 @@ L(\type\()_8tap_v): interleave_1_h v5, v6, v7 interleave_2_s v1, v2, v3, v4, v5, v6 uxtl_b v1, v2, v3, v4 - mul_mla_4 v6, v1, v2, v3, v4, .8h + mul_mla_4tap v6, v1, v2, v3, v4, .8h sqrshrun_b 6, v6 st_h \d_strd, v6, 4 ret @@ -1655,7 +1723,7 @@ L(\type\()_8tap_v): interleave_1_h v7, v16, v17, v18, v19 interleave_2_s v5, v6, v7, v16, v17, v18 uxtl_b v5, v6, v7, v16 - mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 + mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 4 b.le 0f @@ -1673,7 +1741,7 @@ L(\type\()_8tap_v): load_h \sr2, \src, \s_strd, v16, v17 interleave_1_h v7, v16, v17 uxtl_b v5, v6, v7, v16 - mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 + mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 2 0: @@ -1698,13 +1766,13 @@ L(\type\()_8tap_v): load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4 - mul_mla_4 v6, v1, v2, v3, v4, .8h + mul_mla_4tap v6, v1, v2, v3, v4, .8h shift_store_4 \type, \d_strd, v6 b.le 0f load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 uxtl_b v5, v6 - mul_mla_4 v7, v3, v4, v5, v6, .8h + mul_mla_4tap v7, v3, v4, v5, v6, .8h shift_store_4 \type, \d_strd, v7 0: ret @@ -1729,28 +1797,28 @@ L(\type\()_8tap_v): load_s \sr2, \src, \s_strd, v23, v24, v25, v26 interleave_1_s v22, v23, v24, v25, v26 uxtl_b v22, v23, v24, v25 - mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 shift_store_4 \type, \d_strd, v1, v2 b.le 0f load_s \sr2, \src, \s_strd, v27, v16 subs \h, \h, #2 interleave_1_s v26, v27, v16 uxtl_b v26, v27 - mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 + mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 shift_store_4 \type, \d_strd, v1 b.le 0f load_s \sr2, \src, \s_strd, v17, v18 subs \h, \h, #2 interleave_1_s v16, v17, v18 uxtl_b v16, v17 - mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 + mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 shift_store_4 \type, \d_strd, v2 b.le 0f subs \h, \h, #4 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 interleave_1_s v18, v19, v20, v21, v22 uxtl_b v18, v19, v20, v21 - mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 + mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 shift_store_4 \type, \d_strd, v1, v2 b.gt 48b 0: @@ -1773,14 +1841,14 @@ L(\type\()_8tap_v): load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4, v5 - mul_mla_4 v6, v1, v2, v3, v4, .8h - mul_mla_4 v7, v2, v3, v4, v5, .8h + mul_mla_4tap v6, v1, v2, v3, v4, .8h + mul_mla_4tap v7, v2, v3, v4, v5, .8h shift_store_8 \type, \d_strd, v6, v7 b.le 0f load_8b \sr2, \src, \s_strd, v6, v7 uxtl_b v6, v7 - mul_mla_4 v1, v3, v4, v5, v6, .8h - mul_mla_4 v2, v4, v5, v6, v7, .8h + mul_mla_4tap v1, v3, v4, v5, v6, .8h + mul_mla_4tap v2, v4, v5, v6, v7, .8h shift_store_8 \type, \d_strd, v1, v2 0: ret @@ -1809,32 +1877,32 @@ L(\type\()_8tap_v): subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v23, v24 uxtl_b v23, v24 - mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 + 
mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v25, v26 uxtl_b v25, v26 - mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 + mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v27, v16 uxtl_b v27, v16 - mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 + mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v17, v18 uxtl_b v17, v18 - mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 + mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #4 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 uxtl_b v19, v20, v21, v22 - mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 - mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 + mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 + mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.gt 88b 9: @@ -1882,10 +1950,10 @@ L(\type\()_8tap_v): uxtl2 v25.8h, v3.16b uxtl2 v26.8h, v4.16b uxtl2 v27.8h, v5.16b - mul_mla_4 v1, v16, v17, v18, v19, .8h - mul_mla_4 v16, v17, v18, v19, v20, .8h - mul_mla_4 v2, v23, v24, v25, v26, .8h - mul_mla_4 v17, v24, v25, v26, v27, .8h + mul_mla_4tap v1, v16, v17, v18, v19, .8h + mul_mla_4tap v16, v17, v18, v19, v20, .8h + mul_mla_4tap v2, v23, v24, v25, v26, .8h + mul_mla_4tap v17, v24, v25, v26, v27, .8h shift_store_16 \type, \d_strd, v1, v2, v16, v17 b.le 0f load_16b \sr2, \src, \s_strd, v6, v7 @@ -1893,25 +1961,25 @@ L(\type\()_8tap_v): uxtl v22.8h, v7.8b uxtl2 v28.8h, v6.16b uxtl2 v29.8h, v7.16b - mul_mla_4 v1, v18, v19, v20, v21, .8h - mul_mla_4 v3, v19, v20, v21, v22, .8h - mul_mla_4 v2, v25, v26, v27, v28, .8h - mul_mla_4 v4, v26, v27, v28, v29, .8h + mul_mla_4tap v1, v18, v19, v20, v21, .8h + mul_mla_4tap v3, v19, v20, v21, v22, .8h + mul_mla_4tap v2, v25, v26, v27, v28, .8h + mul_mla_4tap v4, v26, v27, v28, v29, .8h shift_store_16 \type, \d_strd, v1, v2, v3, v4 0: ret -L(\type\()_8tap_v_tbl): - .hword L(\type\()_8tap_v_tbl) - 1280b - .hword L(\type\()_8tap_v_tbl) - 640b - .hword L(\type\()_8tap_v_tbl) - 320b - .hword L(\type\()_8tap_v_tbl) - 160b - .hword L(\type\()_8tap_v_tbl) - 80b - .hword L(\type\()_8tap_v_tbl) - 40b - .hword L(\type\()_8tap_v_tbl) - 20b +L(\type\()_\taps\()_v_tbl): + .hword L(\type\()_\taps\()_v_tbl) - 1280b + .hword L(\type\()_\taps\()_v_tbl) - 640b + .hword L(\type\()_\taps\()_v_tbl) - 320b + .hword L(\type\()_\taps\()_v_tbl) - 160b + .hword L(\type\()_\taps\()_v_tbl) - 80b + .hword L(\type\()_\taps\()_v_tbl) - 40b + .hword L(\type\()_\taps\()_v_tbl) - 20b .hword 0 -L(\type\()_8tap_hv): +L(\type\()_\taps\()_hv): cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f @@ -1920,7 +1988,7 @@ L(\type\()_8tap_hv): 4: add \xmy, x10, \my, uxtw #3 - adr x9, L(\type\()_8tap_hv_tbl) + adr x9, L(\type\()_\taps\()_hv_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 @@ -1952,13 +2020,13 @@ L(\type\()_8tap_hv): addp v28.4h, v28.4h, v29.4h addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b 2: - bl L(\type\()_8tap_filter_2) + bl 
L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v28.8b, #4 smull v2.4s, v16.4h, v1.h[0] @@ -1997,19 +2065,27 @@ L(\type\()_8tap_hv): addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v28.8b, #4 mov v19.8b, v28.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v20.8b, v19.8b, v28.8b, #4 mov v21.8b, v28.8b 28: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v22.8b, v21.8b, v28.8b, #4 +.ifc \taps, 6tap + smull v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -2018,6 +2094,7 @@ L(\type\()_8tap_hv): smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal v2.4s, v28.4h, v1.h[7] +.endif sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h @@ -2036,7 +2113,7 @@ L(\type\()_8tap_hv): 0: ret x15 -L(\type\()_8tap_filter_2): +L(\type\()_\taps\()_filter_2): ld1 {v28.8b}, [\sr2], \s_strd ld1 {v30.8b}, [\src], \s_strd uxtl v28.8h, v28.8b @@ -2083,12 +2160,12 @@ L(\type\()_8tap_filter_2): mla v31.4h, v30.4h, v0.h[3] srshr v16.4h, v31.4h, #2 - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b 4: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. @@ -2121,8 +2198,13 @@ L(\type\()_8tap_filter_2): 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #1 +.ifc \taps, 6tap + sub \sr2, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 +.else sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd +.endif add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 @@ -2139,20 +2221,38 @@ L(\type\()_8tap_filter_2): mla v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] +.ifc \taps, 6tap + srshr v18.4h, v31.4h, #2 +.else srshr v16.4h, v31.4h, #2 - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b - bl L(\type\()_8tap_filter_4) +.endif + bl L(\type\()_\taps\()_filter_4) mov v19.8b, v28.8b mov v20.8b, v29.8b - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v21.8b, v28.8b mov v22.8b, v29.8b 48: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) +.ifc \taps, 6tap + smull v2.4s, v18.4h, v1.h[1] + smlal v2.4s, v19.4h, v1.h[2] + smlal v2.4s, v20.4h, v1.h[3] + smlal v2.4s, v21.4h, v1.h[4] + smlal v2.4s, v22.4h, v1.h[5] + smlal v2.4s, v28.4h, v1.h[6] + smull v3.4s, v19.4h, v1.h[1] + smlal v3.4s, v20.4h, v1.h[2] + smlal v3.4s, v21.4h, v1.h[3] + smlal v3.4s, v22.4h, v1.h[4] + smlal v3.4s, v28.4h, v1.h[5] + smlal v3.4s, v29.4h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -2169,6 +2269,7 @@ L(\type\()_8tap_filter_2): smlal v3.4s, v22.4h, v1.h[5] smlal v3.4s, v28.4h, v1.h[6] smlal v3.4s, v29.4h, v1.h[7] +.endif sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn v3.4h, v3.4s, #\shift_hv subs \h, \h, #2 @@ -2182,8 +2283,10 @@ L(\type\()_8tap_filter_2): st1 {v3.4h}, [\ds2], \d_strd .endif b.le 0f +.ifc \taps, 8tap mov v16.8b, v18.8b mov v17.8b, 
v19.8b +.endif mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b @@ -2193,7 +2296,7 @@ L(\type\()_8tap_filter_2): 0: ret x15 -L(\type\()_8tap_filter_4): +L(\type\()_\taps\()_filter_4): ld1 {v26.8b}, [\sr2], \s_strd ld1 {v27.8b}, [\src], \s_strd uxtl v26.8h, v26.8b @@ -2237,15 +2340,15 @@ L(\type\()_8tap_filter_4): lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 - bl L(\type\()_8tap_filter_8_first) - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8_first) + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2303,7 +2406,9 @@ L(\type\()_8tap_filter_4): ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #3 +.ifc \taps, 8tap sub \src, \src, \s_strd +.endif sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b @@ -2316,21 +2421,52 @@ L(\type\()_8tap_filter_4): lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 - bl L(\type\()_8tap_filter_8_first) - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8_first) +.ifc \taps, 6tap + mov v18.16b, v16.16b +.else + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b - bl L(\type\()_8tap_filter_8) +.endif + bl L(\type\()_\taps\()_filter_8) mov v19.16b, v24.16b mov v20.16b, v25.16b - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v21.16b, v24.16b mov v22.16b, v25.16b 88: +.ifc \taps, 6tap + smull v2.4s, v18.4h, v1.h[1] + smull2 v3.4s, v18.8h, v1.h[1] + bl L(\type\()_\taps\()_filter_8) + smull v4.4s, v19.4h, v1.h[1] + smull2 v5.4s, v19.8h, v1.h[1] + smlal v2.4s, v19.4h, v1.h[2] + smlal2 v3.4s, v19.8h, v1.h[2] + smlal v4.4s, v20.4h, v1.h[2] + smlal2 v5.4s, v20.8h, v1.h[2] + smlal v2.4s, v20.4h, v1.h[3] + smlal2 v3.4s, v20.8h, v1.h[3] + smlal v4.4s, v21.4h, v1.h[3] + smlal2 v5.4s, v21.8h, v1.h[3] + smlal v2.4s, v21.4h, v1.h[4] + smlal2 v3.4s, v21.8h, v1.h[4] + smlal v4.4s, v22.4h, v1.h[4] + smlal2 v5.4s, v22.8h, v1.h[4] + smlal v2.4s, v22.4h, v1.h[5] + smlal2 v3.4s, v22.8h, v1.h[5] + smlal v4.4s, v24.4h, v1.h[5] + smlal2 v5.4s, v24.8h, v1.h[5] + smlal v2.4s, v24.4h, v1.h[6] + smlal2 v3.4s, v24.8h, v1.h[6] + smlal v4.4s, v25.4h, v1.h[6] + smlal2 v5.4s, v25.8h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2361,6 +2497,7 @@ L(\type\()_8tap_filter_4): smlal2 v3.4s, v24.8h, v1.h[7] smlal v4.4s, v25.4h, v1.h[7] smlal2 v5.4s, v25.8h, v1.h[7] +.endif sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn2 v2.8h, v3.4s, #\shift_hv sqrshrn v4.4h, v4.4s, #\shift_hv @@ -2376,8 +2513,10 @@ L(\type\()_8tap_filter_4): st1 {v4.8h}, [\ds2], \d_strd .endif b.le 9f +.ifc \taps, 8tap mov v16.16b, v18.16b mov v17.16b, v19.16b +.endif mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b @@ -2398,15 +2537,33 @@ L(\type\()_8tap_filter_4): add \dst, \dst, #8 .else add \dst, \dst, #16 +.endif +.ifc \taps, 6tap + add \src, \src, \s_strd, lsl #1 .endif b 168b 0: ret x15 -L(\type\()_8tap_filter_8_first): +L(\type\()_\taps\()_filter_8_first): ld1 {v28.8b, v29.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b +.ifc \taps, 6tap + ext v24.16b, v28.16b, v29.16b, #(2*1) + ext v25.16b, v28.16b, v29.16b, #(2*2) + ext v26.16b, v28.16b, v29.16b, #(2*3) + ext 
v27.16b, v28.16b, v29.16b, #(2*4) + mul v16.8h, v24.8h, v0.h[1] + mla v16.8h, v25.8h, v0.h[2] + mla v16.8h, v26.8h, v0.h[3] + mla v16.8h, v27.8h, v0.h[4] + ext v24.16b, v28.16b, v29.16b, #(2*5) + ext v25.16b, v28.16b, v29.16b, #(2*6) + ext v26.16b, v28.16b, v29.16b, #(2*7) + mla v16.8h, v24.8h, v0.h[5] + mla v16.8h, v25.8h, v0.h[6] +.else // 8tap mul v16.8h, v28.8h, v0.h[0] ext v24.16b, v28.16b, v29.16b, #(2*1) ext v25.16b, v28.16b, v29.16b, #(2*2) @@ -2422,16 +2579,29 @@ L(\type\()_8tap_filter_8_first): mla v16.8h, v24.8h, v0.h[5] mla v16.8h, v25.8h, v0.h[6] mla v16.8h, v26.8h, v0.h[7] +.endif srshr v16.8h, v16.8h, #2 ret -L(\type\()_8tap_filter_8): +L(\type\()_\taps\()_filter_8): ld1 {v28.8b, v29.8b}, [\sr2], \s_strd ld1 {v30.8b, v31.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b uxtl v30.8h, v30.8b uxtl v31.8h, v31.8b +.ifc \taps, 6tap + ext v26.16b, v28.16b, v29.16b, #2 + ext v27.16b, v30.16b, v31.16b, #2 + mul v24.8h, v26.8h, v0.h[1] + mul v25.8h, v27.8h, v0.h[1] +.irpc i, 23456 + ext v26.16b, v28.16b, v29.16b, #(2*\i) + ext v27.16b, v30.16b, v31.16b, #(2*\i) + mla v24.8h, v26.8h, v0.h[\i] + mla v25.8h, v27.8h, v0.h[\i] +.endr +.else // 8tap mul v24.8h, v28.8h, v0.h[0] mul v25.8h, v30.8h, v0.h[0] .irpc i, 1234567 @@ -2440,22 +2610,25 @@ L(\type\()_8tap_filter_8): mla v24.8h, v26.8h, v0.h[\i] mla v25.8h, v27.8h, v0.h[\i] .endr +.endif srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 ret -L(\type\()_8tap_hv_tbl): - .hword L(\type\()_8tap_hv_tbl) - 1280b - .hword L(\type\()_8tap_hv_tbl) - 640b - .hword L(\type\()_8tap_hv_tbl) - 320b - .hword L(\type\()_8tap_hv_tbl) - 160b - .hword L(\type\()_8tap_hv_tbl) - 80b - .hword L(\type\()_8tap_hv_tbl) - 40b - .hword L(\type\()_8tap_hv_tbl) - 20b +L(\type\()_\taps\()_hv_tbl): + .hword L(\type\()_\taps\()_hv_tbl) - 1280b + .hword L(\type\()_\taps\()_hv_tbl) - 640b + .hword L(\type\()_\taps\()_hv_tbl) - 320b + .hword L(\type\()_\taps\()_hv_tbl) - 160b + .hword L(\type\()_\taps\()_hv_tbl) - 80b + .hword L(\type\()_\taps\()_hv_tbl) - 40b + .hword L(\type\()_\taps\()_hv_tbl) - 20b .hword 0 endfunc +.endm +.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv function \type\()_bilin_8bpc_neon, export=1 dup v1.16b, \mx dup v3.16b, \my @@ -2987,8 +3160,34 @@ L(\type\()_bilin_hv_tbl): endfunc .endm -filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 -filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 +make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn put, sharp, SHARP, SHARP, 8tap +make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap + +make_8tap_fn put, regular, REGULAR, REGULAR, 6tap +make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap +filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 + +make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn prep, sharp, SHARP, SHARP, 8tap +make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 8tap + +make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap 
+make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 6tap +filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 + .macro load_filter_row dst, src, inc asr w13, \src, #10 diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S index 1bfb12ebb3..576fab158a 100644 --- a/third_party/dav1d/src/arm/64/mc16.S +++ b/third_party/dav1d/src/arm/64/mc16.S @@ -1374,19 +1374,35 @@ endfunc sub \r3\wd, \r3\wd, \c\wd .endif .endm -.macro smull_smlal_4 d, s0, s1, s2, s3 +.macro smull_smlal_4tap d, s0, s1, s2, s3 smull \d\().4s, \s0\().4h, v0.h[0] smlal \d\().4s, \s1\().4h, v0.h[1] smlal \d\().4s, \s2\().4h, v0.h[2] smlal \d\().4s, \s3\().4h, v0.h[3] .endm -.macro smull2_smlal2_4 d, s0, s1, s2, s3 +.macro smull2_smlal2_4tap d, s0, s1, s2, s3 smull2 \d\().4s, \s0\().8h, v0.h[0] smlal2 \d\().4s, \s1\().8h, v0.h[1] smlal2 \d\().4s, \s2\().8h, v0.h[2] smlal2 \d\().4s, \s3\().8h, v0.h[3] .endm -.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 +.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7 + smull \d\().4s, \s1\().4h, v0.h[1] + smlal \d\().4s, \s2\().4h, v0.h[2] + smlal \d\().4s, \s3\().4h, v0.h[3] + smlal \d\().4s, \s4\().4h, v0.h[4] + smlal \d\().4s, \s5\().4h, v0.h[5] + smlal \d\().4s, \s6\().4h, v0.h[6] +.endm +.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7 + smull2 \d\().4s, \s1\().8h, v0.h[1] + smlal2 \d\().4s, \s2\().8h, v0.h[2] + smlal2 \d\().4s, \s3\().8h, v0.h[3] + smlal2 \d\().4s, \s4\().8h, v0.h[4] + smlal2 \d\().4s, \s5\().8h, v0.h[5] + smlal2 \d\().4s, \s6\().8h, v0.h[6] +.endm +.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7 smull \d\().4s, \s0\().4h, v0.h[0] smlal \d\().4s, \s1\().4h, v0.h[1] smlal \d\().4s, \s2\().4h, v0.h[2] @@ -1396,7 +1412,7 @@ endfunc smlal \d\().4s, \s6\().4h, v0.h[6] smlal \d\().4s, \s7\().4h, v0.h[7] .endm -.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7 +.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7 smull2 \d\().4s, \s0\().8h, v0.h[0] smlal2 \d\().4s, \s1\().8h, v0.h[1] smlal2 \d\().4s, \s2\().8h, v0.h[2] @@ -1499,11 +1515,11 @@ endfunc st1 {\r0\().8h, \r1\().8h}, [\dst], \strd .endm -.macro make_8tap_fn op, type, type_h, type_v +.macro make_8tap_fn op, type, type_h, type_v, taps function \op\()_8tap_\type\()_16bpc_neon, export=1 mov w9, \type_h mov w10, \type_v - b \op\()_8tap_neon + b \op\()_\taps\()_neon endfunc .endm @@ -1512,18 +1528,8 @@ endfunc #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) -.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2 -make_8tap_fn \type, regular, REGULAR, REGULAR -make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH -make_8tap_fn \type, regular_sharp, REGULAR, SHARP -make_8tap_fn \type, smooth, SMOOTH, SMOOTH -make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR -make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP -make_8tap_fn \type, sharp, SHARP, SHARP -make_8tap_fn \type, sharp_regular, SHARP, REGULAR -make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH - -function \type\()_8tap_neon +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps +function \type\()_\taps\()_neon .ifc \bdmax, w8 ldr w8, [sp] .endif @@ -1547,12 +1553,12 @@ function \type\()_8tap_neon add w13, w12, \bdmax // 6 + intermediate_bits sub w12, w12, \bdmax // 6 - intermediate_bits movrel x11, 
X(mc_subpel_filters), -8 - b.ne L(\type\()_8tap_h) + b.ne L(\type\()_\taps\()_h) tst \my, #(0x7f << 14) - b.ne L(\type\()_8tap_v) + b.ne L(\type\()_\taps\()_v) b \type\()_neon -L(\type\()_8tap_h): +L(\type\()_\taps\()_h): cmp \w, #4 ubfx w10, \mx, #7, #7 and \mx, \mx, #0x7f @@ -1561,9 +1567,9 @@ L(\type\()_8tap_h): 4: tst \my, #(0x7f << 14) add \xmx, x11, \mx, uxtw #3 - b.ne L(\type\()_8tap_hv) + b.ne L(\type\()_\taps\()_hv) - adr x10, L(\type\()_8tap_h_tbl) + adr x10, L(\type\()_\taps\()_h_tbl) dup v30.4s, w12 // 6 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v30.4s, v30.4s // -(6-intermediate_bits) @@ -1682,6 +1688,22 @@ L(\type\()_8tap_h): mov \mx, \w 8: +.ifc \taps, 6tap + ext v24.16b, v16.16b, v17.16b, #2 + ext v25.16b, v20.16b, v21.16b, #2 + smull v18.4s, v24.4h, v0.h[1] + smull2 v19.4s, v24.8h, v0.h[1] + smull v22.4s, v25.4h, v0.h[1] + smull2 v23.4s, v25.8h, v0.h[1] +.irpc i, 23456 + ext v24.16b, v16.16b, v17.16b, #(2*\i) + ext v25.16b, v20.16b, v21.16b, #(2*\i) + smlal v18.4s, v24.4h, v0.h[\i] + smlal2 v19.4s, v24.8h, v0.h[\i] + smlal v22.4s, v25.4h, v0.h[\i] + smlal2 v23.4s, v25.8h, v0.h[\i] +.endr +.else // 8tap smull v18.4s, v16.4h, v0.h[0] smull2 v19.4s, v16.8h, v0.h[0] smull v22.4s, v20.4h, v0.h[0] @@ -1694,6 +1716,7 @@ L(\type\()_8tap_h): smlal v22.4s, v25.4h, v0.h[\i] smlal2 v23.4s, v25.8h, v0.h[\i] .endr +.endif subs \mx, \mx, #8 srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits) srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits) @@ -1734,18 +1757,18 @@ L(\type\()_8tap_h): b.gt 81b ret -L(\type\()_8tap_h_tbl): - .hword L(\type\()_8tap_h_tbl) - 1280b - .hword L(\type\()_8tap_h_tbl) - 640b - .hword L(\type\()_8tap_h_tbl) - 320b - .hword L(\type\()_8tap_h_tbl) - 160b - .hword L(\type\()_8tap_h_tbl) - 80b - .hword L(\type\()_8tap_h_tbl) - 40b - .hword L(\type\()_8tap_h_tbl) - 20b +L(\type\()_\taps\()_h_tbl): + .hword L(\type\()_\taps\()_h_tbl) - 1280b + .hword L(\type\()_\taps\()_h_tbl) - 640b + .hword L(\type\()_\taps\()_h_tbl) - 320b + .hword L(\type\()_\taps\()_h_tbl) - 160b + .hword L(\type\()_\taps\()_h_tbl) - 80b + .hword L(\type\()_\taps\()_h_tbl) - 40b + .hword L(\type\()_\taps\()_h_tbl) - 20b .hword 0 -L(\type\()_8tap_v): +L(\type\()_\taps\()_v): cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f @@ -1758,7 +1781,7 @@ L(\type\()_8tap_v): dup v30.4s, w12 // 6 - intermediate_bits movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif - adr x10, L(\type\()_8tap_v_tbl) + adr x10, L(\type\()_\taps\()_v_tbl) ldrh w9, [x10, x9, lsl #1] .ifc \type, prep neg v30.4s, v30.4s // -(6-intermediate_bits) @@ -1785,7 +1808,7 @@ L(\type\()_8tap_v): load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 b.gt 24f - smull_smlal_4 v6, v1, v2, v3, v4 + smull_smlal_4tap v6, v1, v2, v3, v4 sqrshrun_h 6, v6 umin_h v31, .8h, v6 st_s \d_strd, v6, 2 @@ -1794,8 +1817,8 @@ L(\type\()_8tap_v): 24: // 2x4 v load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 - smull_smlal_4 v16, v1, v2, v3, v4 - smull_smlal_4 v17, v3, v4, v5, v6 + smull_smlal_4tap v16, v1, v2, v3, v4 + smull_smlal_4tap v17, v3, v4, v5, v6 sqrshrun_h 6, v16, v17 umin_h v31, .8h, v16 st_s \d_strd, v16, 4 @@ -1817,8 +1840,8 @@ L(\type\()_8tap_v): subs \h, \h, #4 load_s \sr2, \src, \s_strd, v16, v17, v18, v19 interleave_1_s v7, v16, v17, v18, v19 - smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 - smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18 + smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16 + smull_smlal_\taps v25, v3, v4, v5, v6, v7, v16, v17, v18 sqrshrun_h 6, v24, v25 umin_h 
v31, .8h, v24 st_s \d_strd, v24, 4 @@ -1836,7 +1859,7 @@ L(\type\()_8tap_v): 26: load_s \sr2, \src, \s_strd, v16, v17 interleave_1_s v7, v16, v17 - smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 + smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_h 6, v24 umin_h v31, .4h, v24 st_s \d_strd, v24, 2 @@ -1860,13 +1883,13 @@ L(\type\()_8tap_v): sxtl v0.8h, v0.8b load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 - smull_smlal_4 v6, v1, v2, v3, v4 - smull_smlal_4 v7, v2, v3, v4, v5 + smull_smlal_4tap v6, v1, v2, v3, v4 + smull_smlal_4tap v7, v2, v3, v4, v5 shift_store_4 \type, \d_strd, v6, v7 b.le 0f load_4h \sr2, \src, \s_strd, v6, v7 - smull_smlal_4 v1, v3, v4, v5, v6 - smull_smlal_4 v2, v4, v5, v6, v7 + smull_smlal_4tap v1, v3, v4, v5, v6 + smull_smlal_4tap v2, v4, v5, v6, v7 shift_store_4 \type, \d_strd, v1, v2 0: ret @@ -1885,10 +1908,10 @@ L(\type\()_8tap_v): 48: subs \h, \h, #4 load_4h \sr2, \src, \s_strd, v23, v24, v25, v26 - smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 - smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 - smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25 - smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 + smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24 + smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25 + smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_4 \type, \d_strd, v1, v2, v3, v4 b.le 0f cmp \h, #2 @@ -1903,8 +1926,8 @@ L(\type\()_8tap_v): b 48b 46: load_4h \sr2, \src, \s_strd, v23, v24 - smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 - smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 + smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_4 \type, \d_strd, v1, v2 0: ret @@ -1925,17 +1948,17 @@ L(\type\()_8tap_v): sxtl v0.8h, v0.8b load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 - smull_smlal_4 v16, v1, v2, v3, v4 - smull2_smlal2_4 v17, v1, v2, v3, v4 - smull_smlal_4 v18, v2, v3, v4, v5 - smull2_smlal2_4 v19, v2, v3, v4, v5 + smull_smlal_4tap v16, v1, v2, v3, v4 + smull2_smlal2_4tap v17, v1, v2, v3, v4 + smull_smlal_4tap v18, v2, v3, v4, v5 + smull2_smlal2_4tap v19, v2, v3, v4, v5 shift_store_8 \type, \d_strd, v16, v17, v18, v19 b.le 0f load_8h \sr2, \src, \s_strd, v6, v7 - smull_smlal_4 v16, v3, v4, v5, v6 - smull2_smlal2_4 v17, v3, v4, v5, v6 - smull_smlal_4 v18, v4, v5, v6, v7 - smull2_smlal2_4 v19, v4, v5, v6, v7 + smull_smlal_4tap v16, v3, v4, v5, v6 + smull2_smlal2_4tap v17, v3, v4, v5, v6 + smull_smlal_4tap v18, v4, v5, v6, v7 + smull2_smlal2_4tap v19, v4, v5, v6, v7 shift_store_8 \type, \d_strd, v16, v17, v18, v19 0: ret @@ -1962,18 +1985,18 @@ L(\type\()_8tap_v): 88: subs \h, \h, #2 load_8h \sr2, \src, \s_strd, v23, v24 - smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 - smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23 - smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24 - smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24 + smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_\taps v3, v17, v18, v19, v20, v21, v22, v23, v24 + smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.le 9f subs \h, \h, #2 load_8h \sr2, \src, \s_strd, v25, v26 - smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25 - smull2_smlal2_8 v2, v18, 
v19, v20, v21, v22, v23, v24, v25 - smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26 - smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 + smull_smlal_\taps v1, v18, v19, v20, v21, v22, v23, v24, v25 + smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25 + smull_smlal_\taps v3, v19, v20, v21, v22, v23, v24, v25, v26 + smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.le 9f mov v16.16b, v20.16b @@ -2013,10 +2036,10 @@ L(\type\()_8tap_v): 16: load_16h \src, \src, \s_strd, v22, v23 subs \h, \h, #1 - smull_smlal_4 v1, v16, v18, v20, v22 - smull2_smlal2_4 v2, v16, v18, v20, v22 - smull_smlal_4 v3, v17, v19, v21, v23 - smull2_smlal2_4 v4, v17, v19, v21, v23 + smull_smlal_4tap v1, v16, v18, v20, v22 + smull2_smlal2_4tap v2, v16, v18, v20, v22 + smull_smlal_4tap v3, v17, v19, v21, v23 + smull2_smlal2_4tap v4, v17, v19, v21, v23 shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4 b.le 0f mov v16.16b, v18.16b @@ -2029,17 +2052,17 @@ L(\type\()_8tap_v): 0: ret -L(\type\()_8tap_v_tbl): - .hword L(\type\()_8tap_v_tbl) - 1280b - .hword L(\type\()_8tap_v_tbl) - 640b - .hword L(\type\()_8tap_v_tbl) - 320b - .hword L(\type\()_8tap_v_tbl) - 160b - .hword L(\type\()_8tap_v_tbl) - 80b - .hword L(\type\()_8tap_v_tbl) - 40b - .hword L(\type\()_8tap_v_tbl) - 20b +L(\type\()_\taps\()_v_tbl): + .hword L(\type\()_\taps\()_v_tbl) - 1280b + .hword L(\type\()_\taps\()_v_tbl) - 640b + .hword L(\type\()_\taps\()_v_tbl) - 320b + .hword L(\type\()_\taps\()_v_tbl) - 160b + .hword L(\type\()_\taps\()_v_tbl) - 80b + .hword L(\type\()_\taps\()_v_tbl) - 40b + .hword L(\type\()_\taps\()_v_tbl) - 20b .hword 0 -L(\type\()_8tap_hv): +L(\type\()_\taps\()_hv): cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f @@ -2048,7 +2071,7 @@ L(\type\()_8tap_hv): 4: add \xmy, x11, \my, uxtw #3 - adr x10, L(\type\()_8tap_hv_tbl) + adr x10, L(\type\()_\taps\()_hv_tbl) dup v30.4s, w12 // 6 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v30.4s, v30.4s // -(6-intermediate_bits) @@ -2089,7 +2112,7 @@ L(\type\()_8tap_hv): addp v27.4s, v27.4s, v28.4s addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores @@ -2100,7 +2123,7 @@ L(\type\()_8tap_hv): mov v17.8b, v24.8b 2: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v24.8b, #4 smull v2.4s, v16.4h, v1.h[0] @@ -2143,20 +2166,28 @@ L(\type\()_8tap_hv): // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). 
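The comment above is the key to the data flow of these hv paths: the horizontal pass is rounded into 16-bit intermediates, and the vertical pass re-expands them with 32-bit accumulation. A compact C sketch of that separable structure, modelled loosely on the prep path and assuming 8 taps in both directions (names and bounds are illustrative, not dav1d's):

    #include <stddef.h>
    #include <stdint.h>

    #define MAX_W 128  /* illustrative bound, not dav1d's */

    /* src points at the first sample actually needed, i.e. 3 rows above
     * and 3 columns left of the output origin; sh_h/sh_v >= 1. */
    static void filter_hv_sketch(int16_t *dst, ptrdiff_t dst_stride,
                                 const uint16_t *src, ptrdiff_t src_stride,
                                 int w, int h,
                                 const int16_t fh[8], const int16_t fv[8],
                                 int sh_h, int sh_v)
    {
        static int16_t mid[(MAX_W + 7) * MAX_W]; /* sketch only: not reentrant */
        for (int y = 0; y < h + 7; y++)
            for (int x = 0; x < w; x++) {
                int32_t s = 0;
                for (int i = 0; i < 8; i++)
                    s += src[y * src_stride + x + i] * fh[i];
                /* round and narrow to 16 bit, like the xtn/uzp1 in the asm */
                mid[y * MAX_W + x] = (int16_t)((s + (1 << (sh_h - 1))) >> sh_h);
            }
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++) {
                int32_t s = 0;
                for (int i = 0; i < 8; i++)
                    s += mid[(y + i) * MAX_W + x] * fv[i];
                dst[y * dst_stride + x] = (int16_t)((s + (1 << (sh_v - 1))) >> sh_v);
            }
    }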
- bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) xtn v16.4h, v16.4s trn1 v16.2s, v16.2s, v24.2s mov v17.8b, v24.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v24.8b, #4 mov v19.8b, v24.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v20.8b, v19.8b, v24.8b, #4 mov v21.8b, v24.8b 28: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v22.8b, v21.8b, v24.8b, #4 +.ifc \taps, 6tap + smull v3.4s, v17.4h, v1.h[1] + smlal v3.4s, v18.4h, v1.h[2] + smlal v3.4s, v19.4h, v1.h[3] + smlal v3.4s, v20.4h, v1.h[4] + smlal v3.4s, v21.4h, v1.h[5] + smlal v3.4s, v22.4h, v1.h[6] +.else // 8tap smull v3.4s, v16.4h, v1.h[0] smlal v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] @@ -2165,6 +2196,7 @@ L(\type\()_8tap_hv): smlal v3.4s, v21.4h, v1.h[5] smlal v3.4s, v22.4h, v1.h[6] smlal v3.4s, v24.4h, v1.h[7] +.endif srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) sqxtun v3.4h, v3.4s @@ -2184,7 +2216,7 @@ L(\type\()_8tap_hv): 0: ret x15 -L(\type\()_8tap_filter_2): +L(\type\()_\taps\()_filter_2): ld1 {v25.8h}, [\sr2], \s_strd ld1 {v27.8h}, [\src], \s_strd ext v26.16b, v25.16b, v25.16b, #2 @@ -2234,12 +2266,12 @@ L(\type\()_8tap_filter_2): // (at the cost of a smaller slowdown on in-order cores such as A53). xtn v16.4h, v16.4s - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v24.8b mov v18.8b, v25.8b 4: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -2272,8 +2304,13 @@ L(\type\()_8tap_filter_2): 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #2 +.ifc \taps, 6tap + sub \sr2, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 +.else sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd +.endif add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 @@ -2294,20 +2331,38 @@ L(\type\()_8tap_filter_2): // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). 
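One detail of the 480 prologue above: the 6tap branch rewinds the source pointer by one row less than the 8tap branch, because an n-tap vertical filter only needs n/2 - 1 rows above the first output row — three for 8-tap, two for 6-tap. The relation in C (hypothetical helper, not dav1d API; stride in elements):

    #include <stddef.h>
    #include <stdint.h>

    /* 8 taps -> src - 3*stride, 6 taps -> src - 2*stride */
    static const uint16_t *first_input_row(const uint16_t *src,
                                           ptrdiff_t stride, int taps)
    {
        return src - (taps / 2 - 1) * stride;
    }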
+.ifc \taps, 6tap + xtn v18.4h, v16.4s +.else xtn v16.4h, v16.4s - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v24.8b mov v18.8b, v25.8b - bl L(\type\()_8tap_filter_4) +.endif + bl L(\type\()_\taps\()_filter_4) mov v19.8b, v24.8b mov v20.8b, v25.8b - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v21.8b, v24.8b mov v22.8b, v25.8b 48: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) +.ifc \taps, 6tap + smull v3.4s, v18.4h, v1.h[1] + smlal v3.4s, v19.4h, v1.h[2] + smlal v3.4s, v20.4h, v1.h[3] + smlal v3.4s, v21.4h, v1.h[4] + smlal v3.4s, v22.4h, v1.h[5] + smlal v3.4s, v24.4h, v1.h[6] + smull v4.4s, v19.4h, v1.h[1] + smlal v4.4s, v20.4h, v1.h[2] + smlal v4.4s, v21.4h, v1.h[3] + smlal v4.4s, v22.4h, v1.h[4] + smlal v4.4s, v24.4h, v1.h[5] + smlal v4.4s, v25.4h, v1.h[6] +.else // 8tap smull v3.4s, v16.4h, v1.h[0] smlal v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] @@ -2324,6 +2379,7 @@ L(\type\()_8tap_filter_2): smlal v4.4s, v22.4h, v1.h[5] smlal v4.4s, v24.4h, v1.h[6] smlal v4.4s, v25.4h, v1.h[7] +.endif .ifc \type, put srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) @@ -2339,8 +2395,10 @@ L(\type\()_8tap_filter_2): st1 {v3.d}[0], [\dst], \d_strd st1 {v3.d}[1], [\ds2], \d_strd b.le 0f +.ifc \taps, 8tap mov v16.8b, v18.8b mov v17.8b, v19.8b +.endif mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b @@ -2350,7 +2408,7 @@ L(\type\()_8tap_filter_2): 0: ret x15 -L(\type\()_8tap_filter_4): +L(\type\()_\taps\()_filter_4): ld1 {v24.8h}, [\sr2], \s_strd ld1 {v25.8h}, [\src], \s_strd ext v26.16b, v24.16b, v24.16b, #2 @@ -2411,14 +2469,14 @@ L(\type\()_8tap_filter_4): // and conserves register space (no need to clobber v8-v15). uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v23.16b mov v18.16b, v24.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2480,7 +2538,9 @@ L(\type\()_8tap_filter_4): ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #6 +.ifc \taps, 8tap sub \src, \src, \s_strd +.endif sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b @@ -2494,6 +2554,16 @@ L(\type\()_8tap_filter_4): lsl \s_strd, \s_strd, #1 ld1 {v27.8h, v28.8h}, [\src], \s_strd +.ifc \taps, 6tap + ext v26.16b, v27.16b, v28.16b, #2 + smull v24.4s, v26.4h, v0.h[1] + smull2 v25.4s, v26.8h, v0.h[1] +.irpc i, 23456 + ext v26.16b, v27.16b, v28.16b, #(2*\i) + smlal v24.4s, v26.4h, v0.h[\i] + smlal2 v25.4s, v26.8h, v0.h[\i] +.endr +.else // 8tap smull v24.4s, v27.4h, v0.h[0] smull2 v25.4s, v27.8h, v0.h[0] .irpc i, 1234567 @@ -2501,6 +2571,7 @@ L(\type\()_8tap_filter_4): smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] .endr +.endif srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without @@ -2508,22 +2579,53 @@ L(\type\()_8tap_filter_4): // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53), // and conserves register space (no need to clobber v8-v15). 
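The register-conservation comment above refers to the uzp1 trick used throughout this file: on a little-endian core, taking the even-indexed 16-bit elements of two vectors holding 32-bit sums picks the low half of every 32-bit lane — the same truncating narrow as xtn plus xtn2, but in one instruction and with a free choice of destination. In C terms (sketch, assuming the values already fit in 16 bits):

    #include <stdint.h>

    static void uzp1_narrow(int16_t dst[8], const int32_t a[4], const int32_t b[4])
    {
        for (int i = 0; i < 4; i++) {
            dst[i]     = (int16_t)a[i];  /* even halfwords of a = low 16 bits */
            dst[4 + i] = (int16_t)b[i];
        }
    }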
+.ifc \taps, 6tap + uzp1 v18.8h, v24.8h, v25.8h // Same as xtn, xtn2 +.else uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v23.16b mov v18.16b, v24.16b - bl L(\type\()_8tap_filter_8) +.endif + bl L(\type\()_\taps\()_filter_8) mov v19.16b, v23.16b mov v20.16b, v24.16b - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v21.16b, v23.16b mov v22.16b, v24.16b 88: +.ifc \taps, 6tap + smull v2.4s, v18.4h, v1.h[1] + smull2 v3.4s, v18.8h, v1.h[1] + bl L(\type\()_\taps\()_filter_8) + smull v4.4s, v19.4h, v1.h[1] + smull2 v5.4s, v19.8h, v1.h[1] + smlal v2.4s, v19.4h, v1.h[2] + smlal2 v3.4s, v19.8h, v1.h[2] + smlal v4.4s, v20.4h, v1.h[2] + smlal2 v5.4s, v20.8h, v1.h[2] + smlal v2.4s, v20.4h, v1.h[3] + smlal2 v3.4s, v20.8h, v1.h[3] + smlal v4.4s, v21.4h, v1.h[3] + smlal2 v5.4s, v21.8h, v1.h[3] + smlal v2.4s, v21.4h, v1.h[4] + smlal2 v3.4s, v21.8h, v1.h[4] + smlal v4.4s, v22.4h, v1.h[4] + smlal2 v5.4s, v22.8h, v1.h[4] + smlal v2.4s, v22.4h, v1.h[5] + smlal2 v3.4s, v22.8h, v1.h[5] + smlal v4.4s, v23.4h, v1.h[5] + smlal2 v5.4s, v23.8h, v1.h[5] + smlal v2.4s, v23.4h, v1.h[6] + smlal2 v3.4s, v23.8h, v1.h[6] + smlal v4.4s, v24.4h, v1.h[6] + smlal2 v5.4s, v24.8h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2554,6 +2656,7 @@ L(\type\()_8tap_filter_4): smlal2 v3.4s, v23.8h, v1.h[7] smlal v4.4s, v24.4h, v1.h[7] smlal2 v5.4s, v24.8h, v1.h[7] +.endif .ifc \type, put srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) @@ -2577,8 +2680,10 @@ L(\type\()_8tap_filter_4): st1 {v2.8h}, [\dst], \d_strd st1 {v3.8h}, [\ds2], \d_strd b.le 9f +.ifc \taps, 8tap mov v16.16b, v18.16b mov v17.16b, v19.16b +.endif mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b @@ -2596,13 +2701,32 @@ L(\type\()_8tap_filter_4): mov \h, \my add \src, \src, #16 add \dst, \dst, #16 +.ifc \taps, 6tap + add \src, \src, \s_strd, lsl #1 +.endif b 168b 0: ret x15 -L(\type\()_8tap_filter_8): +L(\type\()_\taps\()_filter_8): ld1 {v4.8h, v5.8h}, [\sr2], \s_strd ld1 {v6.8h, v7.8h}, [\src], \s_strd +.ifc \taps, 6tap + ext v23.16b, v4.16b, v5.16b, #2 + ext v24.16b, v6.16b, v7.16b, #2 + smull v25.4s, v23.4h, v0.h[1] + smull2 v26.4s, v23.8h, v0.h[1] + smull v27.4s, v24.4h, v0.h[1] + smull2 v28.4s, v24.8h, v0.h[1] +.irpc i, 23456 + ext v23.16b, v4.16b, v5.16b, #(2*\i) + ext v24.16b, v6.16b, v7.16b, #(2*\i) + smlal v25.4s, v23.4h, v0.h[\i] + smlal2 v26.4s, v23.8h, v0.h[\i] + smlal v27.4s, v24.4h, v0.h[\i] + smlal2 v28.4s, v24.8h, v0.h[\i] +.endr +.else // 8tap smull v25.4s, v4.4h, v0.h[0] smull2 v26.4s, v4.8h, v0.h[0] smull v27.4s, v6.4h, v0.h[0] @@ -2615,6 +2739,7 @@ L(\type\()_8tap_filter_8): smlal v27.4s, v24.4h, v0.h[\i] smlal2 v28.4s, v24.8h, v0.h[\i] .endr +.endif srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits) srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits) @@ -2623,18 +2748,20 @@ L(\type\()_8tap_filter_8): uzp1 v24.8h, v27.8h, v28.8h // Ditto ret -L(\type\()_8tap_hv_tbl): - .hword L(\type\()_8tap_hv_tbl) - 1280b - .hword L(\type\()_8tap_hv_tbl) - 640b - .hword L(\type\()_8tap_hv_tbl) - 320b - .hword L(\type\()_8tap_hv_tbl) - 160b - .hword L(\type\()_8tap_hv_tbl) - 80b - .hword L(\type\()_8tap_hv_tbl) - 40b - .hword 
L(\type\()_8tap_hv_tbl) - 20b +L(\type\()_\taps\()_hv_tbl): + .hword L(\type\()_\taps\()_hv_tbl) - 1280b + .hword L(\type\()_\taps\()_hv_tbl) - 640b + .hword L(\type\()_\taps\()_hv_tbl) - 320b + .hword L(\type\()_\taps\()_hv_tbl) - 160b + .hword L(\type\()_\taps\()_hv_tbl) - 80b + .hword L(\type\()_\taps\()_hv_tbl) - 40b + .hword L(\type\()_\taps\()_hv_tbl) - 20b .hword 0 endfunc +.endm +.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2 function \type\()_bilin_16bpc_neon, export=1 .ifc \bdmax, w8 ldr w8, [sp] @@ -3236,8 +3363,34 @@ L(\type\()_bilin_hv_tbl): endfunc .endm -filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10 -filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10 +make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn put, sharp, SHARP, SHARP, 8tap +make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap + +make_8tap_fn put, regular, REGULAR, REGULAR, 6tap +make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap +filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10 + +make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn prep, sharp, SHARP, SHARP, 8tap +make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap + +make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap +make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap +filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10 + .macro load_filter_row dst, src, inc asr w13, \src, #10 diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S index 3a6cf900a9..7bef9243fb 100644 --- a/third_party/dav1d/src/arm/64/msac.S +++ b/third_party/dav1d/src/arm/64/msac.S @@ -208,60 +208,66 @@ L(renorm): sub w4, w4, w3 // rng = u - v clz w5, w4 // clz(rng) eor w5, w5, #16 // d = clz(rng) ^ 16 - mvn x7, x7 // ~dif - add x7, x7, x3, lsl #48 // ~dif + (v << 48) + sub x7, x7, x3, lsl #48 // dif - (v << 48) L(renorm2): lsl w4, w4, w5 // rng << d subs w6, w6, w5 // cnt -= d - lsl x7, x7, x5 // (~dif + (v << 48)) << d + lsl x7, x7, x5 // (dif - (v << 48)) << d str w4, [x0, #RNG] - mvn x7, x7 // ~dif - b.hs 9f + b.hs 4f // refill ldp x3, x4, [x0] // BUF_POS, BUF_END add x5, x3, #8 - cmp x5, x4 - b.gt 2f - - ldr x3, [x3] // next_bits - add w8, w6, #23 // shift_bits = cnt + 23 - add w6, w6, #16 // cnt += 16 - rev x3, x3 // next_bits = bswap(next_bits) - sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 - and w8, w8, #24 // shift_bits &= 24 - lsr x3, x3, x8 // next_bits >>= shift_bits - sub w8, w8, w6 // shift_bits -= 16 + cnt - str x5, [x0, #BUF_POS] - lsl x3, x3, x8 // next_bits <<= shift_bits - mov w4, #48 - sub w6, w4, w8 // cnt = cnt + 64 - shift_bits - eor x7, x7, x3 // dif ^= next_bits - b 9f - -2: // refill_eob - mov w14, #40 - sub w5, w14, w6 // c = 40 - cnt -3: - cmp 
x3, x4 - b.ge 4f - ldrb w8, [x3], #1 - lsl x8, x8, x5 - eor x7, x7, x8 - subs w5, w5, #8 - b.ge 3b - -4: // refill_eob_end + subs x5, x5, x4 + b.hi 6f + + ldr x8, [x3] // next_bits + add w4, w6, #-48 // shift_bits = cnt + 16 (- 64) + mvn x8, x8 + neg w5, w4 + rev x8, x8 // next_bits = bswap(next_bits) + lsr w5, w5, #3 // num_bytes_read + lsr x8, x8, x4 // next_bits >>= (shift_bits & 63) + +2: // refill_end + add x3, x3, x5 + add w6, w6, w5, lsl #3 // cnt += num_bits_read str x3, [x0, #BUF_POS] - sub w6, w14, w5 // cnt = 40 - c -9: +3: // refill_end2 + orr x7, x7, x8 // dif |= next_bits + +4: // end str w6, [x0, #CNT] str x7, [x0, #DIF] mov w0, w15 add sp, sp, #48 ret + +5: // pad_with_ones + add w8, w6, #-16 + ror x8, x8, x8 + b 3b + +6: // refill_eob + cmp x3, x4 + b.hs 5b + + ldr x8, [x4, #-8] + lsl w5, w5, #3 + lsr x8, x8, x5 + add w5, w6, #-48 + mvn x8, x8 + sub w4, w4, w3 // num_bytes_left + rev x8, x8 + lsr x8, x8, x5 + neg w5, w5 + lsr w5, w5, #3 + cmp w5, w4 + csel w5, w5, w4, lo // num_bytes_read + b 2b endfunc function msac_decode_symbol_adapt8_neon, export=1 @@ -334,54 +340,37 @@ function msac_decode_hi_tok_neon, export=1 sub w4, w4, w3 // rng = u - v clz w5, w4 // clz(rng) eor w5, w5, #16 // d = clz(rng) ^ 16 - mvn x7, x7 // ~dif - add x7, x7, x3, lsl #48 // ~dif + (v << 48) + sub x7, x7, x3, lsl #48 // dif - (v << 48) lsl w4, w4, w5 // rng << d subs w6, w6, w5 // cnt -= d - lsl x7, x7, x5 // (~dif + (v << 48)) << d + lsl x7, x7, x5 // (dif - (v << 48)) << d str w4, [x0, #RNG] dup v3.4h, w4 - mvn x7, x7 // ~dif - b.hs 9f + b.hs 5f // refill ldp x3, x4, [x0] // BUF_POS, BUF_END add x5, x3, #8 - cmp x5, x4 - b.gt 2f - - ldr x3, [x3] // next_bits - add w8, w6, #23 // shift_bits = cnt + 23 - add w6, w6, #16 // cnt += 16 - rev x3, x3 // next_bits = bswap(next_bits) - sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 - and w8, w8, #24 // shift_bits &= 24 - lsr x3, x3, x8 // next_bits >>= shift_bits - sub w8, w8, w6 // shift_bits -= 16 + cnt - str x5, [x0, #BUF_POS] - lsl x3, x3, x8 // next_bits <<= shift_bits - mov w4, #48 - sub w6, w4, w8 // cnt = cnt + 64 - shift_bits - eor x7, x7, x3 // dif ^= next_bits - b 9f - -2: // refill_eob - mov w14, #40 - sub w5, w14, w6 // c = 40 - cnt -3: - cmp x3, x4 - b.ge 4f - ldrb w8, [x3], #1 - lsl x8, x8, x5 - eor x7, x7, x8 - subs w5, w5, #8 - b.ge 3b - -4: // refill_eob_end + subs x5, x5, x4 + b.hi 7f + + ldr x8, [x3] // next_bits + add w4, w6, #-48 // shift_bits = cnt + 16 (- 64) + mvn x8, x8 + neg w5, w4 + rev x8, x8 // next_bits = bswap(next_bits) + lsr w5, w5, #3 // num_bytes_read + lsr x8, x8, x4 // next_bits >>= (shift_bits & 63) + +3: // refill_end + add x3, x3, x5 + add w6, w6, w5, lsl #3 // cnt += num_bits_read str x3, [x0, #BUF_POS] - sub w6, w14, w5 // cnt = 40 - c -9: +4: // refill_end2 + orr x7, x7, x8 // dif |= next_bits + +5: // end lsl w15, w15, #1 sub w15, w15, #5 lsr x12, x7, #48 @@ -394,6 +383,29 @@ function msac_decode_hi_tok_neon, export=1 str x7, [x0, #DIF] lsr w0, w13, #1 ret + +6: // pad_with_ones + add w8, w6, #-16 + ror x8, x8, x8 + b 4b + +7: // refill_eob + cmp x3, x4 + b.hs 6b + + ldr x8, [x4, #-8] + lsl w5, w5, #3 + lsr x8, x8, x5 + add w5, w6, #-48 + mvn x8, x8 + sub w4, w4, w3 // num_bytes_left + rev x8, x8 + lsr x8, x8, x5 + neg w5, w5 + lsr w5, w5, #3 + cmp w5, w4 + csel w5, w5, w4, lo // num_bytes_read + b 3b endfunc function msac_decode_bool_equi_neon, export=1 @@ -410,7 +422,6 @@ function msac_decode_bool_equi_neon, export=1 csel x7, x8, x7, hs // if (ret) dif = dif - vw; clz w5, w4 // clz(rng) - mvn x7, x7 
// ~dif eor w5, w5, #16 // d = clz(rng) ^ 16 b L(renorm2) endfunc @@ -431,7 +442,6 @@ function msac_decode_bool_neon, export=1 csel x7, x8, x7, hs // if (ret) dif = dif - vw; clz w5, w4 // clz(rng) - mvn x7, x7 // ~dif eor w5, w5, #16 // d = clz(rng) ^ 16 b L(renorm2) endfunc @@ -455,7 +465,6 @@ function msac_decode_bool_adapt_neon, export=1 ldr w10, [x0, #ALLOW_UPDATE_CDF] clz w5, w4 // clz(rng) - mvn x7, x7 // ~dif eor w5, w5, #16 // d = clz(rng) ^ 16 cbz w10, L(renorm2) diff --git a/third_party/dav1d/src/arm/64/util.S b/third_party/dav1d/src/arm/64/util.S index 9013fd4b1e..1b3f319ce5 100644 --- a/third_party/dav1d/src/arm/64/util.S +++ b/third_party/dav1d/src/arm/64/util.S @@ -32,6 +32,10 @@ #include "config.h" #include "src/arm/asm.S" +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + .macro movrel rd, val, offset=0 #if defined(__APPLE__) .if \offset < 0 @@ -51,6 +55,10 @@ adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) .endif +#elif __has_feature(hwaddress_sanitizer) + adrp \rd, :pg_hi21_nc:\val+(\offset) + movk \rd, #:prel_g3:\val+0x100000000 + add \rd, \rd, :lo12:\val+(\offset) #elif defined(PIC) adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) @@ -149,6 +157,35 @@ trn2 \r7\().2d, \t9\().2d, \r7\().2d .endm +.macro transpose_8x8h_mov r0, r1, r2, r3, r4, r5, r6, r7, t8, t9, o0, o1, o2, o3, o4, o5, o6, o7 + trn1 \t8\().8h, \r0\().8h, \r1\().8h + trn2 \t9\().8h, \r0\().8h, \r1\().8h + trn1 \r1\().8h, \r2\().8h, \r3\().8h + trn2 \r3\().8h, \r2\().8h, \r3\().8h + trn1 \r0\().8h, \r4\().8h, \r5\().8h + trn2 \r5\().8h, \r4\().8h, \r5\().8h + trn1 \r2\().8h, \r6\().8h, \r7\().8h + trn2 \r7\().8h, \r6\().8h, \r7\().8h + + trn1 \r4\().4s, \r0\().4s, \r2\().4s + trn2 \r2\().4s, \r0\().4s, \r2\().4s + trn1 \r6\().4s, \r5\().4s, \r7\().4s + trn2 \r7\().4s, \r5\().4s, \r7\().4s + trn1 \r5\().4s, \t9\().4s, \r3\().4s + trn2 \t9\().4s, \t9\().4s, \r3\().4s + trn1 \r3\().4s, \t8\().4s, \r1\().4s + trn2 \t8\().4s, \t8\().4s, \r1\().4s + + trn1 \o0\().2d, \r3\().2d, \r4\().2d + trn2 \o4\().2d, \r3\().2d, \r4\().2d + trn1 \o1\().2d, \r5\().2d, \r6\().2d + trn2 \o5\().2d, \r5\().2d, \r6\().2d + trn2 \o6\().2d, \t8\().2d, \r2\().2d + trn1 \o2\().2d, \t8\().2d, \r2\().2d + trn1 \o3\().2d, \t9\().2d, \r7\().2d + trn2 \o7\().2d, \t9\().2d, \r7\().2d +.endm + .macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 trn1 \t8\().16b, \r0\().16b, \r1\().16b trn2 \t9\().16b, \r0\().16b, \r1\().16b @@ -226,4 +263,16 @@ trn2 \r3\().4s, \t5\().4s, \t7\().4s .endm +.macro transpose_4x8h_mov r0, r1, r2, r3, t4, t5, t6, t7, o0, o1, o2, o3 + trn1 \t4\().8h, \r0\().8h, \r1\().8h + trn2 \t5\().8h, \r0\().8h, \r1\().8h + trn1 \t6\().8h, \r2\().8h, \r3\().8h + trn2 \t7\().8h, \r2\().8h, \r3\().8h + + trn1 \o0\().4s, \t4\().4s, \t6\().4s + trn2 \o2\().4s, \t4\().4s, \t6\().4s + trn1 \o1\().4s, \t5\().4s, \t7\().4s + trn2 \o3\().4s, \t5\().4s, \t7\().4s +.endm + #endif /* DAV1D_SRC_ARM_64_UTIL_S */ -- cgit v1.2.3
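
Note on the msac hunks above: the rewritten renormalization keeps `dif` in a single representation and subtracts `v << 48` directly, where the old code inverted `dif` with `mvn` before and after every update (the removed `mvn x7, x7 // ~dif` lines in `msac_decode_bool_equi`, `msac_decode_bool` and `msac_decode_bool_adapt`). A minimal C model of the renorm step as it appears in these hunks; the names and the refill hook are illustrative assumptions, not dav1d's actual C source, and `__builtin_clz` assumes a GCC/Clang-style compiler:

    /* Hypothetical sketch of L(renorm)/L(renorm2): dif holds the coding
     * window in its top bits, rng the 15/16-bit range, cnt the valid-bit
     * count. Subtracting v << 48 directly saves two mvn per symbol. */
    static inline void renorm(uint64_t *dif, unsigned *rng, int *cnt,
                              unsigned u, unsigned v)
    {
        *dif -= (uint64_t)v << 48;       /* dif - (v << 48)          */
        unsigned r = u - v;              /* rng = u - v              */
        int d = __builtin_clz(r) ^ 16;   /* d = clz(rng) ^ 16        */
        *rng = r << d;                   /* rng << d                 */
        *cnt -= d;                       /* cnt -= d                 */
        *dif <<= d;                      /* (dif - (v << 48)) << d   */
        /* if (*cnt < 0) refill();  -- the b.hs-guarded path above   */
    }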