summary refs log tree commit diff stats
path: root/third_party/dav1d/src/arm/64/mc.S
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- third_party/dav1d/src/arm/64/mc.S | 411
1 file changed, 305 insertions(+), 106 deletions(-)
diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S
index 9f7b4e7a89..3df0393c3a 100644
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -1154,7 +1154,7 @@ endfunc
uxtl \r6\().8h, \r6\().8b
.endif
.endm
-.macro mul_mla_4 d, s0, s1, s2, s3, wd
+.macro mul_mla_4tap d, s0, s1, s2, s3, wd
mul \d\wd, \s0\wd, v0.h[0]
mla \d\wd, \s1\wd, v0.h[1]
mla \d\wd, \s2\wd, v0.h[2]
@@ -1163,7 +1163,51 @@ endfunc
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
-.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().4h, \s1\().4h, v0.h[1]
+ mla \d0\().4h, \s2\().4h, v0.h[2]
+ mla \d0\().4h, \s3\().4h, v0.h[3]
+ mla \d0\().4h, \s4\().4h, v0.h[4]
+ mla \d0\().4h, \s5\().4h, v0.h[5]
+ mla \d0\().4h, \s6\().4h, v0.h[6]
+.endm
+.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+.endm
+.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mul \d1\().8h, \s2\().8h, v0.h[1]
+ mla \d1\().8h, \s3\().8h, v0.h[2]
+ mla \d1\().8h, \s4\().8h, v0.h[3]
+ mla \d1\().8h, \s5\().8h, v0.h[4]
+ mla \d1\().8h, \s6\().8h, v0.h[5]
+ mla \d1\().8h, \s7\().8h, v0.h[6]
+.endm
+.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mul \d1\().8h, \s3\().8h, v0.h[1]
+ mla \d1\().8h, \s4\().8h, v0.h[2]
+ mla \d1\().8h, \s5\().8h, v0.h[3]
+ mla \d1\().8h, \s6\().8h, v0.h[4]
+ mla \d1\().8h, \s7\().8h, v0.h[5]
+ mla \d1\().8h, \s8\().8h, v0.h[6]
+.endm
+.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
mul \d0\().4h, \s0\().4h, v0.h[0]
mla \d0\().4h, \s1\().4h, v0.h[1]
mla \d0\().4h, \s2\().4h, v0.h[2]
@@ -1173,7 +1217,7 @@ endfunc
mla \d0\().4h, \s6\().4h, v0.h[6]
mla \d0\().4h, \s7\().4h, v0.h[7]
.endm
-.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1183,7 +1227,7 @@ endfunc
mla \d0\().8h, \s6\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7]
.endm
-.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1201,7 +1245,7 @@ endfunc
mla \d1\().8h, \s7\().8h, v0.h[6]
mla \d1\().8h, \s8\().8h, v0.h[7]
.endm
-.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1315,11 +1359,11 @@ endfunc
.endif
.endm
-.macro make_8tap_fn op, type, type_h, type_v
+.macro make_8tap_fn op, type, type_h, type_v, taps
function \op\()_8tap_\type\()_8bpc_neon, export=1
mov x8, \type_h
mov x9, \type_v
- b \op\()_8tap_neon
+ b \op\()_\taps\()_neon
endfunc
.endm
@@ -1328,18 +1372,8 @@ endfunc
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)
-.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
-make_8tap_fn \type, regular, REGULAR, REGULAR
-make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
-make_8tap_fn \type, regular_sharp, REGULAR, SHARP
-make_8tap_fn \type, smooth, SMOOTH, SMOOTH
-make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
-make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
-make_8tap_fn \type, sharp, SHARP, SHARP
-make_8tap_fn \type, sharp_regular, SHARP, REGULAR
-make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
-
-function \type\()_8tap_neon
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps
+function \type\()_\taps\()_neon
mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
mul \mx, \mx, w10
mul \my, \my, w10
@@ -1354,12 +1388,12 @@ function \type\()_8tap_neon
tst \mx, #(0x7f << 14)
sub w8, w8, #24
movrel x10, X(mc_subpel_filters), -8
- b.ne L(\type\()_8tap_h)
+ b.ne L(\type\()_\taps\()_h)
tst \my, #(0x7f << 14)
- b.ne L(\type\()_8tap_v)
+ b.ne L(\type\()_\taps\()_v)
b \type\()_neon
-L(\type\()_8tap_h):
+L(\type\()_\taps\()_h):
cmp \w, #4
ubfx w9, \mx, #7, #7
and \mx, \mx, #0x7f
@@ -1368,9 +1402,9 @@ L(\type\()_8tap_h):
4:
tst \my, #(0x7f << 14)
add \xmx, x10, \mx, uxtw #3
- b.ne L(\type\()_8tap_hv)
+ b.ne L(\type\()_\taps\()_hv)
- adr x9, L(\type\()_8tap_h_tbl)
+ adr x9, L(\type\()_\taps\()_h_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
br x9
@@ -1471,6 +1505,18 @@ L(\type\()_8tap_h):
uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
+.ifc \taps, 6tap
+ ext v19.16b, v16.16b, v17.16b, #2
+ ext v23.16b, v20.16b, v21.16b, #2
+ mul v18.8h, v19.8h, v0.h[1]
+ mul v22.8h, v23.8h, v0.h[1]
+.irpc i, 23456
+ ext v19.16b, v16.16b, v17.16b, #(2*\i)
+ ext v23.16b, v20.16b, v21.16b, #(2*\i)
+ mla v18.8h, v19.8h, v0.h[\i]
+ mla v22.8h, v23.8h, v0.h[\i]
+.endr
+.else // 8tap
mul v18.8h, v16.8h, v0.h[0]
mul v22.8h, v20.8h, v0.h[0]
.irpc i, 1234567
@@ -1479,6 +1525,7 @@ L(\type\()_8tap_h):
mla v18.8h, v19.8h, v0.h[\i]
mla v22.8h, v23.8h, v0.h[\i]
.endr
+.endif
subs \h, \h, #2
srshr v18.8h, v18.8h, #2
srshr v22.8h, v22.8h, #2
@@ -1523,6 +1570,26 @@ L(\type\()_8tap_h):
uxtl v22.8h, v22.8b
16:
+.ifc \taps, 6tap
+ ext v28.16b, v16.16b, v17.16b, #2
+ ext v29.16b, v17.16b, v18.16b, #2
+ ext v30.16b, v20.16b, v21.16b, #2
+ ext v31.16b, v21.16b, v22.16b, #2
+ mul v24.8h, v28.8h, v0.h[1]
+ mul v25.8h, v29.8h, v0.h[1]
+ mul v26.8h, v30.8h, v0.h[1]
+ mul v27.8h, v31.8h, v0.h[1]
+.irpc i, 23456
+ ext v28.16b, v16.16b, v17.16b, #(2*\i)
+ ext v29.16b, v17.16b, v18.16b, #(2*\i)
+ ext v30.16b, v20.16b, v21.16b, #(2*\i)
+ ext v31.16b, v21.16b, v22.16b, #(2*\i)
+ mla v24.8h, v28.8h, v0.h[\i]
+ mla v25.8h, v29.8h, v0.h[\i]
+ mla v26.8h, v30.8h, v0.h[\i]
+ mla v27.8h, v31.8h, v0.h[\i]
+.endr
+.else // 8tap
mul v24.8h, v16.8h, v0.h[0]
mul v25.8h, v17.8h, v0.h[0]
mul v26.8h, v20.8h, v0.h[0]
@@ -1537,6 +1604,7 @@ L(\type\()_8tap_h):
mla v26.8h, v30.8h, v0.h[\i]
mla v27.8h, v31.8h, v0.h[\i]
.endr
+.endif
srshr v24.8h, v24.8h, #2
srshr v25.8h, v25.8h, #2
srshr v26.8h, v26.8h, #2
@@ -1575,18 +1643,18 @@ L(\type\()_8tap_h):
b.gt 161b
ret
-L(\type\()_8tap_h_tbl):
- .hword L(\type\()_8tap_h_tbl) - 1280b
- .hword L(\type\()_8tap_h_tbl) - 640b
- .hword L(\type\()_8tap_h_tbl) - 320b
- .hword L(\type\()_8tap_h_tbl) - 160b
- .hword L(\type\()_8tap_h_tbl) - 80b
- .hword L(\type\()_8tap_h_tbl) - 40b
- .hword L(\type\()_8tap_h_tbl) - 20b
+L(\type\()_\taps\()_h_tbl):
+ .hword L(\type\()_\taps\()_h_tbl) - 1280b
+ .hword L(\type\()_\taps\()_h_tbl) - 640b
+ .hword L(\type\()_\taps\()_h_tbl) - 320b
+ .hword L(\type\()_\taps\()_h_tbl) - 160b
+ .hword L(\type\()_\taps\()_h_tbl) - 80b
+ .hword L(\type\()_\taps\()_h_tbl) - 40b
+ .hword L(\type\()_\taps\()_h_tbl) - 20b
.hword 0
-L(\type\()_8tap_v):
+L(\type\()_\taps\()_v):
cmp \h, #4
ubfx w9, \my, #7, #7
and \my, \my, #0x7f
@@ -1595,7 +1663,7 @@ L(\type\()_8tap_v):
4:
add \xmy, x10, \my, uxtw #3
- adr x9, L(\type\()_8tap_v_tbl)
+ adr x9, L(\type\()_\taps\()_v_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
br x9
@@ -1620,7 +1688,7 @@ L(\type\()_8tap_v):
interleave_1_h v1, v2, v3, v4, v5
b.gt 24f
uxtl_b v1, v2, v3, v4
- mul_mla_4 v6, v1, v2, v3, v4, .4h
+ mul_mla_4tap v6, v1, v2, v3, v4, .4h
sqrshrun_b 6, v6
st_h \d_strd, v6, 2
ret
@@ -1630,7 +1698,7 @@ L(\type\()_8tap_v):
interleave_1_h v5, v6, v7
interleave_2_s v1, v2, v3, v4, v5, v6
uxtl_b v1, v2, v3, v4
- mul_mla_4 v6, v1, v2, v3, v4, .8h
+ mul_mla_4tap v6, v1, v2, v3, v4, .8h
sqrshrun_b 6, v6
st_h \d_strd, v6, 4
ret
@@ -1655,7 +1723,7 @@ L(\type\()_8tap_v):
interleave_1_h v7, v16, v17, v18, v19
interleave_2_s v5, v6, v7, v16, v17, v18
uxtl_b v5, v6, v7, v16
- mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
+ mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_b 6, v30
st_h \d_strd, v30, 4
b.le 0f
@@ -1673,7 +1741,7 @@ L(\type\()_8tap_v):
load_h \sr2, \src, \s_strd, v16, v17
interleave_1_h v7, v16, v17
uxtl_b v5, v6, v7, v16
- mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
+ mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_b 6, v30
st_h \d_strd, v30, 2
0:
@@ -1698,13 +1766,13 @@ L(\type\()_8tap_v):
load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
interleave_1_s v1, v2, v3, v4, v5
uxtl_b v1, v2, v3, v4
- mul_mla_4 v6, v1, v2, v3, v4, .8h
+ mul_mla_4tap v6, v1, v2, v3, v4, .8h
shift_store_4 \type, \d_strd, v6
b.le 0f
load_s \sr2, \src, \s_strd, v6, v7
interleave_1_s v5, v6, v7
uxtl_b v5, v6
- mul_mla_4 v7, v3, v4, v5, v6, .8h
+ mul_mla_4tap v7, v3, v4, v5, v6, .8h
shift_store_4 \type, \d_strd, v7
0:
ret
@@ -1729,28 +1797,28 @@ L(\type\()_8tap_v):
load_s \sr2, \src, \s_strd, v23, v24, v25, v26
interleave_1_s v22, v23, v24, v25, v26
uxtl_b v22, v23, v24, v25
- mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
shift_store_4 \type, \d_strd, v1, v2
b.le 0f
load_s \sr2, \src, \s_strd, v27, v16
subs \h, \h, #2
interleave_1_s v26, v27, v16
uxtl_b v26, v27
- mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
+ mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
shift_store_4 \type, \d_strd, v1
b.le 0f
load_s \sr2, \src, \s_strd, v17, v18
subs \h, \h, #2
interleave_1_s v16, v17, v18
uxtl_b v16, v17
- mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
+ mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
shift_store_4 \type, \d_strd, v2
b.le 0f
subs \h, \h, #4
load_s \sr2, \src, \s_strd, v19, v20, v21, v22
interleave_1_s v18, v19, v20, v21, v22
uxtl_b v18, v19, v20, v21
- mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+ mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
shift_store_4 \type, \d_strd, v1, v2
b.gt 48b
0:
@@ -1773,14 +1841,14 @@ L(\type\()_8tap_v):
load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5
uxtl_b v1, v2, v3, v4, v5
- mul_mla_4 v6, v1, v2, v3, v4, .8h
- mul_mla_4 v7, v2, v3, v4, v5, .8h
+ mul_mla_4tap v6, v1, v2, v3, v4, .8h
+ mul_mla_4tap v7, v2, v3, v4, v5, .8h
shift_store_8 \type, \d_strd, v6, v7
b.le 0f
load_8b \sr2, \src, \s_strd, v6, v7
uxtl_b v6, v7
- mul_mla_4 v1, v3, v4, v5, v6, .8h
- mul_mla_4 v2, v4, v5, v6, v7, .8h
+ mul_mla_4tap v1, v3, v4, v5, v6, .8h
+ mul_mla_4tap v2, v4, v5, v6, v7, .8h
shift_store_8 \type, \d_strd, v1, v2
0:
ret
@@ -1809,32 +1877,32 @@ L(\type\()_8tap_v):
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v23, v24
uxtl_b v23, v24
- mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
+ mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_8 \type, \d_strd, v1, v2
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v25, v26
uxtl_b v25, v26
- mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
+ mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v27, v16
uxtl_b v27, v16
- mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
+ mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
shift_store_8 \type, \d_strd, v1, v2
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v17, v18
uxtl_b v17, v18
- mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
+ mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #4
load_8b \sr2, \src, \s_strd, v19, v20, v21, v22
uxtl_b v19, v20, v21, v22
- mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
- mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
+ mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
+ mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.gt 88b
9:
@@ -1882,10 +1950,10 @@ L(\type\()_8tap_v):
uxtl2 v25.8h, v3.16b
uxtl2 v26.8h, v4.16b
uxtl2 v27.8h, v5.16b
- mul_mla_4 v1, v16, v17, v18, v19, .8h
- mul_mla_4 v16, v17, v18, v19, v20, .8h
- mul_mla_4 v2, v23, v24, v25, v26, .8h
- mul_mla_4 v17, v24, v25, v26, v27, .8h
+ mul_mla_4tap v1, v16, v17, v18, v19, .8h
+ mul_mla_4tap v16, v17, v18, v19, v20, .8h
+ mul_mla_4tap v2, v23, v24, v25, v26, .8h
+ mul_mla_4tap v17, v24, v25, v26, v27, .8h
shift_store_16 \type, \d_strd, v1, v2, v16, v17
b.le 0f
load_16b \sr2, \src, \s_strd, v6, v7
@@ -1893,25 +1961,25 @@ L(\type\()_8tap_v):
uxtl v22.8h, v7.8b
uxtl2 v28.8h, v6.16b
uxtl2 v29.8h, v7.16b
- mul_mla_4 v1, v18, v19, v20, v21, .8h
- mul_mla_4 v3, v19, v20, v21, v22, .8h
- mul_mla_4 v2, v25, v26, v27, v28, .8h
- mul_mla_4 v4, v26, v27, v28, v29, .8h
+ mul_mla_4tap v1, v18, v19, v20, v21, .8h
+ mul_mla_4tap v3, v19, v20, v21, v22, .8h
+ mul_mla_4tap v2, v25, v26, v27, v28, .8h
+ mul_mla_4tap v4, v26, v27, v28, v29, .8h
shift_store_16 \type, \d_strd, v1, v2, v3, v4
0:
ret
-L(\type\()_8tap_v_tbl):
- .hword L(\type\()_8tap_v_tbl) - 1280b
- .hword L(\type\()_8tap_v_tbl) - 640b
- .hword L(\type\()_8tap_v_tbl) - 320b
- .hword L(\type\()_8tap_v_tbl) - 160b
- .hword L(\type\()_8tap_v_tbl) - 80b
- .hword L(\type\()_8tap_v_tbl) - 40b
- .hword L(\type\()_8tap_v_tbl) - 20b
+L(\type\()_\taps\()_v_tbl):
+ .hword L(\type\()_\taps\()_v_tbl) - 1280b
+ .hword L(\type\()_\taps\()_v_tbl) - 640b
+ .hword L(\type\()_\taps\()_v_tbl) - 320b
+ .hword L(\type\()_\taps\()_v_tbl) - 160b
+ .hword L(\type\()_\taps\()_v_tbl) - 80b
+ .hword L(\type\()_\taps\()_v_tbl) - 40b
+ .hword L(\type\()_\taps\()_v_tbl) - 20b
.hword 0
-L(\type\()_8tap_hv):
+L(\type\()_\taps\()_hv):
cmp \h, #4
ubfx w9, \my, #7, #7
and \my, \my, #0x7f
@@ -1920,7 +1988,7 @@ L(\type\()_8tap_hv):
4:
add \xmy, x10, \my, uxtw #3
- adr x9, L(\type\()_8tap_hv_tbl)
+ adr x9, L(\type\()_\taps\()_hv_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
br x9
@@ -1952,13 +2020,13 @@ L(\type\()_8tap_hv):
addp v28.4h, v28.4h, v29.4h
addp v16.4h, v28.4h, v28.4h
srshr v16.4h, v16.4h, #2
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
trn1 v16.2s, v16.2s, v28.2s
mov v17.8b, v28.8b
2:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v28.8b, #4
smull v2.4s, v16.4h, v1.h[0]
@@ -1997,19 +2065,27 @@ L(\type\()_8tap_hv):
addp v16.4h, v28.4h, v28.4h
srshr v16.4h, v16.4h, #2
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
trn1 v16.2s, v16.2s, v28.2s
mov v17.8b, v28.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v28.8b, #4
mov v19.8b, v28.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v20.8b, v19.8b, v28.8b, #4
mov v21.8b, v28.8b
28:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v22.8b, v21.8b, v28.8b, #4
+.ifc \taps, 6tap
+ smull v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -2018,6 +2094,7 @@ L(\type\()_8tap_hv):
smlal v2.4s, v21.4h, v1.h[5]
smlal v2.4s, v22.4h, v1.h[6]
smlal v2.4s, v28.4h, v1.h[7]
+.endif
sqrshrn v2.4h, v2.4s, #\shift_hv
sqxtun v2.8b, v2.8h
@@ -2036,7 +2113,7 @@ L(\type\()_8tap_hv):
0:
ret x15
-L(\type\()_8tap_filter_2):
+L(\type\()_\taps\()_filter_2):
ld1 {v28.8b}, [\sr2], \s_strd
ld1 {v30.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
@@ -2083,12 +2160,12 @@ L(\type\()_8tap_filter_2):
mla v31.4h, v30.4h, v0.h[3]
srshr v16.4h, v31.4h, #2
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v28.8b
mov v18.8b, v29.8b
4:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
@@ -2121,8 +2198,13 @@ L(\type\()_8tap_filter_2):
480: // 4x8, 4x16, 4x32 hv
ld1 {v1.8b}, [\xmy]
sub \src, \src, #1
+.ifc \taps, 6tap
+ sub \sr2, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+.else
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
+.endif
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
@@ -2139,20 +2221,38 @@ L(\type\()_8tap_filter_2):
mla v31.4h, v28.4h, v0.h[1]
mla v31.4h, v29.4h, v0.h[2]
mla v31.4h, v30.4h, v0.h[3]
+.ifc \taps, 6tap
+ srshr v18.4h, v31.4h, #2
+.else
srshr v16.4h, v31.4h, #2
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v28.8b
mov v18.8b, v29.8b
- bl L(\type\()_8tap_filter_4)
+.endif
+ bl L(\type\()_\taps\()_filter_4)
mov v19.8b, v28.8b
mov v20.8b, v29.8b
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v21.8b, v28.8b
mov v22.8b, v29.8b
48:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
+.ifc \taps, 6tap
+ smull v2.4s, v18.4h, v1.h[1]
+ smlal v2.4s, v19.4h, v1.h[2]
+ smlal v2.4s, v20.4h, v1.h[3]
+ smlal v2.4s, v21.4h, v1.h[4]
+ smlal v2.4s, v22.4h, v1.h[5]
+ smlal v2.4s, v28.4h, v1.h[6]
+ smull v3.4s, v19.4h, v1.h[1]
+ smlal v3.4s, v20.4h, v1.h[2]
+ smlal v3.4s, v21.4h, v1.h[3]
+ smlal v3.4s, v22.4h, v1.h[4]
+ smlal v3.4s, v28.4h, v1.h[5]
+ smlal v3.4s, v29.4h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -2169,6 +2269,7 @@ L(\type\()_8tap_filter_2):
smlal v3.4s, v22.4h, v1.h[5]
smlal v3.4s, v28.4h, v1.h[6]
smlal v3.4s, v29.4h, v1.h[7]
+.endif
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn v3.4h, v3.4s, #\shift_hv
subs \h, \h, #2
@@ -2182,8 +2283,10 @@ L(\type\()_8tap_filter_2):
st1 {v3.4h}, [\ds2], \d_strd
.endif
b.le 0f
+.ifc \taps, 8tap
mov v16.8b, v18.8b
mov v17.8b, v19.8b
+.endif
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
@@ -2193,7 +2296,7 @@ L(\type\()_8tap_filter_2):
0:
ret x15
-L(\type\()_8tap_filter_4):
+L(\type\()_\taps\()_filter_4):
ld1 {v26.8b}, [\sr2], \s_strd
ld1 {v27.8b}, [\src], \s_strd
uxtl v26.8h, v26.8b
@@ -2237,15 +2340,15 @@ L(\type\()_8tap_filter_4):
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
- bl L(\type\()_8tap_filter_8_first)
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8_first)
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v24.16b
mov v18.16b, v25.16b
8:
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2303,7 +2406,9 @@ L(\type\()_8tap_filter_4):
ld1 {v0.8b}, [\xmx]
ld1 {v1.8b}, [\xmy]
sub \src, \src, #3
+.ifc \taps, 8tap
sub \src, \src, \s_strd
+.endif
sub \src, \src, \s_strd, lsl #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
@@ -2316,21 +2421,52 @@ L(\type\()_8tap_filter_4):
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
- bl L(\type\()_8tap_filter_8_first)
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8_first)
+.ifc \taps, 6tap
+ mov v18.16b, v16.16b
+.else
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v24.16b
mov v18.16b, v25.16b
- bl L(\type\()_8tap_filter_8)
+.endif
+ bl L(\type\()_\taps\()_filter_8)
mov v19.16b, v24.16b
mov v20.16b, v25.16b
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v21.16b, v24.16b
mov v22.16b, v25.16b
88:
+.ifc \taps, 6tap
+ smull v2.4s, v18.4h, v1.h[1]
+ smull2 v3.4s, v18.8h, v1.h[1]
+ bl L(\type\()_\taps\()_filter_8)
+ smull v4.4s, v19.4h, v1.h[1]
+ smull2 v5.4s, v19.8h, v1.h[1]
+ smlal v2.4s, v19.4h, v1.h[2]
+ smlal2 v3.4s, v19.8h, v1.h[2]
+ smlal v4.4s, v20.4h, v1.h[2]
+ smlal2 v5.4s, v20.8h, v1.h[2]
+ smlal v2.4s, v20.4h, v1.h[3]
+ smlal2 v3.4s, v20.8h, v1.h[3]
+ smlal v4.4s, v21.4h, v1.h[3]
+ smlal2 v5.4s, v21.8h, v1.h[3]
+ smlal v2.4s, v21.4h, v1.h[4]
+ smlal2 v3.4s, v21.8h, v1.h[4]
+ smlal v4.4s, v22.4h, v1.h[4]
+ smlal2 v5.4s, v22.8h, v1.h[4]
+ smlal v2.4s, v22.4h, v1.h[5]
+ smlal2 v3.4s, v22.8h, v1.h[5]
+ smlal v4.4s, v24.4h, v1.h[5]
+ smlal2 v5.4s, v24.8h, v1.h[5]
+ smlal v2.4s, v24.4h, v1.h[6]
+ smlal2 v3.4s, v24.8h, v1.h[6]
+ smlal v4.4s, v25.4h, v1.h[6]
+ smlal2 v5.4s, v25.8h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2361,6 +2497,7 @@ L(\type\()_8tap_filter_4):
smlal2 v3.4s, v24.8h, v1.h[7]
smlal v4.4s, v25.4h, v1.h[7]
smlal2 v5.4s, v25.8h, v1.h[7]
+.endif
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn2 v2.8h, v3.4s, #\shift_hv
sqrshrn v4.4h, v4.4s, #\shift_hv
@@ -2376,8 +2513,10 @@ L(\type\()_8tap_filter_4):
st1 {v4.8h}, [\ds2], \d_strd
.endif
b.le 9f
+.ifc \taps, 8tap
mov v16.16b, v18.16b
mov v17.16b, v19.16b
+.endif
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v20.16b, v22.16b
@@ -2399,14 +2538,32 @@ L(\type\()_8tap_filter_4):
.else
add \dst, \dst, #16
.endif
+.ifc \taps, 6tap
+ add \src, \src, \s_strd, lsl #1
+.endif
b 168b
0:
ret x15
-L(\type\()_8tap_filter_8_first):
+L(\type\()_\taps\()_filter_8_first):
ld1 {v28.8b, v29.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
+.ifc \taps, 6tap
+ ext v24.16b, v28.16b, v29.16b, #(2*1)
+ ext v25.16b, v28.16b, v29.16b, #(2*2)
+ ext v26.16b, v28.16b, v29.16b, #(2*3)
+ ext v27.16b, v28.16b, v29.16b, #(2*4)
+ mul v16.8h, v24.8h, v0.h[1]
+ mla v16.8h, v25.8h, v0.h[2]
+ mla v16.8h, v26.8h, v0.h[3]
+ mla v16.8h, v27.8h, v0.h[4]
+ ext v24.16b, v28.16b, v29.16b, #(2*5)
+ ext v25.16b, v28.16b, v29.16b, #(2*6)
+ ext v26.16b, v28.16b, v29.16b, #(2*7)
+ mla v16.8h, v24.8h, v0.h[5]
+ mla v16.8h, v25.8h, v0.h[6]
+.else // 8tap
mul v16.8h, v28.8h, v0.h[0]
ext v24.16b, v28.16b, v29.16b, #(2*1)
ext v25.16b, v28.16b, v29.16b, #(2*2)
@@ -2422,16 +2579,29 @@ L(\type\()_8tap_filter_8_first):
mla v16.8h, v24.8h, v0.h[5]
mla v16.8h, v25.8h, v0.h[6]
mla v16.8h, v26.8h, v0.h[7]
+.endif
srshr v16.8h, v16.8h, #2
ret
-L(\type\()_8tap_filter_8):
+L(\type\()_\taps\()_filter_8):
ld1 {v28.8b, v29.8b}, [\sr2], \s_strd
ld1 {v30.8b, v31.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
uxtl v30.8h, v30.8b
uxtl v31.8h, v31.8b
+.ifc \taps, 6tap
+ ext v26.16b, v28.16b, v29.16b, #2
+ ext v27.16b, v30.16b, v31.16b, #2
+ mul v24.8h, v26.8h, v0.h[1]
+ mul v25.8h, v27.8h, v0.h[1]
+.irpc i, 23456
+ ext v26.16b, v28.16b, v29.16b, #(2*\i)
+ ext v27.16b, v30.16b, v31.16b, #(2*\i)
+ mla v24.8h, v26.8h, v0.h[\i]
+ mla v25.8h, v27.8h, v0.h[\i]
+.endr
+.else // 8tap
mul v24.8h, v28.8h, v0.h[0]
mul v25.8h, v30.8h, v0.h[0]
.irpc i, 1234567
@@ -2440,22 +2610,25 @@ L(\type\()_8tap_filter_8):
mla v24.8h, v26.8h, v0.h[\i]
mla v25.8h, v27.8h, v0.h[\i]
.endr
+.endif
srshr v24.8h, v24.8h, #2
srshr v25.8h, v25.8h, #2
ret
-L(\type\()_8tap_hv_tbl):
- .hword L(\type\()_8tap_hv_tbl) - 1280b
- .hword L(\type\()_8tap_hv_tbl) - 640b
- .hword L(\type\()_8tap_hv_tbl) - 320b
- .hword L(\type\()_8tap_hv_tbl) - 160b
- .hword L(\type\()_8tap_hv_tbl) - 80b
- .hword L(\type\()_8tap_hv_tbl) - 40b
- .hword L(\type\()_8tap_hv_tbl) - 20b
+L(\type\()_\taps\()_hv_tbl):
+ .hword L(\type\()_\taps\()_hv_tbl) - 1280b
+ .hword L(\type\()_\taps\()_hv_tbl) - 640b
+ .hword L(\type\()_\taps\()_hv_tbl) - 320b
+ .hword L(\type\()_\taps\()_hv_tbl) - 160b
+ .hword L(\type\()_\taps\()_hv_tbl) - 80b
+ .hword L(\type\()_\taps\()_hv_tbl) - 40b
+ .hword L(\type\()_\taps\()_hv_tbl) - 20b
.hword 0
endfunc
+.endm
+.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
function \type\()_bilin_8bpc_neon, export=1
dup v1.16b, \mx
dup v3.16b, \my
@@ -2987,8 +3160,34 @@ L(\type\()_bilin_hv_tbl):
endfunc
.endm
-filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
-filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn put, sharp, SHARP, SHARP, 8tap
+make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap
+
+make_8tap_fn put, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap
+filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
+
+make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn prep, sharp, SHARP, SHARP, 8tap
+make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 8tap
+
+make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 6tap
+filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+
.macro load_filter_row dst, src, inc
asr w13, \src, #10