Diffstat:
 -rw-r--r--  third_party/dav1d/src/arm/64/mc16.S  373
 1 file changed, 263 insertions(+), 110 deletions(-)
diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S
index 1bfb12ebb3..576fab158a 100644
--- a/third_party/dav1d/src/arm/64/mc16.S
+++ b/third_party/dav1d/src/arm/64/mc16.S
@@ -1374,19 +1374,35 @@ endfunc
sub \r3\wd, \r3\wd, \c\wd
.endif
.endm
-.macro smull_smlal_4 d, s0, s1, s2, s3
+.macro smull_smlal_4tap d, s0, s1, s2, s3
smull \d\().4s, \s0\().4h, v0.h[0]
smlal \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
smlal \d\().4s, \s3\().4h, v0.h[3]
.endm
-.macro smull2_smlal2_4 d, s0, s1, s2, s3
+.macro smull2_smlal2_4tap d, s0, s1, s2, s3
smull2 \d\().4s, \s0\().8h, v0.h[0]
smlal2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
smlal2 \d\().4s, \s3\().8h, v0.h[3]
.endm
-.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull \d\().4s, \s1\().4h, v0.h[1]
+ smlal \d\().4s, \s2\().4h, v0.h[2]
+ smlal \d\().4s, \s3\().4h, v0.h[3]
+ smlal \d\().4s, \s4\().4h, v0.h[4]
+ smlal \d\().4s, \s5\().4h, v0.h[5]
+ smlal \d\().4s, \s6\().4h, v0.h[6]
+.endm
+.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull2 \d\().4s, \s1\().8h, v0.h[1]
+ smlal2 \d\().4s, \s2\().8h, v0.h[2]
+ smlal2 \d\().4s, \s3\().8h, v0.h[3]
+ smlal2 \d\().4s, \s4\().8h, v0.h[4]
+ smlal2 \d\().4s, \s5\().8h, v0.h[5]
+ smlal2 \d\().4s, \s6\().8h, v0.h[6]
+.endm
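+// The 6tap macros accept the same s0-s7 arguments as their 8tap
+// counterparts so that call sites can expand smull_smlal_\taps uniformly;
+// \s0 and \s7 are unused, since the regular and smooth filters have zero
+// coefficients in taps 0 and 7.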
+.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull \d\().4s, \s0\().4h, v0.h[0]
smlal \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
@@ -1396,7 +1412,7 @@ endfunc
smlal \d\().4s, \s6\().4h, v0.h[6]
smlal \d\().4s, \s7\().4h, v0.h[7]
.endm
-.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull2 \d\().4s, \s0\().8h, v0.h[0]
smlal2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
@@ -1499,11 +1515,11 @@ endfunc
st1 {\r0\().8h, \r1\().8h}, [\dst], \strd
.endm
-.macro make_8tap_fn op, type, type_h, type_v
+.macro make_8tap_fn op, type, type_h, type_v, taps
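+// Emits the public \op\()_8tap_\type\()_16bpc_neon entry point, which
+// records the filter types and tail-calls the shared \taps (6tap or 8tap)
+// core.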
function \op\()_8tap_\type\()_16bpc_neon, export=1
mov w9, \type_h
mov w10, \type_v
- b \op\()_8tap_neon
+ b \op\()_\taps\()_neon
endfunc
.endm
@@ -1512,18 +1528,8 @@ endfunc
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)
-.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
-make_8tap_fn \type, regular, REGULAR, REGULAR
-make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
-make_8tap_fn \type, regular_sharp, REGULAR, SHARP
-make_8tap_fn \type, smooth, SMOOTH, SMOOTH
-make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
-make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
-make_8tap_fn \type, sharp, SHARP, SHARP
-make_8tap_fn \type, sharp_regular, SHARP, REGULAR
-make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
-
-function \type\()_8tap_neon
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
+function \type\()_\taps\()_neon
.ifc \bdmax, w8
ldr w8, [sp]
.endif
@@ -1547,12 +1553,12 @@ function \type\()_8tap_neon
add w13, w12, \bdmax // 6 + intermediate_bits
sub w12, w12, \bdmax // 6 - intermediate_bits
movrel x11, X(mc_subpel_filters), -8
- b.ne L(\type\()_8tap_h)
+ b.ne L(\type\()_\taps\()_h)
tst \my, #(0x7f << 14)
- b.ne L(\type\()_8tap_v)
+ b.ne L(\type\()_\taps\()_v)
b \type\()_neon
-L(\type\()_8tap_h):
+L(\type\()_\taps\()_h):
cmp \w, #4
ubfx w10, \mx, #7, #7
and \mx, \mx, #0x7f
@@ -1561,9 +1567,9 @@ L(\type\()_8tap_h):
4:
tst \my, #(0x7f << 14)
add \xmx, x11, \mx, uxtw #3
- b.ne L(\type\()_8tap_hv)
+ b.ne L(\type\()_\taps\()_hv)
- adr x10, L(\type\()_8tap_h_tbl)
+ adr x10, L(\type\()_\taps\()_h_tbl)
dup v30.4s, w12 // 6 - intermediate_bits
ldrh w9, [x10, x9, lsl #1]
neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -1682,6 +1688,22 @@ L(\type\()_8tap_h):
mov \mx, \w
8:
+.ifc \taps, 6tap
+ ext v24.16b, v16.16b, v17.16b, #2
+ ext v25.16b, v20.16b, v21.16b, #2
+ smull v18.4s, v24.4h, v0.h[1]
+ smull2 v19.4s, v24.8h, v0.h[1]
+ smull v22.4s, v25.4h, v0.h[1]
+ smull2 v23.4s, v25.8h, v0.h[1]
+.irpc i, 23456
+ ext v24.16b, v16.16b, v17.16b, #(2*\i)
+ ext v25.16b, v20.16b, v21.16b, #(2*\i)
+ smlal v18.4s, v24.4h, v0.h[\i]
+ smlal2 v19.4s, v24.8h, v0.h[\i]
+ smlal v22.4s, v25.4h, v0.h[\i]
+ smlal2 v23.4s, v25.8h, v0.h[\i]
+.endr
+.else // 8tap
smull v18.4s, v16.4h, v0.h[0]
smull2 v19.4s, v16.8h, v0.h[0]
smull v22.4s, v20.4h, v0.h[0]
@@ -1694,6 +1716,7 @@ L(\type\()_8tap_h):
smlal v22.4s, v25.4h, v0.h[\i]
smlal2 v23.4s, v25.8h, v0.h[\i]
.endr
+.endif
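+        // Both paths leave the accumulators in v18/v19 and v22/v23; the
+        // 6tap variant starts at tap 1 (ext #2) and stops at tap 6,
+        // dropping the two multiplies for the zero outer coefficients.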
subs \mx, \mx, #8
srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
@@ -1734,18 +1757,18 @@ L(\type\()_8tap_h):
b.gt 81b
ret
-L(\type\()_8tap_h_tbl):
- .hword L(\type\()_8tap_h_tbl) - 1280b
- .hword L(\type\()_8tap_h_tbl) - 640b
- .hword L(\type\()_8tap_h_tbl) - 320b
- .hword L(\type\()_8tap_h_tbl) - 160b
- .hword L(\type\()_8tap_h_tbl) - 80b
- .hword L(\type\()_8tap_h_tbl) - 40b
- .hword L(\type\()_8tap_h_tbl) - 20b
+L(\type\()_\taps\()_h_tbl):
+ .hword L(\type\()_\taps\()_h_tbl) - 1280b
+ .hword L(\type\()_\taps\()_h_tbl) - 640b
+ .hword L(\type\()_\taps\()_h_tbl) - 320b
+ .hword L(\type\()_\taps\()_h_tbl) - 160b
+ .hword L(\type\()_\taps\()_h_tbl) - 80b
+ .hword L(\type\()_\taps\()_h_tbl) - 40b
+ .hword L(\type\()_\taps\()_h_tbl) - 20b
.hword 0
-L(\type\()_8tap_v):
+L(\type\()_\taps\()_v):
cmp \h, #4
ubfx w10, \my, #7, #7
and \my, \my, #0x7f
@@ -1758,7 +1781,7 @@ L(\type\()_8tap_v):
dup v30.4s, w12 // 6 - intermediate_bits
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
- adr x10, L(\type\()_8tap_v_tbl)
+ adr x10, L(\type\()_\taps\()_v_tbl)
ldrh w9, [x10, x9, lsl #1]
.ifc \type, prep
neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -1785,7 +1808,7 @@ L(\type\()_8tap_v):
load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
interleave_1_s v1, v2, v3, v4, v5
b.gt 24f
- smull_smlal_4 v6, v1, v2, v3, v4
+ smull_smlal_4tap v6, v1, v2, v3, v4
sqrshrun_h 6, v6
umin_h v31, .8h, v6
st_s \d_strd, v6, 2
@@ -1794,8 +1817,8 @@ L(\type\()_8tap_v):
24: // 2x4 v
load_s \sr2, \src, \s_strd, v6, v7
interleave_1_s v5, v6, v7
- smull_smlal_4 v16, v1, v2, v3, v4
- smull_smlal_4 v17, v3, v4, v5, v6
+ smull_smlal_4tap v16, v1, v2, v3, v4
+ smull_smlal_4tap v17, v3, v4, v5, v6
sqrshrun_h 6, v16, v17
umin_h v31, .8h, v16
st_s \d_strd, v16, 4
@@ -1817,8 +1840,8 @@ L(\type\()_8tap_v):
subs \h, \h, #4
load_s \sr2, \src, \s_strd, v16, v17, v18, v19
interleave_1_s v7, v16, v17, v18, v19
- smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
- smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18
+ smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
+ smull_smlal_\taps v25, v3, v4, v5, v6, v7, v16, v17, v18
sqrshrun_h 6, v24, v25
umin_h v31, .8h, v24
st_s \d_strd, v24, 4
@@ -1836,7 +1859,7 @@ L(\type\()_8tap_v):
26:
load_s \sr2, \src, \s_strd, v16, v17
interleave_1_s v7, v16, v17
- smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
+ smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_h 6, v24
umin_h v31, .4h, v24
st_s \d_strd, v24, 2
@@ -1860,13 +1883,13 @@ L(\type\()_8tap_v):
sxtl v0.8h, v0.8b
load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
- smull_smlal_4 v6, v1, v2, v3, v4
- smull_smlal_4 v7, v2, v3, v4, v5
+ smull_smlal_4tap v6, v1, v2, v3, v4
+ smull_smlal_4tap v7, v2, v3, v4, v5
shift_store_4 \type, \d_strd, v6, v7
b.le 0f
load_4h \sr2, \src, \s_strd, v6, v7
- smull_smlal_4 v1, v3, v4, v5, v6
- smull_smlal_4 v2, v4, v5, v6, v7
+ smull_smlal_4tap v1, v3, v4, v5, v6
+ smull_smlal_4tap v2, v4, v5, v6, v7
shift_store_4 \type, \d_strd, v1, v2
0:
ret
@@ -1885,10 +1908,10 @@ L(\type\()_8tap_v):
48:
subs \h, \h, #4
load_4h \sr2, \src, \s_strd, v23, v24, v25, v26
- smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
- smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
- smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25
- smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_4 \type, \d_strd, v1, v2, v3, v4
b.le 0f
cmp \h, #2
@@ -1903,8 +1926,8 @@ L(\type\()_8tap_v):
b 48b
46:
load_4h \sr2, \src, \s_strd, v23, v24
- smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
- smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_4 \type, \d_strd, v1, v2
0:
ret
@@ -1925,17 +1948,17 @@ L(\type\()_8tap_v):
sxtl v0.8h, v0.8b
load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
- smull_smlal_4 v16, v1, v2, v3, v4
- smull2_smlal2_4 v17, v1, v2, v3, v4
- smull_smlal_4 v18, v2, v3, v4, v5
- smull2_smlal2_4 v19, v2, v3, v4, v5
+ smull_smlal_4tap v16, v1, v2, v3, v4
+ smull2_smlal2_4tap v17, v1, v2, v3, v4
+ smull_smlal_4tap v18, v2, v3, v4, v5
+ smull2_smlal2_4tap v19, v2, v3, v4, v5
shift_store_8 \type, \d_strd, v16, v17, v18, v19
b.le 0f
load_8h \sr2, \src, \s_strd, v6, v7
- smull_smlal_4 v16, v3, v4, v5, v6
- smull2_smlal2_4 v17, v3, v4, v5, v6
- smull_smlal_4 v18, v4, v5, v6, v7
- smull2_smlal2_4 v19, v4, v5, v6, v7
+ smull_smlal_4tap v16, v3, v4, v5, v6
+ smull2_smlal2_4tap v17, v3, v4, v5, v6
+ smull_smlal_4tap v18, v4, v5, v6, v7
+ smull2_smlal2_4tap v19, v4, v5, v6, v7
shift_store_8 \type, \d_strd, v16, v17, v18, v19
0:
ret
@@ -1962,18 +1985,18 @@ L(\type\()_8tap_v):
88:
subs \h, \h, #2
load_8h \sr2, \src, \s_strd, v23, v24
- smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
- smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23
- smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24
- smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_\taps v3, v17, v18, v19, v20, v21, v22, v23, v24
+ smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.le 9f
subs \h, \h, #2
load_8h \sr2, \src, \s_strd, v25, v26
- smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25
- smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25
- smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26
- smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ smull_smlal_\taps v1, v18, v19, v20, v21, v22, v23, v24, v25
+ smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_\taps v3, v19, v20, v21, v22, v23, v24, v25, v26
+ smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.le 9f
mov v16.16b, v20.16b
@@ -2013,10 +2036,10 @@ L(\type\()_8tap_v):
16:
load_16h \src, \src, \s_strd, v22, v23
subs \h, \h, #1
- smull_smlal_4 v1, v16, v18, v20, v22
- smull2_smlal2_4 v2, v16, v18, v20, v22
- smull_smlal_4 v3, v17, v19, v21, v23
- smull2_smlal2_4 v4, v17, v19, v21, v23
+ smull_smlal_4tap v1, v16, v18, v20, v22
+ smull2_smlal2_4tap v2, v16, v18, v20, v22
+ smull_smlal_4tap v3, v17, v19, v21, v23
+ smull2_smlal2_4tap v4, v17, v19, v21, v23
shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4
b.le 0f
mov v16.16b, v18.16b
@@ -2029,17 +2052,17 @@ L(\type\()_8tap_v):
0:
ret
-L(\type\()_8tap_v_tbl):
- .hword L(\type\()_8tap_v_tbl) - 1280b
- .hword L(\type\()_8tap_v_tbl) - 640b
- .hword L(\type\()_8tap_v_tbl) - 320b
- .hword L(\type\()_8tap_v_tbl) - 160b
- .hword L(\type\()_8tap_v_tbl) - 80b
- .hword L(\type\()_8tap_v_tbl) - 40b
- .hword L(\type\()_8tap_v_tbl) - 20b
+L(\type\()_\taps\()_v_tbl):
+ .hword L(\type\()_\taps\()_v_tbl) - 1280b
+ .hword L(\type\()_\taps\()_v_tbl) - 640b
+ .hword L(\type\()_\taps\()_v_tbl) - 320b
+ .hword L(\type\()_\taps\()_v_tbl) - 160b
+ .hword L(\type\()_\taps\()_v_tbl) - 80b
+ .hword L(\type\()_\taps\()_v_tbl) - 40b
+ .hword L(\type\()_\taps\()_v_tbl) - 20b
.hword 0
-L(\type\()_8tap_hv):
+L(\type\()_\taps\()_hv):
cmp \h, #4
ubfx w10, \my, #7, #7
and \my, \my, #0x7f
@@ -2048,7 +2071,7 @@ L(\type\()_8tap_hv):
4:
add \xmy, x11, \my, uxtw #3
- adr x10, L(\type\()_8tap_hv_tbl)
+ adr x10, L(\type\()_\taps\()_hv_tbl)
dup v30.4s, w12 // 6 - intermediate_bits
ldrh w9, [x10, x9, lsl #1]
neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -2089,7 +2112,7 @@ L(\type\()_8tap_hv):
addp v27.4s, v27.4s, v28.4s
addp v16.4s, v27.4s, v27.4s
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
// The intermediates from the horizontal pass fit in 16 bit without
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out of order cores
@@ -2100,7 +2123,7 @@ L(\type\()_8tap_hv):
mov v17.8b, v24.8b
2:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v24.8b, #4
smull v2.4s, v16.4h, v1.h[0]
@@ -2143,20 +2166,28 @@ L(\type\()_8tap_hv):
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
xtn v16.4h, v16.4s
trn1 v16.2s, v16.2s, v24.2s
mov v17.8b, v24.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v24.8b, #4
mov v19.8b, v24.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v20.8b, v19.8b, v24.8b, #4
mov v21.8b, v24.8b
28:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v22.8b, v21.8b, v24.8b, #4
+.ifc \taps, 6tap
+ smull v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+.else // 8tap
smull v3.4s, v16.4h, v1.h[0]
smlal v3.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[2]
@@ -2165,6 +2196,7 @@ L(\type\()_8tap_hv):
smlal v3.4s, v21.4h, v1.h[5]
smlal v3.4s, v22.4h, v1.h[6]
smlal v3.4s, v24.4h, v1.h[7]
+.endif
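+        // The 6tap sum omits the v16 (tap 0) and v24 (tap 7) terms; the
+        // result in v3 is the same, as those coefficients are zero here.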
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
sqxtun v3.4h, v3.4s
@@ -2184,7 +2216,7 @@ L(\type\()_8tap_hv):
0:
ret x15
-L(\type\()_8tap_filter_2):
+L(\type\()_\taps\()_filter_2):
ld1 {v25.8h}, [\sr2], \s_strd
ld1 {v27.8h}, [\src], \s_strd
ext v26.16b, v25.16b, v25.16b, #2
@@ -2234,12 +2266,12 @@ L(\type\()_8tap_filter_2):
// (at the cost of a smaller slowdown on in-order cores such as A53).
xtn v16.4h, v16.4s
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v24.8b
mov v18.8b, v25.8b
4:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -2272,8 +2304,13 @@ L(\type\()_8tap_filter_2):
480: // 4x8, 4x16, 4x32 hv
ld1 {v1.8b}, [\xmy]
sub \src, \src, #2
+.ifc \taps, 6tap
+ sub \sr2, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+.else
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
+.endif
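+        // The vertical prolog starts two rows (6tap) or three rows (8tap)
+        // above the first output row.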
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
@@ -2294,20 +2331,38 @@ L(\type\()_8tap_filter_2):
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
+.ifc \taps, 6tap
+ xtn v18.4h, v16.4s
+.else
xtn v16.4h, v16.4s
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v24.8b
mov v18.8b, v25.8b
- bl L(\type\()_8tap_filter_4)
+.endif
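+        // With six taps only five rows of history (v18-v22) are needed, so
+        // the first intermediate row goes straight into v18 and one priming
+        // filter_4 call is skipped.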
+ bl L(\type\()_\taps\()_filter_4)
mov v19.8b, v24.8b
mov v20.8b, v25.8b
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v21.8b, v24.8b
mov v22.8b, v25.8b
48:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
+.ifc \taps, 6tap
+ smull v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v19.4h, v1.h[2]
+ smlal v3.4s, v20.4h, v1.h[3]
+ smlal v3.4s, v21.4h, v1.h[4]
+ smlal v3.4s, v22.4h, v1.h[5]
+ smlal v3.4s, v24.4h, v1.h[6]
+ smull v4.4s, v19.4h, v1.h[1]
+ smlal v4.4s, v20.4h, v1.h[2]
+ smlal v4.4s, v21.4h, v1.h[3]
+ smlal v4.4s, v22.4h, v1.h[4]
+ smlal v4.4s, v24.4h, v1.h[5]
+ smlal v4.4s, v25.4h, v1.h[6]
+.else // 8tap
smull v3.4s, v16.4h, v1.h[0]
smlal v3.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[2]
@@ -2324,6 +2379,7 @@ L(\type\()_8tap_filter_2):
smlal v4.4s, v22.4h, v1.h[5]
smlal v4.4s, v24.4h, v1.h[6]
smlal v4.4s, v25.4h, v1.h[7]
+.endif
.ifc \type, put
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
@@ -2339,8 +2395,10 @@ L(\type\()_8tap_filter_2):
st1 {v3.d}[0], [\dst], \d_strd
st1 {v3.d}[1], [\ds2], \d_strd
b.le 0f
+.ifc \taps, 8tap
mov v16.8b, v18.8b
mov v17.8b, v19.8b
+.endif
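+        // Only the 8tap path keeps seven rows of history; 6tap skips
+        // rotating the two oldest registers.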
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
@@ -2350,7 +2408,7 @@ L(\type\()_8tap_filter_2):
0:
ret x15
-L(\type\()_8tap_filter_4):
+L(\type\()_\taps\()_filter_4):
ld1 {v24.8h}, [\sr2], \s_strd
ld1 {v25.8h}, [\src], \s_strd
ext v26.16b, v24.16b, v24.16b, #2
@@ -2411,14 +2469,14 @@ L(\type\()_8tap_filter_4):
// and conserves register space (no need to clobber v8-v15).
uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v23.16b
mov v18.16b, v24.16b
8:
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2480,7 +2538,9 @@ L(\type\()_8tap_filter_4):
ld1 {v0.8b}, [\xmx]
ld1 {v1.8b}, [\xmy]
sub \src, \src, #6
+.ifc \taps, 8tap
sub \src, \src, \s_strd
+.endif
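+        // Together with the unconditional sub below, \src ends up three
+        // rows (8tap) or two rows (6tap) above the first output row.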
sub \src, \src, \s_strd, lsl #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
@@ -2494,6 +2554,16 @@ L(\type\()_8tap_filter_4):
lsl \s_strd, \s_strd, #1
ld1 {v27.8h, v28.8h}, [\src], \s_strd
+.ifc \taps, 6tap
+ ext v26.16b, v27.16b, v28.16b, #2
+ smull v24.4s, v26.4h, v0.h[1]
+ smull2 v25.4s, v26.8h, v0.h[1]
+.irpc i, 23456
+ ext v26.16b, v27.16b, v28.16b, #(2*\i)
+ smlal v24.4s, v26.4h, v0.h[\i]
+ smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
+.else // 8tap
smull v24.4s, v27.4h, v0.h[0]
smull2 v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
@@ -2501,6 +2571,7 @@ L(\type\()_8tap_filter_4):
smlal v24.4s, v26.4h, v0.h[\i]
smlal2 v25.4s, v26.8h, v0.h[\i]
.endr
+.endif
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
// The intermediates from the horizontal pass fit in 16 bit without
@@ -2508,22 +2579,53 @@ L(\type\()_8tap_filter_4):
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53),
// and conserves register space (no need to clobber v8-v15).
+.ifc \taps, 6tap
+ uzp1 v18.8h, v24.8h, v25.8h // Same as xtn, xtn2
+.else
uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v23.16b
mov v18.16b, v24.16b
- bl L(\type\()_8tap_filter_8)
+.endif
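+        // As in the 4-wide case, 6tap primes only five rows (v18-v22) and
+        // saves one filter_8 call.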
+ bl L(\type\()_\taps\()_filter_8)
mov v19.16b, v23.16b
mov v20.16b, v24.16b
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v21.16b, v23.16b
mov v22.16b, v24.16b
88:
+.ifc \taps, 6tap
+ smull v2.4s, v18.4h, v1.h[1]
+ smull2 v3.4s, v18.8h, v1.h[1]
+ bl L(\type\()_\taps\()_filter_8)
+ smull v4.4s, v19.4h, v1.h[1]
+ smull2 v5.4s, v19.8h, v1.h[1]
+ smlal v2.4s, v19.4h, v1.h[2]
+ smlal2 v3.4s, v19.8h, v1.h[2]
+ smlal v4.4s, v20.4h, v1.h[2]
+ smlal2 v5.4s, v20.8h, v1.h[2]
+ smlal v2.4s, v20.4h, v1.h[3]
+ smlal2 v3.4s, v20.8h, v1.h[3]
+ smlal v4.4s, v21.4h, v1.h[3]
+ smlal2 v5.4s, v21.8h, v1.h[3]
+ smlal v2.4s, v21.4h, v1.h[4]
+ smlal2 v3.4s, v21.8h, v1.h[4]
+ smlal v4.4s, v22.4h, v1.h[4]
+ smlal2 v5.4s, v22.8h, v1.h[4]
+ smlal v2.4s, v22.4h, v1.h[5]
+ smlal2 v3.4s, v22.8h, v1.h[5]
+ smlal v4.4s, v23.4h, v1.h[5]
+ smlal2 v5.4s, v23.8h, v1.h[5]
+ smlal v2.4s, v23.4h, v1.h[6]
+ smlal2 v3.4s, v23.8h, v1.h[6]
+ smlal v4.4s, v24.4h, v1.h[6]
+ smlal2 v5.4s, v24.8h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2554,6 +2656,7 @@ L(\type\()_8tap_filter_4):
smlal2 v3.4s, v23.8h, v1.h[7]
smlal v4.4s, v24.4h, v1.h[7]
smlal2 v5.4s, v24.8h, v1.h[7]
+.endif
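+        // Both variants compute two output rows per iteration (v2/v3 and
+        // v4/v5) with the two accumulator chains interleaved, presumably to
+        // hide multiply-accumulate latency.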
.ifc \type, put
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
@@ -2577,8 +2680,10 @@ L(\type\()_8tap_filter_4):
st1 {v2.8h}, [\dst], \d_strd
st1 {v3.8h}, [\ds2], \d_strd
b.le 9f
+.ifc \taps, 8tap
mov v16.16b, v18.16b
mov v17.16b, v19.16b
+.endif
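+        // As above, only the 8tap path rotates the two oldest history
+        // registers.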
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v20.16b, v22.16b
@@ -2596,13 +2701,32 @@ L(\type\()_8tap_filter_4):
mov \h, \my
add \src, \src, #16
add \dst, \dst, #16
+.ifc \taps, 6tap
+ add \src, \src, \s_strd, lsl #1
+.endif
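+        // The shared per-column rewind assumes the 8tap prolog; the 6tap
+        // prolog reads two fewer source rows, so step \src forward to
+        // compensate before starting the next column.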
b 168b
0:
ret x15
-L(\type\()_8tap_filter_8):
+L(\type\()_\taps\()_filter_8):
ld1 {v4.8h, v5.8h}, [\sr2], \s_strd
ld1 {v6.8h, v7.8h}, [\src], \s_strd
+.ifc \taps, 6tap
+ ext v23.16b, v4.16b, v5.16b, #2
+ ext v24.16b, v6.16b, v7.16b, #2
+ smull v25.4s, v23.4h, v0.h[1]
+ smull2 v26.4s, v23.8h, v0.h[1]
+ smull v27.4s, v24.4h, v0.h[1]
+ smull2 v28.4s, v24.8h, v0.h[1]
+.irpc i, 23456
+ ext v23.16b, v4.16b, v5.16b, #(2*\i)
+ ext v24.16b, v6.16b, v7.16b, #(2*\i)
+ smlal v25.4s, v23.4h, v0.h[\i]
+ smlal2 v26.4s, v23.8h, v0.h[\i]
+ smlal v27.4s, v24.4h, v0.h[\i]
+ smlal2 v28.4s, v24.8h, v0.h[\i]
+.endr
+.else // 8tap
smull v25.4s, v4.4h, v0.h[0]
smull2 v26.4s, v4.8h, v0.h[0]
smull v27.4s, v6.4h, v0.h[0]
@@ -2615,6 +2739,7 @@ L(\type\()_8tap_filter_8):
smlal v27.4s, v24.4h, v0.h[\i]
smlal2 v28.4s, v24.8h, v0.h[\i]
.endr
+.endif
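+        // Either way this leaves four 32 bit accumulators: v25/v26 for the
+        // \sr2 row and v27/v28 for the \src row.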
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
@@ -2623,18 +2748,20 @@ L(\type\()_8tap_filter_8):
uzp1 v24.8h, v27.8h, v28.8h // Ditto
ret
-L(\type\()_8tap_hv_tbl):
- .hword L(\type\()_8tap_hv_tbl) - 1280b
- .hword L(\type\()_8tap_hv_tbl) - 640b
- .hword L(\type\()_8tap_hv_tbl) - 320b
- .hword L(\type\()_8tap_hv_tbl) - 160b
- .hword L(\type\()_8tap_hv_tbl) - 80b
- .hword L(\type\()_8tap_hv_tbl) - 40b
- .hword L(\type\()_8tap_hv_tbl) - 20b
+L(\type\()_\taps\()_hv_tbl):
+ .hword L(\type\()_\taps\()_hv_tbl) - 1280b
+ .hword L(\type\()_\taps\()_hv_tbl) - 640b
+ .hword L(\type\()_\taps\()_hv_tbl) - 320b
+ .hword L(\type\()_\taps\()_hv_tbl) - 160b
+ .hword L(\type\()_\taps\()_hv_tbl) - 80b
+ .hword L(\type\()_\taps\()_hv_tbl) - 40b
+ .hword L(\type\()_\taps\()_hv_tbl) - 20b
.hword 0
endfunc
+.endm
+.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
function \type\()_bilin_16bpc_neon, export=1
.ifc \bdmax, w8
ldr w8, [sp]
@@ -3236,8 +3363,34 @@ L(\type\()_bilin_hv_tbl):
endfunc
.endm
-filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
-filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
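+// Filter combinations involving SHARP need all 8 taps; the regular and
+// smooth filters have zero outer taps, so those combinations can use the
+// cheaper 6tap core.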
+make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn put, sharp, SHARP, SHARP, 8tap
+make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
+
+make_8tap_fn put, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
+filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+
+make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn prep, sharp, SHARP, SHARP, 8tap
+make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap
+
+make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap
+filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+
.macro load_filter_row dst, src, inc
asr w13, \src, #10