From d8bbc7858622b6d9c278469aab701ca0b609cddf Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Wed, 15 May 2024 05:35:49 +0200
Subject: Merging upstream version 126.0.

Signed-off-by: Daniel Baumann
---
 third_party/dav1d/src/arm/64/mc.S | 411 ++++++++++++++++++++++++++++----------
 1 file changed, 305 insertions(+), 106 deletions(-)

(limited to 'third_party/dav1d/src/arm/64/mc.S')

diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S
index 9f7b4e7a89..3df0393c3a 100644
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -1154,7 +1154,7 @@ endfunc
 uxtl \r6\().8h, \r6\().8b
 .endif
 .endm
-.macro mul_mla_4 d, s0, s1, s2, s3, wd
+.macro mul_mla_4tap d, s0, s1, s2, s3, wd
 mul \d\wd, \s0\wd, v0.h[0]
 mla \d\wd, \s1\wd, v0.h[1]
 mla \d\wd, \s2\wd, v0.h[2]
 mla \d\wd, \s3\wd, v0.h[3]
 .endm
@@ -1163,7 +1163,51 @@ endfunc
 // Interleaving the mul/mla chains actually hurts performance
 // significantly on Cortex A53, thus keeping mul/mla tightly
 // chained like this.
-.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().4h, \s1\().4h, v0.h[1]
+ mla \d0\().4h, \s2\().4h, v0.h[2]
+ mla \d0\().4h, \s3\().4h, v0.h[3]
+ mla \d0\().4h, \s4\().4h, v0.h[4]
+ mla \d0\().4h, \s5\().4h, v0.h[5]
+ mla \d0\().4h, \s6\().4h, v0.h[6]
+.endm
+.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+.endm
+.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mul \d1\().8h, \s2\().8h, v0.h[1]
+ mla \d1\().8h, \s3\().8h, v0.h[2]
+ mla \d1\().8h, \s4\().8h, v0.h[3]
+ mla \d1\().8h, \s5\().8h, v0.h[4]
+ mla \d1\().8h, \s6\().8h, v0.h[5]
+ mla \d1\().8h, \s7\().8h, v0.h[6]
+.endm
+.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mul \d1\().8h, \s3\().8h, v0.h[1]
+ mla \d1\().8h, \s4\().8h, v0.h[2]
+ mla \d1\().8h, \s5\().8h, v0.h[3]
+ mla \d1\().8h, \s6\().8h, v0.h[4]
+ mla \d1\().8h, \s7\().8h, v0.h[5]
+ mla \d1\().8h, \s8\().8h, v0.h[6]
+.endm
+.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
 mul \d0\().4h, \s0\().4h, v0.h[0]
 mla \d0\().4h, \s1\().4h, v0.h[1]
 mla \d0\().4h, \s2\().4h, v0.h[2]
@@ -1173,7 +1217,7 @@ endfunc
 mla \d0\().4h, \s6\().4h, v0.h[6]
 mla \d0\().4h, \s7\().4h, v0.h[7]
 .endm
-.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
 mul \d0\().8h, \s0\().8h, v0.h[0]
 mla \d0\().8h, \s1\().8h, v0.h[1]
 mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1183,7 +1227,7 @@ endfunc
 mla \d0\().8h, \s6\().8h, v0.h[6]
 mla \d0\().8h, \s7\().8h, v0.h[7]
 .endm
-.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
 mul \d0\().8h, \s0\().8h, v0.h[0]
 mla \d0\().8h, \s1\().8h, v0.h[1]
 mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1201,7 +1245,7 @@ endfunc
 mla \d1\().8h, \s7\().8h, v0.h[6]
 mla \d1\().8h, \s8\().8h, v0.h[7]
 .endm
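The renamed mul_mla_4tap/mul_mla_8tap_* macros and the new mul_mla_6tap_* variants all compute the same multiply-accumulate chain; the 6-tap versions simply drop the first and last tap. A scalar C sketch of one output lane follows (function names, the 32-bit accumulator and the example kernel are illustrative only; the NEON code accumulates in 16 bits and applies its rounding shift later):

    #include <stdint.h>
    #include <stdio.h>

    /* One lane of mul_mla_8tap_*: dst = sum of coef[i] * row[i], i = 0..7. */
    static int32_t mul_mla_8tap(const int16_t coef[8], const int16_t row[8])
    {
        int32_t sum = coef[0] * row[0];
        for (int i = 1; i < 8; i++)
            sum += coef[i] * row[i];
        return sum;
    }

    /* One lane of mul_mla_6tap_*: taps 0 and 7 are skipped, which is only
     * valid when those coefficients are zero (regular/smooth kernels). */
    static int32_t mul_mla_6tap(const int16_t coef[8], const int16_t row[8])
    {
        int32_t sum = coef[1] * row[1];
        for (int i = 2; i < 7; i++)
            sum += coef[i] * row[i];
        return sum;
    }

    int main(void)
    {
        /* Made-up kernel with zero outer taps: both paths agree. */
        const int16_t coef[8] = { 0, 1, -7, 60, 14, -5, 1, 0 };
        const int16_t row[8]  = { 10, 20, 30, 40, 50, 60, 70, 80 };
        printf("%d %d\n", (int)mul_mla_8tap(coef, row), (int)mul_mla_6tap(coef, row));
        return 0;
    }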
-.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 +.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] @@ -1315,11 +1359,11 @@ endfunc .endif .endm -.macro make_8tap_fn op, type, type_h, type_v +.macro make_8tap_fn op, type, type_h, type_v, taps function \op\()_8tap_\type\()_8bpc_neon, export=1 mov x8, \type_h mov x9, \type_v - b \op\()_8tap_neon + b \op\()_\taps\()_neon endfunc .endm @@ -1328,18 +1372,8 @@ endfunc #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) -.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv -make_8tap_fn \type, regular, REGULAR, REGULAR -make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH -make_8tap_fn \type, regular_sharp, REGULAR, SHARP -make_8tap_fn \type, smooth, SMOOTH, SMOOTH -make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR -make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP -make_8tap_fn \type, sharp, SHARP, SHARP -make_8tap_fn \type, sharp_regular, SHARP, REGULAR -make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH - -function \type\()_8tap_neon +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps +function \type\()_\taps\()_neon mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, w10 mul \my, \my, w10 @@ -1354,12 +1388,12 @@ function \type\()_8tap_neon tst \mx, #(0x7f << 14) sub w8, w8, #24 movrel x10, X(mc_subpel_filters), -8 - b.ne L(\type\()_8tap_h) + b.ne L(\type\()_\taps\()_h) tst \my, #(0x7f << 14) - b.ne L(\type\()_8tap_v) + b.ne L(\type\()_\taps\()_v) b \type\()_neon -L(\type\()_8tap_h): +L(\type\()_\taps\()_h): cmp \w, #4 ubfx w9, \mx, #7, #7 and \mx, \mx, #0x7f @@ -1368,9 +1402,9 @@ L(\type\()_8tap_h): 4: tst \my, #(0x7f << 14) add \xmx, x10, \mx, uxtw #3 - b.ne L(\type\()_8tap_hv) + b.ne L(\type\()_\taps\()_hv) - adr x9, L(\type\()_8tap_h_tbl) + adr x9, L(\type\()_\taps\()_h_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 @@ -1471,6 +1505,18 @@ L(\type\()_8tap_h): uxtl v20.8h, v20.8b uxtl v21.8h, v21.8b +.ifc \taps, 6tap + ext v19.16b, v16.16b, v17.16b, #2 + ext v23.16b, v20.16b, v21.16b, #2 + mul v18.8h, v19.8h, v0.h[1] + mul v22.8h, v23.8h, v0.h[1] +.irpc i, 23456 + ext v19.16b, v16.16b, v17.16b, #(2*\i) + ext v23.16b, v20.16b, v21.16b, #(2*\i) + mla v18.8h, v19.8h, v0.h[\i] + mla v22.8h, v23.8h, v0.h[\i] +.endr +.else // 8tap mul v18.8h, v16.8h, v0.h[0] mul v22.8h, v20.8h, v0.h[0] .irpc i, 1234567 @@ -1479,6 +1525,7 @@ L(\type\()_8tap_h): mla v18.8h, v19.8h, v0.h[\i] mla v22.8h, v23.8h, v0.h[\i] .endr +.endif subs \h, \h, #2 srshr v18.8h, v18.8h, #2 srshr v22.8h, v22.8h, #2 @@ -1523,6 +1570,26 @@ L(\type\()_8tap_h): uxtl v22.8h, v22.8b 16: +.ifc \taps, 6tap + ext v28.16b, v16.16b, v17.16b, #2 + ext v29.16b, v17.16b, v18.16b, #2 + ext v30.16b, v20.16b, v21.16b, #2 + ext v31.16b, v21.16b, v22.16b, #2 + mul v24.8h, v28.8h, v0.h[1] + mul v25.8h, v29.8h, v0.h[1] + mul v26.8h, v30.8h, v0.h[1] + mul v27.8h, v31.8h, v0.h[1] +.irpc i, 23456 + ext v28.16b, v16.16b, v17.16b, #(2*\i) + ext v29.16b, v17.16b, v18.16b, #(2*\i) + ext v30.16b, v20.16b, v21.16b, #(2*\i) + ext v31.16b, v21.16b, v22.16b, #(2*\i) + mla v24.8h, v28.8h, v0.h[\i] + mla v25.8h, v29.8h, v0.h[\i] + mla v26.8h, v30.8h, v0.h[\i] + mla v27.8h, v31.8h, v0.h[\i] +.endr +.else // 8tap mul v24.8h, v16.8h, v0.h[0] mul v25.8h, v17.8h, v0.h[0] mul v26.8h, v20.8h, v0.h[0] @@ -1537,6 +1604,7 @@ L(\type\()_8tap_h): mla v26.8h, v30.8h, v0.h[\i] 
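In the horizontal paths above, the .ifc \taps, 6tap blocks build the same sliding-window FIR as the 8-tap code, just starting at ext #2 and using v0.h[1]..v0.h[6]. Roughly what one row computes, as a scalar C sketch (it assumes src points at the leftmost pixel of the 8-tap window and that dst keeps the srshr #2 scaling of the 16-bit intermediate):

    #include <stdint.h>
    #include <stddef.h>

    /* Horizontal pass for one row of width w.  The caller must provide
     * w + 7 readable source pixels.  For taps == 6 only coef[1..6] are
     * used, matching the ext #2 / v0.h[1..6] sequence in the assembly. */
    static void filter_h_row(int16_t *dst, const uint8_t *src, size_t w,
                             const int8_t coef[8], int taps)
    {
        const int first = (taps == 6) ? 1 : 0;
        for (size_t x = 0; x < w; x++) {
            int sum = 0;
            for (int i = first; i < first + taps; i++)
                sum += coef[i] * src[x + i];
            dst[x] = (int16_t)((sum + 2) >> 2);   /* srshr #2 */
        }
    }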
mla v27.8h, v31.8h, v0.h[\i] .endr +.endif srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 srshr v26.8h, v26.8h, #2 @@ -1575,18 +1643,18 @@ L(\type\()_8tap_h): b.gt 161b ret -L(\type\()_8tap_h_tbl): - .hword L(\type\()_8tap_h_tbl) - 1280b - .hword L(\type\()_8tap_h_tbl) - 640b - .hword L(\type\()_8tap_h_tbl) - 320b - .hword L(\type\()_8tap_h_tbl) - 160b - .hword L(\type\()_8tap_h_tbl) - 80b - .hword L(\type\()_8tap_h_tbl) - 40b - .hword L(\type\()_8tap_h_tbl) - 20b +L(\type\()_\taps\()_h_tbl): + .hword L(\type\()_\taps\()_h_tbl) - 1280b + .hword L(\type\()_\taps\()_h_tbl) - 640b + .hword L(\type\()_\taps\()_h_tbl) - 320b + .hword L(\type\()_\taps\()_h_tbl) - 160b + .hword L(\type\()_\taps\()_h_tbl) - 80b + .hword L(\type\()_\taps\()_h_tbl) - 40b + .hword L(\type\()_\taps\()_h_tbl) - 20b .hword 0 -L(\type\()_8tap_v): +L(\type\()_\taps\()_v): cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f @@ -1595,7 +1663,7 @@ L(\type\()_8tap_v): 4: add \xmy, x10, \my, uxtw #3 - adr x9, L(\type\()_8tap_v_tbl) + adr x9, L(\type\()_\taps\()_v_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 @@ -1620,7 +1688,7 @@ L(\type\()_8tap_v): interleave_1_h v1, v2, v3, v4, v5 b.gt 24f uxtl_b v1, v2, v3, v4 - mul_mla_4 v6, v1, v2, v3, v4, .4h + mul_mla_4tap v6, v1, v2, v3, v4, .4h sqrshrun_b 6, v6 st_h \d_strd, v6, 2 ret @@ -1630,7 +1698,7 @@ L(\type\()_8tap_v): interleave_1_h v5, v6, v7 interleave_2_s v1, v2, v3, v4, v5, v6 uxtl_b v1, v2, v3, v4 - mul_mla_4 v6, v1, v2, v3, v4, .8h + mul_mla_4tap v6, v1, v2, v3, v4, .8h sqrshrun_b 6, v6 st_h \d_strd, v6, 4 ret @@ -1655,7 +1723,7 @@ L(\type\()_8tap_v): interleave_1_h v7, v16, v17, v18, v19 interleave_2_s v5, v6, v7, v16, v17, v18 uxtl_b v5, v6, v7, v16 - mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 + mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 4 b.le 0f @@ -1673,7 +1741,7 @@ L(\type\()_8tap_v): load_h \sr2, \src, \s_strd, v16, v17 interleave_1_h v7, v16, v17 uxtl_b v5, v6, v7, v16 - mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 + mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 2 0: @@ -1698,13 +1766,13 @@ L(\type\()_8tap_v): load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4 - mul_mla_4 v6, v1, v2, v3, v4, .8h + mul_mla_4tap v6, v1, v2, v3, v4, .8h shift_store_4 \type, \d_strd, v6 b.le 0f load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 uxtl_b v5, v6 - mul_mla_4 v7, v3, v4, v5, v6, .8h + mul_mla_4tap v7, v3, v4, v5, v6, .8h shift_store_4 \type, \d_strd, v7 0: ret @@ -1729,28 +1797,28 @@ L(\type\()_8tap_v): load_s \sr2, \src, \s_strd, v23, v24, v25, v26 interleave_1_s v22, v23, v24, v25, v26 uxtl_b v22, v23, v24, v25 - mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 shift_store_4 \type, \d_strd, v1, v2 b.le 0f load_s \sr2, \src, \s_strd, v27, v16 subs \h, \h, #2 interleave_1_s v26, v27, v16 uxtl_b v26, v27 - mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 + mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 shift_store_4 \type, \d_strd, v1 b.le 0f load_s \sr2, \src, \s_strd, v17, v18 subs \h, \h, #2 interleave_1_s v16, v17, v18 uxtl_b v16, v17 - mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 + mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 shift_store_4 \type, \d_strd, v2 b.le 0f subs \h, \h, #4 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 
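The vertical-only paths keep the most recent input rows in registers and feed them to mul_mla_4tap / mul_mla_\taps\()_*; for the 6-tap entry the outermost two rows are simply left out of the sum. A scalar sketch of the 8-bit put flavour (the prep variant stores 16-bit intermediates with a different shift), assuming src points at the top row of the 8-tap window:

    #include <stdint.h>
    #include <stddef.h>

    static void filter_v_put(uint8_t *dst, ptrdiff_t dst_stride,
                             const uint8_t *src, ptrdiff_t src_stride,
                             int w, int h, const int8_t coef[8], int taps)
    {
        const int first = (taps == 6) ? 1 : 0;
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++) {
                int sum = 0;
                for (int i = first; i < first + taps; i++)
                    sum += coef[i] * src[(y + i) * src_stride + x];
                sum = (sum + 32) >> 6;            /* sqrshrun #6 */
                dst[y * dst_stride + x] = (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
            }
    }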
interleave_1_s v18, v19, v20, v21, v22 uxtl_b v18, v19, v20, v21 - mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 + mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 shift_store_4 \type, \d_strd, v1, v2 b.gt 48b 0: @@ -1773,14 +1841,14 @@ L(\type\()_8tap_v): load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4, v5 - mul_mla_4 v6, v1, v2, v3, v4, .8h - mul_mla_4 v7, v2, v3, v4, v5, .8h + mul_mla_4tap v6, v1, v2, v3, v4, .8h + mul_mla_4tap v7, v2, v3, v4, v5, .8h shift_store_8 \type, \d_strd, v6, v7 b.le 0f load_8b \sr2, \src, \s_strd, v6, v7 uxtl_b v6, v7 - mul_mla_4 v1, v3, v4, v5, v6, .8h - mul_mla_4 v2, v4, v5, v6, v7, .8h + mul_mla_4tap v1, v3, v4, v5, v6, .8h + mul_mla_4tap v2, v4, v5, v6, v7, .8h shift_store_8 \type, \d_strd, v1, v2 0: ret @@ -1809,32 +1877,32 @@ L(\type\()_8tap_v): subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v23, v24 uxtl_b v23, v24 - mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 + mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v25, v26 uxtl_b v25, v26 - mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 + mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v27, v16 uxtl_b v27, v16 - mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 + mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v17, v18 uxtl_b v17, v18 - mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 + mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #4 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 uxtl_b v19, v20, v21, v22 - mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 - mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 + mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 + mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.gt 88b 9: @@ -1882,10 +1950,10 @@ L(\type\()_8tap_v): uxtl2 v25.8h, v3.16b uxtl2 v26.8h, v4.16b uxtl2 v27.8h, v5.16b - mul_mla_4 v1, v16, v17, v18, v19, .8h - mul_mla_4 v16, v17, v18, v19, v20, .8h - mul_mla_4 v2, v23, v24, v25, v26, .8h - mul_mla_4 v17, v24, v25, v26, v27, .8h + mul_mla_4tap v1, v16, v17, v18, v19, .8h + mul_mla_4tap v16, v17, v18, v19, v20, .8h + mul_mla_4tap v2, v23, v24, v25, v26, .8h + mul_mla_4tap v17, v24, v25, v26, v27, .8h shift_store_16 \type, \d_strd, v1, v2, v16, v17 b.le 0f load_16b \sr2, \src, \s_strd, v6, v7 @@ -1893,25 +1961,25 @@ L(\type\()_8tap_v): uxtl v22.8h, v7.8b uxtl2 v28.8h, v6.16b uxtl2 v29.8h, v7.16b - mul_mla_4 v1, v18, v19, v20, v21, .8h - mul_mla_4 v3, v19, v20, v21, v22, .8h - mul_mla_4 v2, v25, v26, v27, v28, .8h - mul_mla_4 v4, v26, v27, v28, v29, .8h + mul_mla_4tap v1, v18, v19, v20, v21, .8h + mul_mla_4tap v3, v19, v20, v21, v22, .8h + mul_mla_4tap v2, v25, v26, v27, v28, .8h + mul_mla_4tap v4, v26, v27, v28, v29, .8h shift_store_16 \type, \d_strd, v1, v2, v3, v4 0: ret -L(\type\()_8tap_v_tbl): - .hword L(\type\()_8tap_v_tbl) - 1280b - .hword L(\type\()_8tap_v_tbl) - 640b - .hword L(\type\()_8tap_v_tbl) - 320b - .hword L(\type\()_8tap_v_tbl) - 160b - .hword L(\type\()_8tap_v_tbl) - 80b - 
.hword L(\type\()_8tap_v_tbl) - 40b - .hword L(\type\()_8tap_v_tbl) - 20b +L(\type\()_\taps\()_v_tbl): + .hword L(\type\()_\taps\()_v_tbl) - 1280b + .hword L(\type\()_\taps\()_v_tbl) - 640b + .hword L(\type\()_\taps\()_v_tbl) - 320b + .hword L(\type\()_\taps\()_v_tbl) - 160b + .hword L(\type\()_\taps\()_v_tbl) - 80b + .hword L(\type\()_\taps\()_v_tbl) - 40b + .hword L(\type\()_\taps\()_v_tbl) - 20b .hword 0 -L(\type\()_8tap_hv): +L(\type\()_\taps\()_hv): cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f @@ -1920,7 +1988,7 @@ L(\type\()_8tap_hv): 4: add \xmy, x10, \my, uxtw #3 - adr x9, L(\type\()_8tap_hv_tbl) + adr x9, L(\type\()_\taps\()_hv_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 @@ -1952,13 +2020,13 @@ L(\type\()_8tap_hv): addp v28.4h, v28.4h, v29.4h addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b 2: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v28.8b, #4 smull v2.4s, v16.4h, v1.h[0] @@ -1997,19 +2065,27 @@ L(\type\()_8tap_hv): addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v28.8b, #4 mov v19.8b, v28.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v20.8b, v19.8b, v28.8b, #4 mov v21.8b, v28.8b 28: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v22.8b, v21.8b, v28.8b, #4 +.ifc \taps, 6tap + smull v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -2018,6 +2094,7 @@ L(\type\()_8tap_hv): smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal v2.4s, v28.4h, v1.h[7] +.endif sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h @@ -2036,7 +2113,7 @@ L(\type\()_8tap_hv): 0: ret x15 -L(\type\()_8tap_filter_2): +L(\type\()_\taps\()_filter_2): ld1 {v28.8b}, [\sr2], \s_strd ld1 {v30.8b}, [\src], \s_strd uxtl v28.8h, v28.8b @@ -2083,12 +2160,12 @@ L(\type\()_8tap_filter_2): mla v31.4h, v30.4h, v0.h[3] srshr v16.4h, v31.4h, #2 - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b 4: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
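The hv paths chain the two passes: the horizontal filter produces 16-bit rows (rounded by srshr #2), which the vertical filter accumulates at 32-bit precision and narrows with sqrshrn #\shift_hv (10 for put, 6 for prep). A compact C sketch of the put case, assuming src points at the top-left of the 8-tap window and using a C99 VLA for the intermediate:

    #include <stdint.h>
    #include <stddef.h>

    static void filter_hv_put(uint8_t *dst, ptrdiff_t dst_stride,
                              const uint8_t *src, ptrdiff_t src_stride,
                              int w, int h,
                              const int8_t fh[8], const int8_t fv[8], int taps)
    {
        const int first = (taps == 6) ? 1 : 0;
        int16_t mid[h + 7][w];                        /* horizontally filtered rows */

        for (int y = 0; y < h + 7; y++)
            for (int x = 0; x < w; x++) {
                int sum = 0;
                for (int i = first; i < first + taps; i++)
                    sum += fh[i] * src[y * src_stride + x + i];
                mid[y][x] = (int16_t)((sum + 2) >> 2);        /* srshr #2 */
            }

        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++) {
                int sum = 0;
                for (int i = first; i < first + taps; i++)
                    sum += fv[i] * mid[y + i][x];
                sum = (sum + 512) >> 10;                      /* sqrshrn #10, put */
                dst[y * dst_stride + x] = (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
            }
    }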
@@ -2121,8 +2198,13 @@ L(\type\()_8tap_filter_2): 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #1 +.ifc \taps, 6tap + sub \sr2, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 +.else sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd +.endif add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 @@ -2139,20 +2221,38 @@ L(\type\()_8tap_filter_2): mla v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] +.ifc \taps, 6tap + srshr v18.4h, v31.4h, #2 +.else srshr v16.4h, v31.4h, #2 - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b - bl L(\type\()_8tap_filter_4) +.endif + bl L(\type\()_\taps\()_filter_4) mov v19.8b, v28.8b mov v20.8b, v29.8b - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v21.8b, v28.8b mov v22.8b, v29.8b 48: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) +.ifc \taps, 6tap + smull v2.4s, v18.4h, v1.h[1] + smlal v2.4s, v19.4h, v1.h[2] + smlal v2.4s, v20.4h, v1.h[3] + smlal v2.4s, v21.4h, v1.h[4] + smlal v2.4s, v22.4h, v1.h[5] + smlal v2.4s, v28.4h, v1.h[6] + smull v3.4s, v19.4h, v1.h[1] + smlal v3.4s, v20.4h, v1.h[2] + smlal v3.4s, v21.4h, v1.h[3] + smlal v3.4s, v22.4h, v1.h[4] + smlal v3.4s, v28.4h, v1.h[5] + smlal v3.4s, v29.4h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -2169,6 +2269,7 @@ L(\type\()_8tap_filter_2): smlal v3.4s, v22.4h, v1.h[5] smlal v3.4s, v28.4h, v1.h[6] smlal v3.4s, v29.4h, v1.h[7] +.endif sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn v3.4h, v3.4s, #\shift_hv subs \h, \h, #2 @@ -2182,8 +2283,10 @@ L(\type\()_8tap_filter_2): st1 {v3.4h}, [\ds2], \d_strd .endif b.le 0f +.ifc \taps, 8tap mov v16.8b, v18.8b mov v17.8b, v19.8b +.endif mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b @@ -2193,7 +2296,7 @@ L(\type\()_8tap_filter_2): 0: ret x15 -L(\type\()_8tap_filter_4): +L(\type\()_\taps\()_filter_4): ld1 {v26.8b}, [\sr2], \s_strd ld1 {v27.8b}, [\src], \s_strd uxtl v26.8h, v26.8b @@ -2237,15 +2340,15 @@ L(\type\()_8tap_filter_4): lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 - bl L(\type\()_8tap_filter_8_first) - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8_first) + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2303,7 +2406,9 @@ L(\type\()_8tap_filter_4): ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #3 +.ifc \taps, 8tap sub \src, \src, \s_strd +.endif sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b @@ -2316,21 +2421,52 @@ L(\type\()_8tap_filter_4): lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 - bl L(\type\()_8tap_filter_8_first) - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8_first) +.ifc \taps, 6tap + mov v18.16b, v16.16b +.else + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b - bl L(\type\()_8tap_filter_8) +.endif + bl L(\type\()_\taps\()_filter_8) mov v19.16b, v24.16b mov v20.16b, v25.16b - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v21.16b, v24.16b mov v22.16b, v25.16b 88: +.ifc \taps, 6tap + smull v2.4s, v18.4h, v1.h[1] + smull2 v3.4s, v18.8h, v1.h[1] + bl L(\type\()_\taps\()_filter_8) + smull v4.4s, v19.4h, v1.h[1] + smull2 v5.4s, v19.8h, v1.h[1] + smlal 
v2.4s, v19.4h, v1.h[2] + smlal2 v3.4s, v19.8h, v1.h[2] + smlal v4.4s, v20.4h, v1.h[2] + smlal2 v5.4s, v20.8h, v1.h[2] + smlal v2.4s, v20.4h, v1.h[3] + smlal2 v3.4s, v20.8h, v1.h[3] + smlal v4.4s, v21.4h, v1.h[3] + smlal2 v5.4s, v21.8h, v1.h[3] + smlal v2.4s, v21.4h, v1.h[4] + smlal2 v3.4s, v21.8h, v1.h[4] + smlal v4.4s, v22.4h, v1.h[4] + smlal2 v5.4s, v22.8h, v1.h[4] + smlal v2.4s, v22.4h, v1.h[5] + smlal2 v3.4s, v22.8h, v1.h[5] + smlal v4.4s, v24.4h, v1.h[5] + smlal2 v5.4s, v24.8h, v1.h[5] + smlal v2.4s, v24.4h, v1.h[6] + smlal2 v3.4s, v24.8h, v1.h[6] + smlal v4.4s, v25.4h, v1.h[6] + smlal2 v5.4s, v25.8h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2361,6 +2497,7 @@ L(\type\()_8tap_filter_4): smlal2 v3.4s, v24.8h, v1.h[7] smlal v4.4s, v25.4h, v1.h[7] smlal2 v5.4s, v25.8h, v1.h[7] +.endif sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn2 v2.8h, v3.4s, #\shift_hv sqrshrn v4.4h, v4.4s, #\shift_hv @@ -2376,8 +2513,10 @@ L(\type\()_8tap_filter_4): st1 {v4.8h}, [\ds2], \d_strd .endif b.le 9f +.ifc \taps, 8tap mov v16.16b, v18.16b mov v17.16b, v19.16b +.endif mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b @@ -2398,15 +2537,33 @@ L(\type\()_8tap_filter_4): add \dst, \dst, #8 .else add \dst, \dst, #16 +.endif +.ifc \taps, 6tap + add \src, \src, \s_strd, lsl #1 .endif b 168b 0: ret x15 -L(\type\()_8tap_filter_8_first): +L(\type\()_\taps\()_filter_8_first): ld1 {v28.8b, v29.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b +.ifc \taps, 6tap + ext v24.16b, v28.16b, v29.16b, #(2*1) + ext v25.16b, v28.16b, v29.16b, #(2*2) + ext v26.16b, v28.16b, v29.16b, #(2*3) + ext v27.16b, v28.16b, v29.16b, #(2*4) + mul v16.8h, v24.8h, v0.h[1] + mla v16.8h, v25.8h, v0.h[2] + mla v16.8h, v26.8h, v0.h[3] + mla v16.8h, v27.8h, v0.h[4] + ext v24.16b, v28.16b, v29.16b, #(2*5) + ext v25.16b, v28.16b, v29.16b, #(2*6) + ext v26.16b, v28.16b, v29.16b, #(2*7) + mla v16.8h, v24.8h, v0.h[5] + mla v16.8h, v25.8h, v0.h[6] +.else // 8tap mul v16.8h, v28.8h, v0.h[0] ext v24.16b, v28.16b, v29.16b, #(2*1) ext v25.16b, v28.16b, v29.16b, #(2*2) @@ -2422,16 +2579,29 @@ L(\type\()_8tap_filter_8_first): mla v16.8h, v24.8h, v0.h[5] mla v16.8h, v25.8h, v0.h[6] mla v16.8h, v26.8h, v0.h[7] +.endif srshr v16.8h, v16.8h, #2 ret -L(\type\()_8tap_filter_8): +L(\type\()_\taps\()_filter_8): ld1 {v28.8b, v29.8b}, [\sr2], \s_strd ld1 {v30.8b, v31.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b uxtl v30.8h, v30.8b uxtl v31.8h, v31.8b +.ifc \taps, 6tap + ext v26.16b, v28.16b, v29.16b, #2 + ext v27.16b, v30.16b, v31.16b, #2 + mul v24.8h, v26.8h, v0.h[1] + mul v25.8h, v27.8h, v0.h[1] +.irpc i, 23456 + ext v26.16b, v28.16b, v29.16b, #(2*\i) + ext v27.16b, v30.16b, v31.16b, #(2*\i) + mla v24.8h, v26.8h, v0.h[\i] + mla v25.8h, v27.8h, v0.h[\i] +.endr +.else // 8tap mul v24.8h, v28.8h, v0.h[0] mul v25.8h, v30.8h, v0.h[0] .irpc i, 1234567 @@ -2440,22 +2610,25 @@ L(\type\()_8tap_filter_8): mla v24.8h, v26.8h, v0.h[\i] mla v25.8h, v27.8h, v0.h[\i] .endr +.endif srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 ret -L(\type\()_8tap_hv_tbl): - .hword L(\type\()_8tap_hv_tbl) - 1280b - .hword L(\type\()_8tap_hv_tbl) - 640b - .hword L(\type\()_8tap_hv_tbl) - 320b - .hword L(\type\()_8tap_hv_tbl) - 160b - .hword L(\type\()_8tap_hv_tbl) - 80b - .hword L(\type\()_8tap_hv_tbl) - 40b - .hword L(\type\()_8tap_hv_tbl) - 20b 
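The .ifc \taps, 8tap guards around the mov v16/v17 pairs above are the row ring being rotated: the 8-tap loop keeps eight filtered rows live and shifts them down by two each iteration, the 6-tap loop only six, so two of the moves disappear. The same bookkeeping sketched in C, with an array of row buffers standing in for the vector registers (sizes are illustrative):

    #include <stdint.h>
    #include <string.h>

    #define ROW_W 8                       /* one 8-lane vector's worth */
    typedef int16_t row_t[ROW_W];

    /* Keep the newest 'taps' rows: discard the two oldest, shift the rest
     * down, and append the two rows produced by the current iteration. */
    static void rotate_row_ring(row_t *ring, int taps,
                                const row_t new0, const row_t new1)
    {
        memmove(&ring[0], &ring[2], (size_t)(taps - 2) * sizeof(row_t));
        memcpy(&ring[taps - 2], new0, sizeof(row_t));
        memcpy(&ring[taps - 1], new1, sizeof(row_t));
    }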
+L(\type\()_\taps\()_hv_tbl): + .hword L(\type\()_\taps\()_hv_tbl) - 1280b + .hword L(\type\()_\taps\()_hv_tbl) - 640b + .hword L(\type\()_\taps\()_hv_tbl) - 320b + .hword L(\type\()_\taps\()_hv_tbl) - 160b + .hword L(\type\()_\taps\()_hv_tbl) - 80b + .hword L(\type\()_\taps\()_hv_tbl) - 40b + .hword L(\type\()_\taps\()_hv_tbl) - 20b .hword 0 endfunc +.endm +.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv function \type\()_bilin_8bpc_neon, export=1 dup v1.16b, \mx dup v3.16b, \my @@ -2987,8 +3160,34 @@ L(\type\()_bilin_hv_tbl): endfunc .endm -filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 -filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 +make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn put, sharp, SHARP, SHARP, 8tap +make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap + +make_8tap_fn put, regular, REGULAR, REGULAR, 6tap +make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap +filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 + +make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn prep, sharp, SHARP, SHARP, 8tap +make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 8tap + +make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap +make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 6tap +filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 + .macro load_filter_row dst, src, inc asr w13, \src, #10 -- cgit v1.2.3
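The instantiation lists at the end are the point of the patch: filter combinations that involve SHARP keep the 8-tap entry point, while the regular/smooth combinations are routed through the new 6-tap code, because those subpel kernels have zero outer coefficients in both dimensions. A small C illustration of that eligibility test (the filter arrays are stand-ins for rows of dav1d's mc_subpel_filters table):

    #include <stdbool.h>
    #include <stdint.h>

    /* A filter pair may take the 6-tap path only if both 1-D kernels have
     * zero first and last taps; sharp kernels do not, so any combination
     * containing SHARP stays on the 8-tap entry point. */
    static bool can_use_6tap(const int8_t fh[8], const int8_t fv[8])
    {
        return fh[0] == 0 && fh[7] == 0 && fv[0] == 0 && fv[7] == 0;
    }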