Diffstat (limited to 'third_party/dav1d/src/arm/64/mc16.S')
-rw-r--r--   third_party/dav1d/src/arm/64/mc16.S | 373
1 file changed, 263 insertions(+), 110 deletions(-)
diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S
index 1bfb12ebb3..576fab158a 100644
--- a/third_party/dav1d/src/arm/64/mc16.S
+++ b/third_party/dav1d/src/arm/64/mc16.S
@@ -1374,19 +1374,35 @@ endfunc
         sub \r3\wd, \r3\wd, \c\wd
 .endif
 .endm
-.macro smull_smlal_4 d, s0, s1, s2, s3
+.macro smull_smlal_4tap d, s0, s1, s2, s3
         smull \d\().4s, \s0\().4h, v0.h[0]
         smlal \d\().4s, \s1\().4h, v0.h[1]
         smlal \d\().4s, \s2\().4h, v0.h[2]
         smlal \d\().4s, \s3\().4h, v0.h[3]
 .endm
-.macro smull2_smlal2_4 d, s0, s1, s2, s3
+.macro smull2_smlal2_4tap d, s0, s1, s2, s3
         smull2 \d\().4s, \s0\().8h, v0.h[0]
         smlal2 \d\().4s, \s1\().8h, v0.h[1]
         smlal2 \d\().4s, \s2\().8h, v0.h[2]
         smlal2 \d\().4s, \s3\().8h, v0.h[3]
 .endm
-.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
+        smull \d\().4s, \s1\().4h, v0.h[1]
+        smlal \d\().4s, \s2\().4h, v0.h[2]
+        smlal \d\().4s, \s3\().4h, v0.h[3]
+        smlal \d\().4s, \s4\().4h, v0.h[4]
+        smlal \d\().4s, \s5\().4h, v0.h[5]
+        smlal \d\().4s, \s6\().4h, v0.h[6]
+.endm
+.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
+        smull2 \d\().4s, \s1\().8h, v0.h[1]
+        smlal2 \d\().4s, \s2\().8h, v0.h[2]
+        smlal2 \d\().4s, \s3\().8h, v0.h[3]
+        smlal2 \d\().4s, \s4\().8h, v0.h[4]
+        smlal2 \d\().4s, \s5\().8h, v0.h[5]
+        smlal2 \d\().4s, \s6\().8h, v0.h[6]
+.endm
+.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
         smull \d\().4s, \s0\().4h, v0.h[0]
         smlal \d\().4s, \s1\().4h, v0.h[1]
         smlal \d\().4s, \s2\().4h, v0.h[2]
@@ -1396,7 +1412,7 @@ endfunc
         smlal \d\().4s, \s6\().4h, v0.h[6]
         smlal \d\().4s, \s7\().4h, v0.h[7]
 .endm
-.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
         smull2 \d\().4s, \s0\().8h, v0.h[0]
         smlal2 \d\().4s, \s1\().8h, v0.h[1]
         smlal2 \d\().4s, \s2\().8h, v0.h[2]
@@ -1499,11 +1515,11 @@ endfunc
         st1 {\r0\().8h, \r1\().8h}, [\dst], \strd
 .endm
 
-.macro make_8tap_fn op, type, type_h, type_v
+.macro make_8tap_fn op, type, type_h, type_v, taps
 function \op\()_8tap_\type\()_16bpc_neon, export=1
         mov w9, \type_h
         mov w10, \type_v
-        b \op\()_8tap_neon
+        b \op\()_\taps\()_neon
 endfunc
 .endm
 
@@ -1512,18 +1528,8 @@ endfunc
 #define SMOOTH ((1*15<<7)|4*15)
 #define SHARP ((2*15<<7)|3*15)
 
-.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
-make_8tap_fn \type, regular, REGULAR, REGULAR
-make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
-make_8tap_fn \type, regular_sharp, REGULAR, SHARP
-make_8tap_fn \type, smooth, SMOOTH, SMOOTH
-make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
-make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
-make_8tap_fn \type, sharp, SHARP, SHARP
-make_8tap_fn \type, sharp_regular, SHARP, REGULAR
-make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
-
-function \type\()_8tap_neon
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
+function \type\()_\taps\()_neon
 .ifc \bdmax, w8
         ldr w8, [sp]
 .endif
@@ -1547,12 +1553,12 @@ function \type\()_8tap_neon
         add w13, w12, \bdmax // 6 + intermediate_bits
         sub w12, w12, \bdmax // 6 - intermediate_bits
         movrel x11, X(mc_subpel_filters), -8
-        b.ne L(\type\()_8tap_h)
+        b.ne L(\type\()_\taps\()_h)
         tst \my, #(0x7f << 14)
-        b.ne L(\type\()_8tap_v)
+        b.ne L(\type\()_\taps\()_v)
         b \type\()_neon
 
-L(\type\()_8tap_h):
+L(\type\()_\taps\()_h):
         cmp \w, #4
         ubfx w10, \mx, #7, #7
         and \mx, \mx, #0x7f
@@ -1561,9 +1567,9 @@ L(\type\()_8tap_h):
 4:
         tst \my, #(0x7f << 14)
         add \xmx, x11, \mx, uxtw #3
-        b.ne L(\type\()_8tap_hv)
+        b.ne L(\type\()_\taps\()_hv)
 
-        adr x10, L(\type\()_8tap_h_tbl)
+        adr x10, L(\type\()_\taps\()_h_tbl)
         dup v30.4s, w12 // 6 - intermediate_bits
         ldrh w9, [x10, x9, lsl #1]
         neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -1682,6 +1688,22 @@ L(\type\()_8tap_h):
         mov \mx, \w
 
 8:
+.ifc \taps, 6tap
+        ext v24.16b, v16.16b, v17.16b, #2
+        ext v25.16b, v20.16b, v21.16b, #2
+        smull v18.4s, v24.4h, v0.h[1]
+        smull2 v19.4s, v24.8h, v0.h[1]
+        smull v22.4s, v25.4h, v0.h[1]
+        smull2 v23.4s, v25.8h, v0.h[1]
+.irpc i, 23456
+        ext v24.16b, v16.16b, v17.16b, #(2*\i)
+        ext v25.16b, v20.16b, v21.16b, #(2*\i)
+        smlal v18.4s, v24.4h, v0.h[\i]
+        smlal2 v19.4s, v24.8h, v0.h[\i]
+        smlal v22.4s, v25.4h, v0.h[\i]
+        smlal2 v23.4s, v25.8h, v0.h[\i]
+.endr
+.else // 8tap
         smull v18.4s, v16.4h, v0.h[0]
         smull2 v19.4s, v16.8h, v0.h[0]
         smull v22.4s, v20.4h, v0.h[0]
@@ -1694,6 +1716,7 @@ L(\type\()_8tap_h):
         smlal v22.4s, v25.4h, v0.h[\i]
         smlal2 v23.4s, v25.8h, v0.h[\i]
 .endr
+.endif
         subs \mx, \mx, #8
         srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
         srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
@@ -1734,18 +1757,18 @@ L(\type\()_8tap_h):
         b.gt 81b
         ret
 
-L(\type\()_8tap_h_tbl):
-        .hword L(\type\()_8tap_h_tbl) - 1280b
-        .hword L(\type\()_8tap_h_tbl) - 640b
-        .hword L(\type\()_8tap_h_tbl) - 320b
-        .hword L(\type\()_8tap_h_tbl) - 160b
-        .hword L(\type\()_8tap_h_tbl) - 80b
-        .hword L(\type\()_8tap_h_tbl) - 40b
-        .hword L(\type\()_8tap_h_tbl) - 20b
+L(\type\()_\taps\()_h_tbl):
+        .hword L(\type\()_\taps\()_h_tbl) - 1280b
+        .hword L(\type\()_\taps\()_h_tbl) - 640b
+        .hword L(\type\()_\taps\()_h_tbl) - 320b
+        .hword L(\type\()_\taps\()_h_tbl) - 160b
+        .hword L(\type\()_\taps\()_h_tbl) - 80b
+        .hword L(\type\()_\taps\()_h_tbl) - 40b
+        .hword L(\type\()_\taps\()_h_tbl) - 20b
         .hword 0
 
-L(\type\()_8tap_v):
+L(\type\()_\taps\()_v):
         cmp \h, #4
         ubfx w10, \my, #7, #7
         and \my, \my, #0x7f
@@ -1758,7 +1781,7 @@ L(\type\()_8tap_v):
         dup v30.4s, w12 // 6 - intermediate_bits
         movi v29.8h, #(PREP_BIAS >> 8), lsl #8
 .endif
-        adr x10, L(\type\()_8tap_v_tbl)
+        adr x10, L(\type\()_\taps\()_v_tbl)
         ldrh w9, [x10, x9, lsl #1]
 .ifc \type, prep
         neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -1785,7 +1808,7 @@ L(\type\()_8tap_v):
         load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
         interleave_1_s v1, v2, v3, v4, v5
         b.gt 24f
-        smull_smlal_4 v6, v1, v2, v3, v4
+        smull_smlal_4tap v6, v1, v2, v3, v4
         sqrshrun_h 6, v6
         umin_h v31, .8h, v6
         st_s \d_strd, v6, 2
@@ -1794,8 +1817,8 @@ L(\type\()_8tap_v):
 24: // 2x4 v
         load_s \sr2, \src, \s_strd, v6, v7
         interleave_1_s v5, v6, v7
-        smull_smlal_4 v16, v1, v2, v3, v4
-        smull_smlal_4 v17, v3, v4, v5, v6
+        smull_smlal_4tap v16, v1, v2, v3, v4
+        smull_smlal_4tap v17, v3, v4, v5, v6
         sqrshrun_h 6, v16, v17
         umin_h v31, .8h, v16
         st_s \d_strd, v16, 4
@@ -1817,8 +1840,8 @@ L(\type\()_8tap_v):
         subs \h, \h, #4
         load_s \sr2, \src, \s_strd, v16, v17, v18, v19
         interleave_1_s v7, v16, v17, v18, v19
-        smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
-        smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18
+        smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
+        smull_smlal_\taps v25, v3, v4, v5, v6, v7, v16, v17, v18
         sqrshrun_h 6, v24, v25
         umin_h v31, .8h, v24
         st_s \d_strd, v24, 4
@@ -1836,7 +1859,7 @@ L(\type\()_8tap_v):
 26:
         load_s \sr2, \src, \s_strd, v16, v17
         interleave_1_s v7, v16, v17
-        smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
+        smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
         sqrshrun_h 6, v24
         umin_h v31, .4h, v24
         st_s \d_strd, v24, 2
@@ -1860,13 +1883,13 @@ L(\type\()_8tap_v):
         sxtl v0.8h, v0.8b
 
         load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
-        smull_smlal_4 v6, v1, v2, v3, v4
-        smull_smlal_4 v7, v2, v3, v4, v5
+        smull_smlal_4tap v6, v1, v2, v3, v4
+        smull_smlal_4tap v7, v2, v3, v4, v5
         shift_store_4 \type, \d_strd, v6, v7
         b.le 0f
         load_4h \sr2, \src, \s_strd, v6, v7
-        smull_smlal_4 v1, v3, v4, v5, v6
-        smull_smlal_4 v2, v4, v5, v6, v7
+        smull_smlal_4tap v1, v3, v4, v5, v6
+        smull_smlal_4tap v2, v4, v5, v6, v7
         shift_store_4 \type, \d_strd, v1, v2
 0:
         ret
@@ -1885,10 +1908,10 @@ L(\type\()_8tap_v):
 48:
         subs \h, \h, #4
         load_4h \sr2, \src, \s_strd, v23, v24, v25, v26
-        smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
-        smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
-        smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25
-        smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
+        smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25
+        smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
         shift_store_4 \type, \d_strd, v1, v2, v3, v4
         b.le 0f
         cmp \h, #2
@@ -1903,8 +1926,8 @@ L(\type\()_8tap_v):
         b 48b
 46:
         load_4h \sr2, \src, \s_strd, v23, v24
-        smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
-        smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
+        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
         shift_store_4 \type, \d_strd, v1, v2
 0:
         ret
@@ -1925,17 +1948,17 @@ L(\type\()_8tap_v):
         sxtl v0.8h, v0.8b
 
         load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
-        smull_smlal_4 v16, v1, v2, v3, v4
-        smull2_smlal2_4 v17, v1, v2, v3, v4
-        smull_smlal_4 v18, v2, v3, v4, v5
-        smull2_smlal2_4 v19, v2, v3, v4, v5
+        smull_smlal_4tap v16, v1, v2, v3, v4
+        smull2_smlal2_4tap v17, v1, v2, v3, v4
+        smull_smlal_4tap v18, v2, v3, v4, v5
+        smull2_smlal2_4tap v19, v2, v3, v4, v5
         shift_store_8 \type, \d_strd, v16, v17, v18, v19
         b.le 0f
         load_8h \sr2, \src, \s_strd, v6, v7
-        smull_smlal_4 v16, v3, v4, v5, v6
-        smull2_smlal2_4 v17, v3, v4, v5, v6
-        smull_smlal_4 v18, v4, v5, v6, v7
-        smull2_smlal2_4 v19, v4, v5, v6, v7
+        smull_smlal_4tap v16, v3, v4, v5, v6
+        smull2_smlal2_4tap v17, v3, v4, v5, v6
+        smull_smlal_4tap v18, v4, v5, v6, v7
+        smull2_smlal2_4tap v19, v4, v5, v6, v7
         shift_store_8 \type, \d_strd, v16, v17, v18, v19
 0:
         ret
@@ -1962,18 +1985,18 @@ L(\type\()_8tap_v):
 88:
         subs \h, \h, #2
         load_8h \sr2, \src, \s_strd, v23, v24
-        smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
-        smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23
-        smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24
-        smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24
+        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+        smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23
+        smull_smlal_\taps v3, v17, v18, v19, v20, v21, v22, v23, v24
+        smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24
         shift_store_8 \type, \d_strd, v1, v2, v3, v4
         b.le 9f
         subs \h, \h, #2
         load_8h \sr2, \src, \s_strd, v25, v26
-        smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25
-        smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25
-        smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26
-        smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+        smull_smlal_\taps v1, v18, v19, v20, v21, v22, v23, v24, v25
+        smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25
+        smull_smlal_\taps v3, v19, v20, v21, v22, v23, v24, v25, v26
+        smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
         shift_store_8 \type, \d_strd, v1, v2, v3, v4
         b.le 9f
         mov v16.16b, v20.16b
@@ -2013,10 +2036,10 @@ L(\type\()_8tap_v):
 16:
         load_16h \src, \src, \s_strd, v22, v23
         subs \h, \h, #1
-        smull_smlal_4 v1, v16, v18, v20, v22
-        smull2_smlal2_4 v2, v16, v18, v20, v22
-        smull_smlal_4 v3, v17, v19, v21, v23
-        smull2_smlal2_4 v4, v17, v19, v21, v23
+        smull_smlal_4tap v1, v16, v18, v20, v22
+        smull2_smlal2_4tap v2, v16, v18, v20, v22
+        smull_smlal_4tap v3, v17, v19, v21, v23
+        smull2_smlal2_4tap v4, v17, v19, v21, v23
         shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4
         b.le 0f
         mov v16.16b, v18.16b
@@ -2029,17 +2052,17 @@ L(\type\()_8tap_v):
 0:
         ret
 
-L(\type\()_8tap_v_tbl):
-        .hword L(\type\()_8tap_v_tbl) - 1280b
-        .hword L(\type\()_8tap_v_tbl) - 640b
-        .hword L(\type\()_8tap_v_tbl) - 320b
-        .hword L(\type\()_8tap_v_tbl) - 160b
-        .hword L(\type\()_8tap_v_tbl) - 80b
-        .hword L(\type\()_8tap_v_tbl) - 40b
-        .hword L(\type\()_8tap_v_tbl) - 20b
+L(\type\()_\taps\()_v_tbl):
+        .hword L(\type\()_\taps\()_v_tbl) - 1280b
+        .hword L(\type\()_\taps\()_v_tbl) - 640b
+        .hword L(\type\()_\taps\()_v_tbl) - 320b
+        .hword L(\type\()_\taps\()_v_tbl) - 160b
+        .hword L(\type\()_\taps\()_v_tbl) - 80b
+        .hword L(\type\()_\taps\()_v_tbl) - 40b
+        .hword L(\type\()_\taps\()_v_tbl) - 20b
         .hword 0
 
-L(\type\()_8tap_hv):
+L(\type\()_\taps\()_hv):
         cmp \h, #4
         ubfx w10, \my, #7, #7
         and \my, \my, #0x7f
@@ -2048,7 +2071,7 @@ L(\type\()_8tap_hv):
 4:
         add \xmy, x11, \my, uxtw #3
 
-        adr x10, L(\type\()_8tap_hv_tbl)
+        adr x10, L(\type\()_\taps\()_hv_tbl)
         dup v30.4s, w12 // 6 - intermediate_bits
         ldrh w9, [x10, x9, lsl #1]
         neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -2089,7 +2112,7 @@ L(\type\()_8tap_hv):
         addp v27.4s, v27.4s, v28.4s
         addp v16.4s, v27.4s, v27.4s
         srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
-        bl L(\type\()_8tap_filter_2)
+        bl L(\type\()_\taps\()_filter_2)
         // The intermediates from the horizontal pass fit in 16 bit without
         // any bias; we could just as well keep them as .4s, but narrowing
         // them to .4h gives a significant speedup on out of order cores
@@ -2100,7 +2123,7 @@ L(\type\()_8tap_hv):
         mov v17.8b, v24.8b
 
 2:
-        bl L(\type\()_8tap_filter_2)
+        bl L(\type\()_\taps\()_filter_2)
 
         ext v18.8b, v17.8b, v24.8b, #4
         smull v2.4s, v16.4h, v1.h[0]
@@ -2143,20 +2166,28 @@ L(\type\()_8tap_hv):
         // them to .4h gives a significant speedup on out of order cores
         // (at the cost of a smaller slowdown on in-order cores such as A53).
 
-        bl L(\type\()_8tap_filter_2)
+        bl L(\type\()_\taps\()_filter_2)
         xtn v16.4h, v16.4s
         trn1 v16.2s, v16.2s, v24.2s
         mov v17.8b, v24.8b
-        bl L(\type\()_8tap_filter_2)
+        bl L(\type\()_\taps\()_filter_2)
         ext v18.8b, v17.8b, v24.8b, #4
         mov v19.8b, v24.8b
-        bl L(\type\()_8tap_filter_2)
+        bl L(\type\()_\taps\()_filter_2)
         ext v20.8b, v19.8b, v24.8b, #4
         mov v21.8b, v24.8b
 
 28:
-        bl L(\type\()_8tap_filter_2)
+        bl L(\type\()_\taps\()_filter_2)
         ext v22.8b, v21.8b, v24.8b, #4
+.ifc \taps, 6tap
+        smull v3.4s, v17.4h, v1.h[1]
+        smlal v3.4s, v18.4h, v1.h[2]
+        smlal v3.4s, v19.4h, v1.h[3]
+        smlal v3.4s, v20.4h, v1.h[4]
+        smlal v3.4s, v21.4h, v1.h[5]
+        smlal v3.4s, v22.4h, v1.h[6]
+.else // 8tap
         smull v3.4s, v16.4h, v1.h[0]
         smlal v3.4s, v17.4h, v1.h[1]
         smlal v3.4s, v18.4h, v1.h[2]
@@ -2165,6 +2196,7 @@ L(\type\()_8tap_hv):
         smlal v3.4s, v21.4h, v1.h[5]
         smlal v3.4s, v22.4h, v1.h[6]
         smlal v3.4s, v24.4h, v1.h[7]
+.endif
         srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
         sqxtun v3.4h, v3.4s
@@ -2184,7 +2216,7 @@ L(\type\()_8tap_hv):
 0:
         ret x15
 
-L(\type\()_8tap_filter_2):
+L(\type\()_\taps\()_filter_2):
         ld1 {v25.8h}, [\sr2], \s_strd
         ld1 {v27.8h}, [\src], \s_strd
         ext v26.16b, v25.16b, v25.16b, #2
@@ -2234,12 +2266,12 @@ L(\type\()_8tap_filter_2):
         // (at the cost of a smaller slowdown on in-order cores such as A53).
         xtn v16.4h, v16.4s
 
-        bl L(\type\()_8tap_filter_4)
+        bl L(\type\()_\taps\()_filter_4)
         mov v17.8b, v24.8b
         mov v18.8b, v25.8b
 
 4:
-        bl L(\type\()_8tap_filter_4)
+        bl L(\type\()_\taps\()_filter_4)
         smull v2.4s, v16.4h, v1.h[0]
         smlal v2.4s, v17.4h, v1.h[1]
         smlal v2.4s, v18.4h, v1.h[2]
@@ -2272,8 +2304,13 @@ L(\type\()_8tap_filter_2):
 480: // 4x8, 4x16, 4x32 hv
         ld1 {v1.8b}, [\xmy]
         sub \src, \src, #2
+.ifc \taps, 6tap
+        sub \sr2, \src, \s_strd
+        sub \src, \src, \s_strd, lsl #1
+.else
         sub \sr2, \src, \s_strd, lsl #1
         sub \src, \sr2, \s_strd
+.endif
         add \ds2, \dst, \d_strd
         lsl \s_strd, \s_strd, #1
         lsl \d_strd, \d_strd, #1
@@ -2294,20 +2331,38 @@ L(\type\()_8tap_filter_2):
         // any bias; we could just as well keep them as .4s, but narrowing
         // them to .4h gives a significant speedup on out of order cores
         // (at the cost of a smaller slowdown on in-order cores such as A53).
+.ifc \taps, 6tap
+        xtn v18.4h, v16.4s
+.else
         xtn v16.4h, v16.4s
 
-        bl L(\type\()_8tap_filter_4)
+        bl L(\type\()_\taps\()_filter_4)
         mov v17.8b, v24.8b
         mov v18.8b, v25.8b
-        bl L(\type\()_8tap_filter_4)
+.endif
+        bl L(\type\()_\taps\()_filter_4)
         mov v19.8b, v24.8b
         mov v20.8b, v25.8b
-        bl L(\type\()_8tap_filter_4)
+        bl L(\type\()_\taps\()_filter_4)
         mov v21.8b, v24.8b
         mov v22.8b, v25.8b
 
 48:
-        bl L(\type\()_8tap_filter_4)
+        bl L(\type\()_\taps\()_filter_4)
+.ifc \taps, 6tap
+        smull v3.4s, v18.4h, v1.h[1]
+        smlal v3.4s, v19.4h, v1.h[2]
+        smlal v3.4s, v20.4h, v1.h[3]
+        smlal v3.4s, v21.4h, v1.h[4]
+        smlal v3.4s, v22.4h, v1.h[5]
+        smlal v3.4s, v24.4h, v1.h[6]
+        smull v4.4s, v19.4h, v1.h[1]
+        smlal v4.4s, v20.4h, v1.h[2]
+        smlal v4.4s, v21.4h, v1.h[3]
+        smlal v4.4s, v22.4h, v1.h[4]
+        smlal v4.4s, v24.4h, v1.h[5]
+        smlal v4.4s, v25.4h, v1.h[6]
+.else // 8tap
         smull v3.4s, v16.4h, v1.h[0]
         smlal v3.4s, v17.4h, v1.h[1]
         smlal v3.4s, v18.4h, v1.h[2]
@@ -2324,6 +2379,7 @@ L(\type\()_8tap_filter_2):
         smlal v4.4s, v22.4h, v1.h[5]
         smlal v4.4s, v24.4h, v1.h[6]
         smlal v4.4s, v25.4h, v1.h[7]
+.endif
 .ifc \type, put
         srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
         srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
@@ -2339,8 +2395,10 @@ L(\type\()_8tap_filter_2):
         st1 {v3.d}[0], [\dst], \d_strd
         st1 {v3.d}[1], [\ds2], \d_strd
         b.le 0f
+.ifc \taps, 8tap
         mov v16.8b, v18.8b
         mov v17.8b, v19.8b
+.endif
         mov v18.8b, v20.8b
         mov v19.8b, v21.8b
         mov v20.8b, v22.8b
@@ -2350,7 +2408,7 @@ L(\type\()_8tap_filter_2):
 0:
         ret x15
 
-L(\type\()_8tap_filter_4):
+L(\type\()_\taps\()_filter_4):
         ld1 {v24.8h}, [\sr2], \s_strd
         ld1 {v25.8h}, [\src], \s_strd
         ext v26.16b, v24.16b, v24.16b, #2
@@ -2411,14 +2469,14 @@ L(\type\()_8tap_filter_4):
         // and conserves register space (no need to clobber v8-v15).
         uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
 
-        bl L(\type\()_8tap_filter_8)
+        bl L(\type\()_\taps\()_filter_8)
         mov v17.16b, v23.16b
         mov v18.16b, v24.16b
 
 8:
         smull v2.4s, v16.4h, v1.h[0]
         smull2 v3.4s, v16.8h, v1.h[0]
-        bl L(\type\()_8tap_filter_8)
+        bl L(\type\()_\taps\()_filter_8)
         smull v4.4s, v17.4h, v1.h[0]
         smull2 v5.4s, v17.8h, v1.h[0]
         smlal v2.4s, v17.4h, v1.h[1]
@@ -2480,7 +2538,9 @@ L(\type\()_8tap_filter_4):
         ld1 {v0.8b}, [\xmx]
         ld1 {v1.8b}, [\xmy]
         sub \src, \src, #6
+.ifc \taps, 8tap
         sub \src, \src, \s_strd
+.endif
         sub \src, \src, \s_strd, lsl #1
         sxtl v0.8h, v0.8b
         sxtl v1.8h, v1.8b
@@ -2494,6 +2554,16 @@ L(\type\()_8tap_filter_4):
         lsl \s_strd, \s_strd, #1
 
         ld1 {v27.8h, v28.8h}, [\src], \s_strd
+.ifc \taps, 6tap
+        ext v26.16b, v27.16b, v28.16b, #2
+        smull v24.4s, v26.4h, v0.h[1]
+        smull2 v25.4s, v26.8h, v0.h[1]
+.irpc i, 23456
+        ext v26.16b, v27.16b, v28.16b, #(2*\i)
+        smlal v24.4s, v26.4h, v0.h[\i]
+        smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
+.else // 8tap
         smull v24.4s, v27.4h, v0.h[0]
         smull2 v25.4s, v27.8h, v0.h[0]
 .irpc i, 1234567
@@ -2501,6 +2571,7 @@ L(\type\()_8tap_filter_4):
         smlal v24.4s, v26.4h, v0.h[\i]
         smlal2 v25.4s, v26.8h, v0.h[\i]
 .endr
+.endif
         srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
         srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
         // The intermediates from the horizontal pass fit in 16 bit without
@@ -2508,22 +2579,53 @@ L(\type\()_8tap_filter_4):
         // them to .4h gives a significant speedup on out of order cores
         // (at the cost of a smaller slowdown on in-order cores such as A53),
         // and conserves register space (no need to clobber v8-v15).
+.ifc \taps, 6tap
+        uzp1 v18.8h, v24.8h, v25.8h // Same as xtn, xtn2
+.else
         uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
 
-        bl L(\type\()_8tap_filter_8)
+        bl L(\type\()_\taps\()_filter_8)
         mov v17.16b, v23.16b
         mov v18.16b, v24.16b
-        bl L(\type\()_8tap_filter_8)
+.endif
+        bl L(\type\()_\taps\()_filter_8)
         mov v19.16b, v23.16b
         mov v20.16b, v24.16b
-        bl L(\type\()_8tap_filter_8)
+        bl L(\type\()_\taps\()_filter_8)
         mov v21.16b, v23.16b
         mov v22.16b, v24.16b
 
 88:
+.ifc \taps, 6tap
+        smull v2.4s, v18.4h, v1.h[1]
+        smull2 v3.4s, v18.8h, v1.h[1]
+        bl L(\type\()_\taps\()_filter_8)
+        smull v4.4s, v19.4h, v1.h[1]
+        smull2 v5.4s, v19.8h, v1.h[1]
+        smlal v2.4s, v19.4h, v1.h[2]
+        smlal2 v3.4s, v19.8h, v1.h[2]
+        smlal v4.4s, v20.4h, v1.h[2]
+        smlal2 v5.4s, v20.8h, v1.h[2]
+        smlal v2.4s, v20.4h, v1.h[3]
+        smlal2 v3.4s, v20.8h, v1.h[3]
+        smlal v4.4s, v21.4h, v1.h[3]
+        smlal2 v5.4s, v21.8h, v1.h[3]
+        smlal v2.4s, v21.4h, v1.h[4]
+        smlal2 v3.4s, v21.8h, v1.h[4]
+        smlal v4.4s, v22.4h, v1.h[4]
+        smlal2 v5.4s, v22.8h, v1.h[4]
+        smlal v2.4s, v22.4h, v1.h[5]
+        smlal2 v3.4s, v22.8h, v1.h[5]
+        smlal v4.4s, v23.4h, v1.h[5]
+        smlal2 v5.4s, v23.8h, v1.h[5]
+        smlal v2.4s, v23.4h, v1.h[6]
+        smlal2 v3.4s, v23.8h, v1.h[6]
+        smlal v4.4s, v24.4h, v1.h[6]
+        smlal2 v5.4s, v24.8h, v1.h[6]
+.else // 8tap
         smull v2.4s, v16.4h, v1.h[0]
         smull2 v3.4s, v16.8h, v1.h[0]
-        bl L(\type\()_8tap_filter_8)
+        bl L(\type\()_\taps\()_filter_8)
         smull v4.4s, v17.4h, v1.h[0]
         smull2 v5.4s, v17.8h, v1.h[0]
         smlal v2.4s, v17.4h, v1.h[1]
@@ -2554,6 +2656,7 @@ L(\type\()_8tap_filter_4):
         smlal2 v3.4s, v23.8h, v1.h[7]
         smlal v4.4s, v24.4h, v1.h[7]
         smlal2 v5.4s, v24.8h, v1.h[7]
+.endif
 .ifc \type, put
         srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
         srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
@@ -2577,8 +2680,10 @@ L(\type\()_8tap_filter_4):
         st1 {v2.8h}, [\dst], \d_strd
         st1 {v3.8h}, [\ds2], \d_strd
         b.le 9f
+.ifc \taps, 8tap
         mov v16.16b, v18.16b
         mov v17.16b, v19.16b
+.endif
         mov v18.16b, v20.16b
         mov v19.16b, v21.16b
         mov v20.16b, v22.16b
@@ -2596,13 +2701,32 @@ L(\type\()_8tap_filter_4):
         mov \h, \my
         add \src, \src, #16
         add \dst, \dst, #16
+.ifc \taps, 6tap
+        add \src, \src, \s_strd, lsl #1
+.endif
         b 168b
 0:
         ret x15
 
-L(\type\()_8tap_filter_8):
+L(\type\()_\taps\()_filter_8):
         ld1 {v4.8h, v5.8h}, [\sr2], \s_strd
         ld1 {v6.8h, v7.8h}, [\src], \s_strd
+.ifc \taps, 6tap
+        ext v23.16b, v4.16b, v5.16b, #2
+        ext v24.16b, v6.16b, v7.16b, #2
+        smull v25.4s, v23.4h, v0.h[1]
+        smull2 v26.4s, v23.8h, v0.h[1]
+        smull v27.4s, v24.4h, v0.h[1]
+        smull2 v28.4s, v24.8h, v0.h[1]
+.irpc i, 23456
+        ext v23.16b, v4.16b, v5.16b, #(2*\i)
+        ext v24.16b, v6.16b, v7.16b, #(2*\i)
+        smlal v25.4s, v23.4h, v0.h[\i]
+        smlal2 v26.4s, v23.8h, v0.h[\i]
+        smlal v27.4s, v24.4h, v0.h[\i]
+        smlal2 v28.4s, v24.8h, v0.h[\i]
+.endr
+.else // 8tap
         smull v25.4s, v4.4h, v0.h[0]
         smull2 v26.4s, v4.8h, v0.h[0]
         smull v27.4s, v6.4h, v0.h[0]
@@ -2615,6 +2739,7 @@ L(\type\()_8tap_filter_8):
         smlal v27.4s, v24.4h, v0.h[\i]
         smlal2 v28.4s, v24.8h, v0.h[\i]
 .endr
+.endif
         srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
         srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
         srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
@@ -2623,18 +2748,20 @@ L(\type\()_8tap_filter_8):
         uzp1 v24.8h, v27.8h, v28.8h // Ditto
         ret
 
-L(\type\()_8tap_hv_tbl):
-        .hword L(\type\()_8tap_hv_tbl) - 1280b
-        .hword L(\type\()_8tap_hv_tbl) - 640b
-        .hword L(\type\()_8tap_hv_tbl) - 320b
-        .hword L(\type\()_8tap_hv_tbl) - 160b
-        .hword L(\type\()_8tap_hv_tbl) - 80b
-        .hword L(\type\()_8tap_hv_tbl) - 40b
-        .hword L(\type\()_8tap_hv_tbl) - 20b
+L(\type\()_\taps\()_hv_tbl):
+        .hword L(\type\()_\taps\()_hv_tbl) - 1280b
+        .hword L(\type\()_\taps\()_hv_tbl) - 640b
+        .hword L(\type\()_\taps\()_hv_tbl) - 320b
+        .hword L(\type\()_\taps\()_hv_tbl) - 160b
+        .hword L(\type\()_\taps\()_hv_tbl) - 80b
+        .hword L(\type\()_\taps\()_hv_tbl) - 40b
+        .hword L(\type\()_\taps\()_hv_tbl) - 20b
         .hword 0
 endfunc
+.endm
 
+.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
 function \type\()_bilin_16bpc_neon, export=1
 .ifc \bdmax, w8
         ldr w8, [sp]
 .endif
@@ -3236,8 +3363,34 @@ L(\type\()_bilin_hv_tbl):
 endfunc
 .endm
 
-filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
-filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn put, sharp, SHARP, SHARP, 8tap
+make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
+
+make_8tap_fn put, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
+filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+
+make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn prep, sharp, SHARP, SHARP, 8tap
+make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap
+
+make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap
+filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+
 
 .macro load_filter_row dst, src, inc
         asr w13, \src, #10