From d8bbc7858622b6d9c278469aab701ca0b609cddf Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 15 May 2024 05:35:49 +0200 Subject: Merging upstream version 126.0. Signed-off-by: Daniel Baumann --- third_party/dav1d/src/arm/32/itx.S | 79 ++++--- third_party/dav1d/src/arm/32/itx16.S | 19 +- third_party/dav1d/src/arm/32/msac.S | 167 +++++++------- third_party/dav1d/src/arm/64/itx.S | 99 ++++----- third_party/dav1d/src/arm/64/itx16.S | 21 +- third_party/dav1d/src/arm/64/mc.S | 411 ++++++++++++++++++++++++++--------- third_party/dav1d/src/arm/64/mc16.S | 373 +++++++++++++++++++++---------- third_party/dav1d/src/arm/64/msac.S | 167 +++++++------- third_party/dav1d/src/arm/64/util.S | 49 +++++ third_party/dav1d/src/arm/asm.S | 44 ++++ third_party/dav1d/src/arm/cpu.c | 137 +++++++++--- third_party/dav1d/src/arm/cpu.h | 4 + third_party/dav1d/src/arm/itx.h | 4 +- third_party/dav1d/src/arm/msac.h | 2 +- 14 files changed, 1086 insertions(+), 490 deletions(-) (limited to 'third_party/dav1d/src/arm') diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S index ceea025e45..9ba1df7a68 100644 --- a/third_party/dav1d/src/arm/32/itx.S +++ b/third_party/dav1d/src/arm/32/itx.S @@ -965,6 +965,8 @@ function inv_txfm_\variant\()add_8x8_neon .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out + + b L(itx_8x8_epilog) .else blx r4 @@ -976,8 +978,8 @@ function inv_txfm_\variant\()add_8x8_neon vrshr.s16 q13, q13, #1 vrshr.s16 q14, q14, #1 vrshr.s16 q15, q15, #1 -.endif +L(itx_8x8_epilog): transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 blx r5 @@ -985,11 +987,12 @@ function inv_txfm_\variant\()add_8x8_neon load_add_store_8x8 r0, r7 vpop {q4-q7} pop {r4-r5,r7,pc} +.endif endfunc .endm -def_fn_8x8_base def_fn_8x8_base identity_ +def_fn_8x8_base .macro def_fn_8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 @@ -1444,14 +1447,16 @@ function inv_txfm_horz\suffix\()_16x4_neon .else identity_4x16_shift1 d0[0] .endif + b L(horz_16x4_epilog) .else blx r4 -.endif -.if \shift > 0 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vrshr.s16 \i, \i, #\shift .endr -.endif +.if \shift == 1 + b L(horz_16x4_epilog) +.else +L(horz_16x4_epilog): transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 @@ -1462,13 +1467,15 @@ function inv_txfm_horz\suffix\()_16x4_neon .endr pop {pc} +.endif +.endif endfunc .endm -def_horz_16 scale=0, identity=0, shift=2 -def_horz_16 scale=1, identity=0, shift=1, suffix=_scale -def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity +def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity +def_horz_16 scale=1, identity=0, shift=1, suffix=_scale +def_horz_16 scale=0, identity=0, shift=2 function inv_txfm_add_vert_4x16_neon push {lr} @@ -1597,6 +1604,8 @@ function inv_txfm_\variant\()add_16x4_neon .endr identity_4x16_shift1 d0[0] + + b L(itx_16x4_epilog) .else vmov.i16 q2, #0 vmov.i16 q3, #0 @@ -1615,30 +1624,25 @@ function inv_txfm_\variant\()add_16x4_neon vswp d19, d22 vswp d18, d20 vswp d19, d21 -.irp i, q8, q9, q10, q11 + vswp d25, d28 + vswp d27, d30 + vswp d26, d28 + vswp d27, d29 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 vrshr.s16 \i, \i, #1 .endr -.endif + +L(itx_16x4_epilog): transpose_4x8h q8, q9, q10, q11 blx r5 mov r6, r0 load_add_store_8x4 r6, r7 -.ifc \variant, identity_ vmov q8, q12 vmov q9, q13 vmov q10, 
q14 vmov q11, q15 -.else - vswp d25, d28 - vswp d27, d30 - vswp d26, d28 - vswp d27, d29 - vrshr.s16 q8, q12, #1 - vrshr.s16 q9, q13, #1 - vrshr.s16 q10, q14, #1 - vrshr.s16 q11, q15, #1 -.endif + transpose_4x8h q8, q9, q10, q11 blx r5 add r6, r0, #8 @@ -1646,6 +1650,7 @@ function inv_txfm_\variant\()add_16x4_neon vpop {q4-q7} pop {r4-r11,pc} +.endif endfunc function inv_txfm_\variant\()add_4x16_neon @@ -1696,12 +1701,14 @@ function inv_txfm_\variant\()add_4x16_neon movw r12, #(5793-4096)*8 vdup.16 d0, r12 identity_8x4_shift1 q8, q9, q10, q11, d0[0] + + b L(itx_4x16_epilog) .else blx r4 .irp i, q8, q9, q10, q11 vrshr.s16 \i, \i, #1 .endr -.endif +L(itx_4x16_epilog): transpose_4x8h q8, q9, q10, q11 vswp d19, d21 vswp d18, d20 @@ -1714,11 +1721,12 @@ function inv_txfm_\variant\()add_4x16_neon vpop {q4-q7} pop {r4-r11,pc} +.endif endfunc .endm -def_fn_416_base def_fn_416_base identity_ +def_fn_416_base .macro def_fn_416 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 @@ -1728,11 +1736,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} .if \w == 4 +.ifnc \txfm1, identity movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon +.endif movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon mov r10, #\eob_half .else +.ifnc \txfm1, identity movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon +.endif movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon .endif .ifc \txfm1, identity @@ -1765,8 +1777,7 @@ def_fn_416 \w, \h, identity, flipadst, 32 def_fns_416 4, 16 def_fns_416 16, 4 -.macro def_fn_816_base variant -function inv_txfm_\variant\()add_16x8_neon +function inv_txfm_add_16x8_neon sub_sp_align 256 .irp i, 0, 4 @@ -1805,6 +1816,7 @@ function inv_txfm_\variant\()add_16x8_neon pop {r4-r11,pc} endfunc +.macro def_fn_816_base variant function inv_txfm_\variant\()add_8x16_neon sub_sp_align 256 @@ -1849,6 +1861,10 @@ function inv_txfm_\variant\()add_8x16_neon .endr 2: +.ifc \variant, identity_ + b L(itx_8x16_epilog) +.else +L(itx_8x16_epilog): .irp i, 0, 4 add r6, r0, #(\i) add r7, sp, #(\i*2) @@ -1859,11 +1875,18 @@ function inv_txfm_\variant\()add_8x16_neon add_sp_align 256 vpop {q4-q7} pop {r4-r11,pc} +.endif endfunc .endm -def_fn_816_base def_fn_816_base identity_ +def_fn_816_base + +/* Define symbols used in .if statement */ +.equ dct, 1 +.equ identity, 2 +.equ adst, 3 +.equ flipadst, 4 .macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 @@ -1873,7 +1896,9 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} .if \w == 8 +.ifnc \txfm1, identity movrel_local r4, inv_\txfm1\()_8h_x8_neon +.endif movrel_local r5, inv_\txfm2\()_4h_x16_neon .else .ifc \txfm1, identity @@ -1889,7 +1914,7 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .else mov r10, #\eob_4x4 .endif -.ifc \txfm1, identity +.if \w == 8 && \txfm1 == identity b inv_txfm_identity_add_\w\()x\h\()_neon .else b inv_txfm_add_\w\()x\h\()_neon diff --git a/third_party/dav1d/src/arm/32/itx16.S b/third_party/dav1d/src/arm/32/itx16.S index aa6c272e71..7691272517 100644 --- a/third_party/dav1d/src/arm/32/itx16.S +++ b/third_party/dav1d/src/arm/32/itx16.S @@ -547,11 +547,11 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 vmov.i16 q15, #0 vld1.32 {q8, q9}, [r2, :128] vst1.32 {q14, q15}, [r2, :128]! 
- vshr.s16 q8, q8, #2 + vshr.s32 q8, q8, #2 vld1.32 {q10, q11}, [r2, :128] - vshr.s16 q9, q9, #2 - vshr.s16 q10, q10, #2 - vshr.s16 q11, q11, #2 + vshr.s32 q9, q9, #2 + vshr.s32 q10, q10, #2 + vshr.s32 q11, q11, #2 iwht4 @@ -598,7 +598,9 @@ function inv_txfm_add_4x4_neon vld1.16 {d3}, [r0, :64], r1 L(itx_4x4_end): - vmvn.i16 q15, #0xfc00 // 0x3ff + // read bitdepth_max from the callers stack + ldr r4, [sp, #44] + vdup.i16 q15, r4 sub r0, r0, r1, lsl #2 vqadd.s16 q8, q8, q0 vqadd.s16 q9, q9, q1 @@ -1487,6 +1489,10 @@ function inv_txfm_horz\suffix\()_16x2_neon vqrshrn.s32 d21, q13, #\shift vqrshrn.s32 d22, q14, #\shift vqrshrn.s32 d23, q15, #\shift +.if \scale + b L(horz_16x2_epilog) +.else +L(horz_16x2_epilog): vuzp.16 q8, q9 vuzp.16 q10, q11 @@ -1495,11 +1501,12 @@ function inv_txfm_horz\suffix\()_16x2_neon .endr pop {pc} +.endif endfunc .endm -def_horz_16 scale=0, shift=2 def_horz_16 scale=1, shift=1, suffix=_scale +def_horz_16 scale=0, shift=2 function inv_txfm_add_vert_4x16_neon push {lr} diff --git a/third_party/dav1d/src/arm/32/msac.S b/third_party/dav1d/src/arm/32/msac.S index b06e109dda..b16957fb7e 100644 --- a/third_party/dav1d/src/arm/32/msac.S +++ b/third_party/dav1d/src/arm/32/msac.S @@ -279,60 +279,67 @@ L(renorm): sub r4, r4, r3 // rng = u - v clz r5, r4 // clz(rng) eor r5, r5, #16 // d = clz(rng) ^ 16 - mvn r7, r7 // ~dif - add r7, r7, r3, lsl #16 // ~dif + (v << 16) + sub r7, r7, r3, lsl #16 // dif - (v << 16) L(renorm2): lsl r4, r4, r5 // rng << d subs r6, r6, r5 // cnt -= d - lsl r7, r7, r5 // (~dif + (v << 16)) << d + lsl r7, r7, r5 // (dif - (v << 16)) << d str r4, [r0, #RNG] - mvn r7, r7 // ~dif - bhs 9f + bhs 4f // refill ldr r3, [r0, #BUF_POS] // BUF_POS ldr r4, [r0, #BUF_END] // BUF_END add r5, r3, #4 - cmp r5, r4 - bgt 2f - - ldr r3, [r3] // next_bits - add r8, r6, #23 // shift_bits = cnt + 23 - add r6, r6, #16 // cnt += 16 - rev r3, r3 // next_bits = bswap(next_bits) - sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 - and r8, r8, #24 // shift_bits &= 24 - lsr r3, r3, r8 // next_bits >>= shift_bits - sub r8, r8, r6 // shift_bits -= 16 + cnt - str r5, [r0, #BUF_POS] - lsl r3, r3, r8 // next_bits <<= shift_bits - rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits - eor r7, r7, r3 // dif ^= next_bits - b 9f - -2: // refill_eob - rsb r5, r6, #8 // c = 8 - cnt -3: - cmp r3, r4 - bge 4f - ldrb r8, [r3], #1 - lsl r8, r8, r5 - eor r7, r7, r8 - subs r5, r5, #8 - bge 3b - -4: // refill_eob_end + subs r5, r5, r4 + bhi 6f + + ldr r8, [r3] // next_bits + rsb r5, r6, #16 + add r4, r6, #16 // shift_bits = cnt + 16 + mvn r8, r8 + lsr r5, r5, #3 // num_bytes_read + rev r8, r8 // next_bits = bswap(next_bits) + lsr r8, r8, r4 // next_bits >>= shift_bits + +2: // refill_end + add r3, r3, r5 + add r6, r6, r5, lsl #3 // cnt += num_bits_read str r3, [r0, #BUF_POS] - rsb r6, r5, #8 // cnt = 8 - c -9: +3: // refill_end2 + orr r7, r7, r8 // dif |= next_bits + +4: // end str r6, [r0, #CNT] str r7, [r0, #DIF] - mov r0, lr add sp, sp, #48 - pop {r4-r10,pc} + +5: // pad_with_ones + add r8, r6, #-240 + lsr r8, r8, r8 + b 3b + +6: // refill_eob + cmp r3, r4 + bhs 5b + + ldr r8, [r4, #-4] + lsl r5, r5, #3 + lsr r8, r8, r5 + add r5, r6, #16 + mvn r8, r8 + sub r4, r4, r3 // num_bytes_left + rev r8, r8 + lsr r8, r8, r5 + rsb r5, r6, #16 + lsr r5, r5, #3 + cmp r5, r4 + it hs + movhs r5, r4 + b 2b endfunc function msac_decode_symbol_adapt8_neon, export=1 @@ -414,53 +421,38 @@ function msac_decode_hi_tok_neon, export=1 sub r4, r4, r3 // rng = u - v clz r5, r4 // clz(rng) eor r5, r5, #16 // d = clz(rng) 
^ 16 - mvn r7, r7 // ~dif - add r7, r7, r3, lsl #16 // ~dif + (v << 16) + sub r7, r7, r3, lsl #16 // dif - (v << 16) lsl r4, r4, r5 // rng << d subs r6, r6, r5 // cnt -= d - lsl r7, r7, r5 // (~dif + (v << 16)) << d + lsl r7, r7, r5 // (dif - (v << 16)) << d str r4, [r0, #RNG] vdup.16 d1, r4 - mvn r7, r7 // ~dif - bhs 9f + bhs 5f // refill ldr r3, [r0, #BUF_POS] // BUF_POS ldr r4, [r0, #BUF_END] // BUF_END add r5, r3, #4 - cmp r5, r4 - bgt 2f - - ldr r3, [r3] // next_bits - add r8, r6, #23 // shift_bits = cnt + 23 - add r6, r6, #16 // cnt += 16 - rev r3, r3 // next_bits = bswap(next_bits) - sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 - and r8, r8, #24 // shift_bits &= 24 - lsr r3, r3, r8 // next_bits >>= shift_bits - sub r8, r8, r6 // shift_bits -= 16 + cnt - str r5, [r0, #BUF_POS] - lsl r3, r3, r8 // next_bits <<= shift_bits - rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits - eor r7, r7, r3 // dif ^= next_bits - b 9f - -2: // refill_eob - rsb r5, r6, #8 // c = 40 - cnt -3: - cmp r3, r4 - bge 4f - ldrb r8, [r3], #1 - lsl r8, r8, r5 - eor r7, r7, r8 - subs r5, r5, #8 - bge 3b - -4: // refill_eob_end + subs r5, r5, r4 + bhi 7f + + ldr r8, [r3] // next_bits + rsb r5, r6, #16 + add r4, r6, #16 // shift_bits = cnt + 16 + mvn r8, r8 + lsr r5, r5, #3 // num_bytes_read + rev r8, r8 // next_bits = bswap(next_bits) + lsr r8, r8, r4 // next_bits >>= shift_bits + +3: // refill_end + add r3, r3, r5 + add r6, r6, r5, lsl #3 // cnt += num_bits_read str r3, [r0, #BUF_POS] - rsb r6, r5, #8 // cnt = 40 - c -9: +4: // refill_end2 + orr r7, r7, r8 // dif |= next_bits + +5: // end lsl lr, lr, #1 sub lr, lr, #5 lsr r12, r7, #16 @@ -473,6 +465,30 @@ function msac_decode_hi_tok_neon, export=1 str r7, [r0, #DIF] lsr r0, r2, #1 pop {r4-r10,pc} + +6: // pad_with_ones + add r8, r6, #-240 + lsr r8, r8, r8 + b 4b + +7: // refill_eob + cmp r3, r4 + bhs 6b + + ldr r8, [r4, #-4] + lsl r5, r5, #3 + lsr r8, r8, r5 + add r5, r6, #16 + mvn r8, r8 + sub r4, r4, r3 // num_bytes_left + rev r8, r8 + lsr r8, r8, r5 + rsb r5, r6, #16 + lsr r5, r5, #3 + cmp r5, r4 + it hs + movhs r5, r4 + b 3b endfunc function msac_decode_bool_equi_neon, export=1 @@ -493,7 +509,6 @@ function msac_decode_bool_equi_neon, export=1 movhs r7, r8 // if (ret) dif = dif - vw; clz r5, r4 // clz(rng) - mvn r7, r7 // ~dif eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 b L(renorm2) @@ -519,7 +534,6 @@ function msac_decode_bool_neon, export=1 movhs r7, r8 // if (ret) dif = dif - vw; clz r5, r4 // clz(rng) - mvn r7, r7 // ~dif eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 b L(renorm2) @@ -549,7 +563,6 @@ function msac_decode_bool_adapt_neon, export=1 cmp r10, #0 clz r5, r4 // clz(rng) - mvn r7, r7 // ~dif eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S index 53490cd677..7063cbde1d 100644 --- a/third_party/dav1d/src/arm/64/itx.S +++ b/third_party/dav1d/src/arm/64/itx.S @@ -879,6 +879,8 @@ function inv_txfm_\variant\()add_8x8_neon .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out + + b L(itx_8x8_epilog) .else blr x4 @@ -890,19 +892,20 @@ function inv_txfm_\variant\()add_8x8_neon srshr v21.8h, v21.8h, #1 srshr v22.8h, v22.8h, #1 srshr v23.8h, v23.8h, #1 -.endif +L(itx_8x8_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 blr x5 load_add_store_8x8 x0, x7 ret x15 +.endif endfunc .endm -def_fn_8x8_base def_fn_8x8_base identity_ +def_fn_8x8_base .macro def_fn_8x8 txfm1, txfm2 function 
inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 @@ -1390,14 +1393,16 @@ function inv_txfm_horz\suffix\()_16x8_neon .endif .if \identity identity_8x16_shift2 v0.h[0] + b L(horz_16x8_epilog) .else blr x4 -.endif -.if \shift > 0 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h srshr \i, \i, #\shift .endr -.endif +.if \shift == 1 + b L(horz_16x8_epilog) +.else +L(horz_16x8_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 @@ -1406,12 +1411,14 @@ function inv_txfm_horz\suffix\()_16x8_neon .endr ret x14 +.endif +.endif endfunc .endm -def_horz_16 scale=0, identity=0, shift=2 def_horz_16 scale=1, identity=0, shift=1, suffix=_scale def_horz_16 scale=0, identity=1, shift=0, suffix=_identity +def_horz_16 scale=0, identity=0, shift=2 function inv_txfm_add_vert_8x16_neon mov x14, x30 @@ -1512,6 +1519,8 @@ function inv_txfm_\variant\()add_16x4_neon .endr identity_8x16_shift1 v0.h[0] + + b L(itx_16x4_epilog) .else .irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h ld1 {\i}, [x2] @@ -1527,33 +1536,29 @@ function inv_txfm_\variant\()add_16x4_neon .irp i, v16.8h, v17.8h, v18.8h, v19.8h srshr \i, \i, #1 .endr -.endif - transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 - blr x5 - mov x6, x0 - load_add_store_8x4 x6, x7 -.ifc \variant, identity_ - mov v16.16b, v20.16b - mov v17.16b, v21.16b - mov v18.16b, v22.16b - mov v19.16b, v23.16b -.else ins v24.d[1], v28.d[0] ins v25.d[1], v29.d[0] ins v26.d[1], v30.d[0] ins v27.d[1], v31.d[0] - srshr v16.8h, v24.8h, #1 - srshr v17.8h, v25.8h, #1 - srshr v18.8h, v26.8h, #1 - srshr v19.8h, v27.8h, #1 -.endif + srshr v20.8h, v24.8h, #1 + srshr v21.8h, v25.8h, #1 + srshr v22.8h, v26.8h, #1 + srshr v23.8h, v27.8h, #1 + +L(itx_16x4_epilog): transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 blr x5 + mov x6, x0 + load_add_store_8x4 x6, x7 + + transpose_4x8h_mov v20, v21, v22, v23, v2, v3, v4, v5, v16, v17, v18, v19 + blr x5 add x6, x0, #8 load_add_store_8x4 x6, x7 ret x15 +.endif endfunc function inv_txfm_\variant\()add_4x16_neon @@ -1605,12 +1610,14 @@ function inv_txfm_\variant\()add_4x16_neon mov w16, #(5793-4096)*8 dup v0.4h, w16 identity_8x4_shift1 v16, v17, v18, v19, v0.h[0] + + b L(itx_4x16_epilog) .else blr x4 .irp i, v16.8h, v17.8h, v18.8h, v19.8h srshr \i, \i, #1 .endr -.endif +L(itx_4x16_epilog): transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 ins v20.d[0], v16.d[1] ins v21.d[0], v17.d[1] @@ -1622,11 +1629,12 @@ function inv_txfm_\variant\()add_4x16_neon load_add_store_4x16 x0, x6 ret x15 +.endif endfunc .endm -def_fn_416_base def_fn_416_base identity_ +def_fn_416_base .macro def_fn_416 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 @@ -1634,11 +1642,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 idct_dc \w, \h, 1 .endif .if \w == 4 +.ifnc \txfm1, identity adr x4, inv_\txfm1\()_8h_x\w\()_neon +.endif adr x5, inv_\txfm2\()_4h_x\h\()_neon mov w13, #\eob_half .else +.ifnc \txfm1, identity adr x4, inv_\txfm1\()_4h_x\w\()_neon +.endif adr x5, inv_\txfm2\()_8h_x\h\()_neon .endif .ifc \txfm1, identity @@ -1690,13 +1702,16 @@ function inv_txfm_\variant\()add_16x8_neon mov w16, #2*(5793-4096)*8 dup v0.4h, w16 identity_8x16_shift1 v0.h[0] + + b L(itx_16x8_epilog) .else blr x4 -.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, 
v22.8h, v23.8h +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h srshr \i, \i, #1 .endr -.endif + +L(itx_16x8_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 blr x5 @@ -1704,27 +1719,7 @@ function inv_txfm_\variant\()add_16x8_neon mov x6, x0 load_add_store_8x8 x6, x7 -.ifc \variant, identity_ - mov v16.16b, v24.16b - mov v17.16b, v25.16b - mov v18.16b, v26.16b - mov v19.16b, v27.16b - mov v20.16b, v28.16b - mov v21.16b, v29.16b - mov v22.16b, v30.16b - mov v23.16b, v31.16b -.else - srshr v16.8h, v24.8h, #1 - srshr v17.8h, v25.8h, #1 - srshr v18.8h, v26.8h, #1 - srshr v19.8h, v27.8h, #1 - srshr v20.8h, v28.8h, #1 - srshr v21.8h, v29.8h, #1 - srshr v22.8h, v30.8h, #1 - srshr v23.8h, v31.8h, #1 -.endif - - transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8h_mov v24, v25, v26, v27, v28, v29, v30, v31, v2, v3, v16, v17, v18, v19, v20, v21, v22, v23 blr x5 @@ -1732,6 +1727,7 @@ function inv_txfm_\variant\()add_16x8_neon load_add_store_8x8 x0, x7 ret x15 +.endif endfunc function inv_txfm_\variant\()add_8x16_neon @@ -1790,14 +1786,16 @@ function inv_txfm_\variant\()add_8x16_neon scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out + + b L(itx_8x16_epilog) .else blr x4 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h srshr \i, \i, #1 .endr -.endif +L(itx_8x16_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 blr x5 @@ -1805,18 +1803,21 @@ function inv_txfm_\variant\()add_8x16_neon load_add_store_8x16 x0, x6 ret x15 +.endif endfunc .endm -def_fn_816_base def_fn_816_base identity_ +def_fn_816_base .macro def_fn_816 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif +.ifnc \txfm1, identity adr x4, inv_\txfm1\()_8h_x\w\()_neon +.endif adr x5, inv_\txfm2\()_8h_x\h\()_neon .if \w == 8 mov x13, #\eob_half diff --git a/third_party/dav1d/src/arm/64/itx16.S b/third_party/dav1d/src/arm/64/itx16.S index eee3a9636d..31ee9be1b4 100644 --- a/third_party/dav1d/src/arm/64/itx16.S +++ b/third_party/dav1d/src/arm/64/itx16.S @@ -514,13 +514,17 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 b L(itx_4x4_end) endfunc +// HBD inv_txfm_add_4x4_neon deviates from the common pattern with registers +// x0-x4 external parameters +// x5 function pointer to first transform +// x6 function pointer to second transform function inv_txfm_add_4x4_neon movi v30.4s, #0 movi v31.4s, #0 ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] st1 {v30.4s, v31.4s}, [x2], #32 - blr x4 + blr x5 st1 {v30.4s, v31.4s}, [x2], #32 sqxtn v16.4h, v16.4s @@ -529,7 +533,7 @@ function inv_txfm_add_4x4_neon sqxtn v19.4h, v19.4s transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 - blr x5 + blr x6 ld1 {v0.d}[0], [x0], x1 ld1 {v0.d}[1], [x0], x1 @@ -541,7 +545,7 @@ function inv_txfm_add_4x4_neon srshr v18.8h, v18.8h, #4 L(itx_4x4_end): - mvni v31.8h, #0xfc, lsl #8 // 0x3ff + dup v31.8h, w4 sub x0, x0, x1, lsl #2 usqadd v0.8h, v16.8h usqadd v1.8h, v18.8h @@ -579,8 +583,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 b L(itx_4x4_end) 1: .endif - adr x4, inv_\txfm1\()_4s_x4_neon - movrel x5, X(inv_\txfm2\()_4h_x4_neon) + adr x5, inv_\txfm1\()_4s_x4_neon + movrel x6, X(inv_\txfm2\()_4h_x4_neon) b inv_txfm_add_4x4_neon endfunc .endm @@ -1381,6 +1385,10 @@ function 
inv_txfm_horz\suffix\()_16x4_neon sqrshrn2 v21.8h, v29.4s, #\shift sqrshrn2 v22.8h, v30.4s, #\shift sqrshrn2 v23.8h, v31.4s, #\shift +.if \scale + b L(horz_16x4_epilog) +.else +L(horz_16x4_epilog): transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7 @@ -1389,11 +1397,12 @@ function inv_txfm_horz\suffix\()_16x4_neon .endr ret x14 +.endif endfunc .endm -def_horz_16 scale=0, shift=2 def_horz_16 scale=1, shift=1, suffix=_scale +def_horz_16 scale=0, shift=2 function inv_txfm_add_vert_8x16_neon mov x14, x30 diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S index 9f7b4e7a89..3df0393c3a 100644 --- a/third_party/dav1d/src/arm/64/mc.S +++ b/third_party/dav1d/src/arm/64/mc.S @@ -1154,7 +1154,7 @@ endfunc uxtl \r6\().8h, \r6\().8b .endif .endm -.macro mul_mla_4 d, s0, s1, s2, s3, wd +.macro mul_mla_4tap d, s0, s1, s2, s3, wd mul \d\wd, \s0\wd, v0.h[0] mla \d\wd, \s1\wd, v0.h[1] mla \d\wd, \s2\wd, v0.h[2] @@ -1163,7 +1163,51 @@ endfunc // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. -.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 +.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 + mul \d0\().4h, \s1\().4h, v0.h[1] + mla \d0\().4h, \s2\().4h, v0.h[2] + mla \d0\().4h, \s3\().4h, v0.h[3] + mla \d0\().4h, \s4\().4h, v0.h[4] + mla \d0\().4h, \s5\().4h, v0.h[5] + mla \d0\().4h, \s6\().4h, v0.h[6] +.endm +.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 + mul \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] +.endm +.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + mul \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mul \d1\().8h, \s2\().8h, v0.h[1] + mla \d1\().8h, \s3\().8h, v0.h[2] + mla \d1\().8h, \s4\().8h, v0.h[3] + mla \d1\().8h, \s5\().8h, v0.h[4] + mla \d1\().8h, \s6\().8h, v0.h[5] + mla \d1\().8h, \s7\().8h, v0.h[6] +.endm +.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 + mul \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mul \d1\().8h, \s3\().8h, v0.h[1] + mla \d1\().8h, \s4\().8h, v0.h[2] + mla \d1\().8h, \s5\().8h, v0.h[3] + mla \d1\().8h, \s6\().8h, v0.h[4] + mla \d1\().8h, \s7\().8h, v0.h[5] + mla \d1\().8h, \s8\().8h, v0.h[6] +.endm +.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().4h, \s0\().4h, v0.h[0] mla \d0\().4h, \s1\().4h, v0.h[1] mla \d0\().4h, \s2\().4h, v0.h[2] @@ -1173,7 +1217,7 @@ endfunc mla \d0\().4h, \s6\().4h, v0.h[6] mla \d0\().4h, \s7\().4h, v0.h[7] .endm -.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 +.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] @@ -1183,7 +1227,7 @@ endfunc mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s7\().8h, v0.h[7] .endm -.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 +.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 mul \d0\().8h, \s0\().8h, v0.h[0] mla 
\d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] @@ -1201,7 +1245,7 @@ endfunc mla \d1\().8h, \s7\().8h, v0.h[6] mla \d1\().8h, \s8\().8h, v0.h[7] .endm -.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 +.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] @@ -1315,11 +1359,11 @@ endfunc .endif .endm -.macro make_8tap_fn op, type, type_h, type_v +.macro make_8tap_fn op, type, type_h, type_v, taps function \op\()_8tap_\type\()_8bpc_neon, export=1 mov x8, \type_h mov x9, \type_v - b \op\()_8tap_neon + b \op\()_\taps\()_neon endfunc .endm @@ -1328,18 +1372,8 @@ endfunc #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) -.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv -make_8tap_fn \type, regular, REGULAR, REGULAR -make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH -make_8tap_fn \type, regular_sharp, REGULAR, SHARP -make_8tap_fn \type, smooth, SMOOTH, SMOOTH -make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR -make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP -make_8tap_fn \type, sharp, SHARP, SHARP -make_8tap_fn \type, sharp_regular, SHARP, REGULAR -make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH - -function \type\()_8tap_neon +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps +function \type\()_\taps\()_neon mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, w10 mul \my, \my, w10 @@ -1354,12 +1388,12 @@ function \type\()_8tap_neon tst \mx, #(0x7f << 14) sub w8, w8, #24 movrel x10, X(mc_subpel_filters), -8 - b.ne L(\type\()_8tap_h) + b.ne L(\type\()_\taps\()_h) tst \my, #(0x7f << 14) - b.ne L(\type\()_8tap_v) + b.ne L(\type\()_\taps\()_v) b \type\()_neon -L(\type\()_8tap_h): +L(\type\()_\taps\()_h): cmp \w, #4 ubfx w9, \mx, #7, #7 and \mx, \mx, #0x7f @@ -1368,9 +1402,9 @@ L(\type\()_8tap_h): 4: tst \my, #(0x7f << 14) add \xmx, x10, \mx, uxtw #3 - b.ne L(\type\()_8tap_hv) + b.ne L(\type\()_\taps\()_hv) - adr x9, L(\type\()_8tap_h_tbl) + adr x9, L(\type\()_\taps\()_h_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 @@ -1471,6 +1505,18 @@ L(\type\()_8tap_h): uxtl v20.8h, v20.8b uxtl v21.8h, v21.8b +.ifc \taps, 6tap + ext v19.16b, v16.16b, v17.16b, #2 + ext v23.16b, v20.16b, v21.16b, #2 + mul v18.8h, v19.8h, v0.h[1] + mul v22.8h, v23.8h, v0.h[1] +.irpc i, 23456 + ext v19.16b, v16.16b, v17.16b, #(2*\i) + ext v23.16b, v20.16b, v21.16b, #(2*\i) + mla v18.8h, v19.8h, v0.h[\i] + mla v22.8h, v23.8h, v0.h[\i] +.endr +.else // 8tap mul v18.8h, v16.8h, v0.h[0] mul v22.8h, v20.8h, v0.h[0] .irpc i, 1234567 @@ -1479,6 +1525,7 @@ L(\type\()_8tap_h): mla v18.8h, v19.8h, v0.h[\i] mla v22.8h, v23.8h, v0.h[\i] .endr +.endif subs \h, \h, #2 srshr v18.8h, v18.8h, #2 srshr v22.8h, v22.8h, #2 @@ -1523,6 +1570,26 @@ L(\type\()_8tap_h): uxtl v22.8h, v22.8b 16: +.ifc \taps, 6tap + ext v28.16b, v16.16b, v17.16b, #2 + ext v29.16b, v17.16b, v18.16b, #2 + ext v30.16b, v20.16b, v21.16b, #2 + ext v31.16b, v21.16b, v22.16b, #2 + mul v24.8h, v28.8h, v0.h[1] + mul v25.8h, v29.8h, v0.h[1] + mul v26.8h, v30.8h, v0.h[1] + mul v27.8h, v31.8h, v0.h[1] +.irpc i, 23456 + ext v28.16b, v16.16b, v17.16b, #(2*\i) + ext v29.16b, v17.16b, v18.16b, #(2*\i) + ext v30.16b, v20.16b, v21.16b, #(2*\i) + ext v31.16b, v21.16b, v22.16b, #(2*\i) + mla v24.8h, v28.8h, v0.h[\i] + mla v25.8h, v29.8h, v0.h[\i] + mla v26.8h, v30.8h, v0.h[\i] + mla v27.8h, v31.8h, v0.h[\i] +.endr 
+.else // 8tap mul v24.8h, v16.8h, v0.h[0] mul v25.8h, v17.8h, v0.h[0] mul v26.8h, v20.8h, v0.h[0] @@ -1537,6 +1604,7 @@ L(\type\()_8tap_h): mla v26.8h, v30.8h, v0.h[\i] mla v27.8h, v31.8h, v0.h[\i] .endr +.endif srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 srshr v26.8h, v26.8h, #2 @@ -1575,18 +1643,18 @@ L(\type\()_8tap_h): b.gt 161b ret -L(\type\()_8tap_h_tbl): - .hword L(\type\()_8tap_h_tbl) - 1280b - .hword L(\type\()_8tap_h_tbl) - 640b - .hword L(\type\()_8tap_h_tbl) - 320b - .hword L(\type\()_8tap_h_tbl) - 160b - .hword L(\type\()_8tap_h_tbl) - 80b - .hword L(\type\()_8tap_h_tbl) - 40b - .hword L(\type\()_8tap_h_tbl) - 20b +L(\type\()_\taps\()_h_tbl): + .hword L(\type\()_\taps\()_h_tbl) - 1280b + .hword L(\type\()_\taps\()_h_tbl) - 640b + .hword L(\type\()_\taps\()_h_tbl) - 320b + .hword L(\type\()_\taps\()_h_tbl) - 160b + .hword L(\type\()_\taps\()_h_tbl) - 80b + .hword L(\type\()_\taps\()_h_tbl) - 40b + .hword L(\type\()_\taps\()_h_tbl) - 20b .hword 0 -L(\type\()_8tap_v): +L(\type\()_\taps\()_v): cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f @@ -1595,7 +1663,7 @@ L(\type\()_8tap_v): 4: add \xmy, x10, \my, uxtw #3 - adr x9, L(\type\()_8tap_v_tbl) + adr x9, L(\type\()_\taps\()_v_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 @@ -1620,7 +1688,7 @@ L(\type\()_8tap_v): interleave_1_h v1, v2, v3, v4, v5 b.gt 24f uxtl_b v1, v2, v3, v4 - mul_mla_4 v6, v1, v2, v3, v4, .4h + mul_mla_4tap v6, v1, v2, v3, v4, .4h sqrshrun_b 6, v6 st_h \d_strd, v6, 2 ret @@ -1630,7 +1698,7 @@ L(\type\()_8tap_v): interleave_1_h v5, v6, v7 interleave_2_s v1, v2, v3, v4, v5, v6 uxtl_b v1, v2, v3, v4 - mul_mla_4 v6, v1, v2, v3, v4, .8h + mul_mla_4tap v6, v1, v2, v3, v4, .8h sqrshrun_b 6, v6 st_h \d_strd, v6, 4 ret @@ -1655,7 +1723,7 @@ L(\type\()_8tap_v): interleave_1_h v7, v16, v17, v18, v19 interleave_2_s v5, v6, v7, v16, v17, v18 uxtl_b v5, v6, v7, v16 - mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 + mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 4 b.le 0f @@ -1673,7 +1741,7 @@ L(\type\()_8tap_v): load_h \sr2, \src, \s_strd, v16, v17 interleave_1_h v7, v16, v17 uxtl_b v5, v6, v7, v16 - mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 + mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 2 0: @@ -1698,13 +1766,13 @@ L(\type\()_8tap_v): load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4 - mul_mla_4 v6, v1, v2, v3, v4, .8h + mul_mla_4tap v6, v1, v2, v3, v4, .8h shift_store_4 \type, \d_strd, v6 b.le 0f load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 uxtl_b v5, v6 - mul_mla_4 v7, v3, v4, v5, v6, .8h + mul_mla_4tap v7, v3, v4, v5, v6, .8h shift_store_4 \type, \d_strd, v7 0: ret @@ -1729,28 +1797,28 @@ L(\type\()_8tap_v): load_s \sr2, \src, \s_strd, v23, v24, v25, v26 interleave_1_s v22, v23, v24, v25, v26 uxtl_b v22, v23, v24, v25 - mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 shift_store_4 \type, \d_strd, v1, v2 b.le 0f load_s \sr2, \src, \s_strd, v27, v16 subs \h, \h, #2 interleave_1_s v26, v27, v16 uxtl_b v26, v27 - mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 + mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 shift_store_4 \type, \d_strd, v1 b.le 0f load_s \sr2, \src, \s_strd, v17, v18 subs \h, \h, #2 interleave_1_s v16, v17, v18 uxtl_b v16, v17 - mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 + 
mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 shift_store_4 \type, \d_strd, v2 b.le 0f subs \h, \h, #4 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 interleave_1_s v18, v19, v20, v21, v22 uxtl_b v18, v19, v20, v21 - mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 + mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 shift_store_4 \type, \d_strd, v1, v2 b.gt 48b 0: @@ -1773,14 +1841,14 @@ L(\type\()_8tap_v): load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4, v5 - mul_mla_4 v6, v1, v2, v3, v4, .8h - mul_mla_4 v7, v2, v3, v4, v5, .8h + mul_mla_4tap v6, v1, v2, v3, v4, .8h + mul_mla_4tap v7, v2, v3, v4, v5, .8h shift_store_8 \type, \d_strd, v6, v7 b.le 0f load_8b \sr2, \src, \s_strd, v6, v7 uxtl_b v6, v7 - mul_mla_4 v1, v3, v4, v5, v6, .8h - mul_mla_4 v2, v4, v5, v6, v7, .8h + mul_mla_4tap v1, v3, v4, v5, v6, .8h + mul_mla_4tap v2, v4, v5, v6, v7, .8h shift_store_8 \type, \d_strd, v1, v2 0: ret @@ -1809,32 +1877,32 @@ L(\type\()_8tap_v): subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v23, v24 uxtl_b v23, v24 - mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 + mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v25, v26 uxtl_b v25, v26 - mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 + mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v27, v16 uxtl_b v27, v16 - mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 + mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v17, v18 uxtl_b v17, v18 - mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 + mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #4 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 uxtl_b v19, v20, v21, v22 - mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 - mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 + mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 + mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.gt 88b 9: @@ -1882,10 +1950,10 @@ L(\type\()_8tap_v): uxtl2 v25.8h, v3.16b uxtl2 v26.8h, v4.16b uxtl2 v27.8h, v5.16b - mul_mla_4 v1, v16, v17, v18, v19, .8h - mul_mla_4 v16, v17, v18, v19, v20, .8h - mul_mla_4 v2, v23, v24, v25, v26, .8h - mul_mla_4 v17, v24, v25, v26, v27, .8h + mul_mla_4tap v1, v16, v17, v18, v19, .8h + mul_mla_4tap v16, v17, v18, v19, v20, .8h + mul_mla_4tap v2, v23, v24, v25, v26, .8h + mul_mla_4tap v17, v24, v25, v26, v27, .8h shift_store_16 \type, \d_strd, v1, v2, v16, v17 b.le 0f load_16b \sr2, \src, \s_strd, v6, v7 @@ -1893,25 +1961,25 @@ L(\type\()_8tap_v): uxtl v22.8h, v7.8b uxtl2 v28.8h, v6.16b uxtl2 v29.8h, v7.16b - mul_mla_4 v1, v18, v19, v20, v21, .8h - mul_mla_4 v3, v19, v20, v21, v22, .8h - mul_mla_4 v2, v25, v26, v27, v28, .8h - mul_mla_4 v4, v26, v27, v28, v29, .8h + mul_mla_4tap v1, v18, v19, v20, v21, .8h + mul_mla_4tap v3, v19, v20, v21, v22, .8h + mul_mla_4tap v2, v25, v26, v27, v28, .8h + mul_mla_4tap v4, v26, v27, v28, v29, .8h shift_store_16 \type, \d_strd, v1, v2, v3, v4 0: ret -L(\type\()_8tap_v_tbl): - .hword L(\type\()_8tap_v_tbl) 
- 1280b - .hword L(\type\()_8tap_v_tbl) - 640b - .hword L(\type\()_8tap_v_tbl) - 320b - .hword L(\type\()_8tap_v_tbl) - 160b - .hword L(\type\()_8tap_v_tbl) - 80b - .hword L(\type\()_8tap_v_tbl) - 40b - .hword L(\type\()_8tap_v_tbl) - 20b +L(\type\()_\taps\()_v_tbl): + .hword L(\type\()_\taps\()_v_tbl) - 1280b + .hword L(\type\()_\taps\()_v_tbl) - 640b + .hword L(\type\()_\taps\()_v_tbl) - 320b + .hword L(\type\()_\taps\()_v_tbl) - 160b + .hword L(\type\()_\taps\()_v_tbl) - 80b + .hword L(\type\()_\taps\()_v_tbl) - 40b + .hword L(\type\()_\taps\()_v_tbl) - 20b .hword 0 -L(\type\()_8tap_hv): +L(\type\()_\taps\()_hv): cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f @@ -1920,7 +1988,7 @@ L(\type\()_8tap_hv): 4: add \xmy, x10, \my, uxtw #3 - adr x9, L(\type\()_8tap_hv_tbl) + adr x9, L(\type\()_\taps\()_hv_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 @@ -1952,13 +2020,13 @@ L(\type\()_8tap_hv): addp v28.4h, v28.4h, v29.4h addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b 2: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v28.8b, #4 smull v2.4s, v16.4h, v1.h[0] @@ -1997,19 +2065,27 @@ L(\type\()_8tap_hv): addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v28.8b, #4 mov v19.8b, v28.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v20.8b, v19.8b, v28.8b, #4 mov v21.8b, v28.8b 28: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v22.8b, v21.8b, v28.8b, #4 +.ifc \taps, 6tap + smull v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -2018,6 +2094,7 @@ L(\type\()_8tap_hv): smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal v2.4s, v28.4h, v1.h[7] +.endif sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h @@ -2036,7 +2113,7 @@ L(\type\()_8tap_hv): 0: ret x15 -L(\type\()_8tap_filter_2): +L(\type\()_\taps\()_filter_2): ld1 {v28.8b}, [\sr2], \s_strd ld1 {v30.8b}, [\src], \s_strd uxtl v28.8h, v28.8b @@ -2083,12 +2160,12 @@ L(\type\()_8tap_filter_2): mla v31.4h, v30.4h, v0.h[3] srshr v16.4h, v31.4h, #2 - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b 4: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
@@ -2121,8 +2198,13 @@ L(\type\()_8tap_filter_2): 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #1 +.ifc \taps, 6tap + sub \sr2, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 +.else sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd +.endif add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 @@ -2139,20 +2221,38 @@ L(\type\()_8tap_filter_2): mla v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] +.ifc \taps, 6tap + srshr v18.4h, v31.4h, #2 +.else srshr v16.4h, v31.4h, #2 - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b - bl L(\type\()_8tap_filter_4) +.endif + bl L(\type\()_\taps\()_filter_4) mov v19.8b, v28.8b mov v20.8b, v29.8b - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v21.8b, v28.8b mov v22.8b, v29.8b 48: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) +.ifc \taps, 6tap + smull v2.4s, v18.4h, v1.h[1] + smlal v2.4s, v19.4h, v1.h[2] + smlal v2.4s, v20.4h, v1.h[3] + smlal v2.4s, v21.4h, v1.h[4] + smlal v2.4s, v22.4h, v1.h[5] + smlal v2.4s, v28.4h, v1.h[6] + smull v3.4s, v19.4h, v1.h[1] + smlal v3.4s, v20.4h, v1.h[2] + smlal v3.4s, v21.4h, v1.h[3] + smlal v3.4s, v22.4h, v1.h[4] + smlal v3.4s, v28.4h, v1.h[5] + smlal v3.4s, v29.4h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -2169,6 +2269,7 @@ L(\type\()_8tap_filter_2): smlal v3.4s, v22.4h, v1.h[5] smlal v3.4s, v28.4h, v1.h[6] smlal v3.4s, v29.4h, v1.h[7] +.endif sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn v3.4h, v3.4s, #\shift_hv subs \h, \h, #2 @@ -2182,8 +2283,10 @@ L(\type\()_8tap_filter_2): st1 {v3.4h}, [\ds2], \d_strd .endif b.le 0f +.ifc \taps, 8tap mov v16.8b, v18.8b mov v17.8b, v19.8b +.endif mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b @@ -2193,7 +2296,7 @@ L(\type\()_8tap_filter_2): 0: ret x15 -L(\type\()_8tap_filter_4): +L(\type\()_\taps\()_filter_4): ld1 {v26.8b}, [\sr2], \s_strd ld1 {v27.8b}, [\src], \s_strd uxtl v26.8h, v26.8b @@ -2237,15 +2340,15 @@ L(\type\()_8tap_filter_4): lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 - bl L(\type\()_8tap_filter_8_first) - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8_first) + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2303,7 +2406,9 @@ L(\type\()_8tap_filter_4): ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #3 +.ifc \taps, 8tap sub \src, \src, \s_strd +.endif sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b @@ -2316,21 +2421,52 @@ L(\type\()_8tap_filter_4): lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 - bl L(\type\()_8tap_filter_8_first) - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8_first) +.ifc \taps, 6tap + mov v18.16b, v16.16b +.else + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b - bl L(\type\()_8tap_filter_8) +.endif + bl L(\type\()_\taps\()_filter_8) mov v19.16b, v24.16b mov v20.16b, v25.16b - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v21.16b, v24.16b mov v22.16b, v25.16b 88: +.ifc \taps, 6tap + smull v2.4s, v18.4h, v1.h[1] + smull2 v3.4s, v18.8h, v1.h[1] + bl L(\type\()_\taps\()_filter_8) + smull v4.4s, v19.4h, v1.h[1] + smull2 v5.4s, v19.8h, v1.h[1] + smlal 
v2.4s, v19.4h, v1.h[2] + smlal2 v3.4s, v19.8h, v1.h[2] + smlal v4.4s, v20.4h, v1.h[2] + smlal2 v5.4s, v20.8h, v1.h[2] + smlal v2.4s, v20.4h, v1.h[3] + smlal2 v3.4s, v20.8h, v1.h[3] + smlal v4.4s, v21.4h, v1.h[3] + smlal2 v5.4s, v21.8h, v1.h[3] + smlal v2.4s, v21.4h, v1.h[4] + smlal2 v3.4s, v21.8h, v1.h[4] + smlal v4.4s, v22.4h, v1.h[4] + smlal2 v5.4s, v22.8h, v1.h[4] + smlal v2.4s, v22.4h, v1.h[5] + smlal2 v3.4s, v22.8h, v1.h[5] + smlal v4.4s, v24.4h, v1.h[5] + smlal2 v5.4s, v24.8h, v1.h[5] + smlal v2.4s, v24.4h, v1.h[6] + smlal2 v3.4s, v24.8h, v1.h[6] + smlal v4.4s, v25.4h, v1.h[6] + smlal2 v5.4s, v25.8h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2361,6 +2497,7 @@ L(\type\()_8tap_filter_4): smlal2 v3.4s, v24.8h, v1.h[7] smlal v4.4s, v25.4h, v1.h[7] smlal2 v5.4s, v25.8h, v1.h[7] +.endif sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn2 v2.8h, v3.4s, #\shift_hv sqrshrn v4.4h, v4.4s, #\shift_hv @@ -2376,8 +2513,10 @@ L(\type\()_8tap_filter_4): st1 {v4.8h}, [\ds2], \d_strd .endif b.le 9f +.ifc \taps, 8tap mov v16.16b, v18.16b mov v17.16b, v19.16b +.endif mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b @@ -2398,15 +2537,33 @@ L(\type\()_8tap_filter_4): add \dst, \dst, #8 .else add \dst, \dst, #16 +.endif +.ifc \taps, 6tap + add \src, \src, \s_strd, lsl #1 .endif b 168b 0: ret x15 -L(\type\()_8tap_filter_8_first): +L(\type\()_\taps\()_filter_8_first): ld1 {v28.8b, v29.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b +.ifc \taps, 6tap + ext v24.16b, v28.16b, v29.16b, #(2*1) + ext v25.16b, v28.16b, v29.16b, #(2*2) + ext v26.16b, v28.16b, v29.16b, #(2*3) + ext v27.16b, v28.16b, v29.16b, #(2*4) + mul v16.8h, v24.8h, v0.h[1] + mla v16.8h, v25.8h, v0.h[2] + mla v16.8h, v26.8h, v0.h[3] + mla v16.8h, v27.8h, v0.h[4] + ext v24.16b, v28.16b, v29.16b, #(2*5) + ext v25.16b, v28.16b, v29.16b, #(2*6) + ext v26.16b, v28.16b, v29.16b, #(2*7) + mla v16.8h, v24.8h, v0.h[5] + mla v16.8h, v25.8h, v0.h[6] +.else // 8tap mul v16.8h, v28.8h, v0.h[0] ext v24.16b, v28.16b, v29.16b, #(2*1) ext v25.16b, v28.16b, v29.16b, #(2*2) @@ -2422,16 +2579,29 @@ L(\type\()_8tap_filter_8_first): mla v16.8h, v24.8h, v0.h[5] mla v16.8h, v25.8h, v0.h[6] mla v16.8h, v26.8h, v0.h[7] +.endif srshr v16.8h, v16.8h, #2 ret -L(\type\()_8tap_filter_8): +L(\type\()_\taps\()_filter_8): ld1 {v28.8b, v29.8b}, [\sr2], \s_strd ld1 {v30.8b, v31.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b uxtl v30.8h, v30.8b uxtl v31.8h, v31.8b +.ifc \taps, 6tap + ext v26.16b, v28.16b, v29.16b, #2 + ext v27.16b, v30.16b, v31.16b, #2 + mul v24.8h, v26.8h, v0.h[1] + mul v25.8h, v27.8h, v0.h[1] +.irpc i, 23456 + ext v26.16b, v28.16b, v29.16b, #(2*\i) + ext v27.16b, v30.16b, v31.16b, #(2*\i) + mla v24.8h, v26.8h, v0.h[\i] + mla v25.8h, v27.8h, v0.h[\i] +.endr +.else // 8tap mul v24.8h, v28.8h, v0.h[0] mul v25.8h, v30.8h, v0.h[0] .irpc i, 1234567 @@ -2440,22 +2610,25 @@ L(\type\()_8tap_filter_8): mla v24.8h, v26.8h, v0.h[\i] mla v25.8h, v27.8h, v0.h[\i] .endr +.endif srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 ret -L(\type\()_8tap_hv_tbl): - .hword L(\type\()_8tap_hv_tbl) - 1280b - .hword L(\type\()_8tap_hv_tbl) - 640b - .hword L(\type\()_8tap_hv_tbl) - 320b - .hword L(\type\()_8tap_hv_tbl) - 160b - .hword L(\type\()_8tap_hv_tbl) - 80b - .hword L(\type\()_8tap_hv_tbl) - 40b - .hword L(\type\()_8tap_hv_tbl) - 20b 
+L(\type\()_\taps\()_hv_tbl): + .hword L(\type\()_\taps\()_hv_tbl) - 1280b + .hword L(\type\()_\taps\()_hv_tbl) - 640b + .hword L(\type\()_\taps\()_hv_tbl) - 320b + .hword L(\type\()_\taps\()_hv_tbl) - 160b + .hword L(\type\()_\taps\()_hv_tbl) - 80b + .hword L(\type\()_\taps\()_hv_tbl) - 40b + .hword L(\type\()_\taps\()_hv_tbl) - 20b .hword 0 endfunc +.endm +.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv function \type\()_bilin_8bpc_neon, export=1 dup v1.16b, \mx dup v3.16b, \my @@ -2987,8 +3160,34 @@ L(\type\()_bilin_hv_tbl): endfunc .endm -filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 -filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 +make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn put, sharp, SHARP, SHARP, 8tap +make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap + +make_8tap_fn put, regular, REGULAR, REGULAR, 6tap +make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap +filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 + +make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn prep, sharp, SHARP, SHARP, 8tap +make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 8tap + +make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap +make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 6tap +filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 + .macro load_filter_row dst, src, inc asr w13, \src, #10 diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S index 1bfb12ebb3..576fab158a 100644 --- a/third_party/dav1d/src/arm/64/mc16.S +++ b/third_party/dav1d/src/arm/64/mc16.S @@ -1374,19 +1374,35 @@ endfunc sub \r3\wd, \r3\wd, \c\wd .endif .endm -.macro smull_smlal_4 d, s0, s1, s2, s3 +.macro smull_smlal_4tap d, s0, s1, s2, s3 smull \d\().4s, \s0\().4h, v0.h[0] smlal \d\().4s, \s1\().4h, v0.h[1] smlal \d\().4s, \s2\().4h, v0.h[2] smlal \d\().4s, \s3\().4h, v0.h[3] .endm -.macro smull2_smlal2_4 d, s0, s1, s2, s3 +.macro smull2_smlal2_4tap d, s0, s1, s2, s3 smull2 \d\().4s, \s0\().8h, v0.h[0] smlal2 \d\().4s, \s1\().8h, v0.h[1] smlal2 \d\().4s, \s2\().8h, v0.h[2] smlal2 \d\().4s, \s3\().8h, v0.h[3] .endm -.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 +.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7 + smull \d\().4s, \s1\().4h, v0.h[1] + smlal \d\().4s, \s2\().4h, v0.h[2] + smlal \d\().4s, \s3\().4h, v0.h[3] + smlal \d\().4s, \s4\().4h, v0.h[4] + smlal \d\().4s, \s5\().4h, v0.h[5] + smlal \d\().4s, \s6\().4h, v0.h[6] +.endm +.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7 + smull2 \d\().4s, \s1\().8h, v0.h[1] + smlal2 \d\().4s, \s2\().8h, v0.h[2] + smlal2 \d\().4s, \s3\().8h, v0.h[3] + smlal2 \d\().4s, \s4\().8h, v0.h[4] + smlal2 \d\().4s, \s5\().8h, v0.h[5] + smlal2 \d\().4s, 
\s6\().8h, v0.h[6] +.endm +.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7 smull \d\().4s, \s0\().4h, v0.h[0] smlal \d\().4s, \s1\().4h, v0.h[1] smlal \d\().4s, \s2\().4h, v0.h[2] @@ -1396,7 +1412,7 @@ endfunc smlal \d\().4s, \s6\().4h, v0.h[6] smlal \d\().4s, \s7\().4h, v0.h[7] .endm -.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7 +.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7 smull2 \d\().4s, \s0\().8h, v0.h[0] smlal2 \d\().4s, \s1\().8h, v0.h[1] smlal2 \d\().4s, \s2\().8h, v0.h[2] @@ -1499,11 +1515,11 @@ endfunc st1 {\r0\().8h, \r1\().8h}, [\dst], \strd .endm -.macro make_8tap_fn op, type, type_h, type_v +.macro make_8tap_fn op, type, type_h, type_v, taps function \op\()_8tap_\type\()_16bpc_neon, export=1 mov w9, \type_h mov w10, \type_v - b \op\()_8tap_neon + b \op\()_\taps\()_neon endfunc .endm @@ -1512,18 +1528,8 @@ endfunc #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) -.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2 -make_8tap_fn \type, regular, REGULAR, REGULAR -make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH -make_8tap_fn \type, regular_sharp, REGULAR, SHARP -make_8tap_fn \type, smooth, SMOOTH, SMOOTH -make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR -make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP -make_8tap_fn \type, sharp, SHARP, SHARP -make_8tap_fn \type, sharp_regular, SHARP, REGULAR -make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH - -function \type\()_8tap_neon +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps +function \type\()_\taps\()_neon .ifc \bdmax, w8 ldr w8, [sp] .endif @@ -1547,12 +1553,12 @@ function \type\()_8tap_neon add w13, w12, \bdmax // 6 + intermediate_bits sub w12, w12, \bdmax // 6 - intermediate_bits movrel x11, X(mc_subpel_filters), -8 - b.ne L(\type\()_8tap_h) + b.ne L(\type\()_\taps\()_h) tst \my, #(0x7f << 14) - b.ne L(\type\()_8tap_v) + b.ne L(\type\()_\taps\()_v) b \type\()_neon -L(\type\()_8tap_h): +L(\type\()_\taps\()_h): cmp \w, #4 ubfx w10, \mx, #7, #7 and \mx, \mx, #0x7f @@ -1561,9 +1567,9 @@ L(\type\()_8tap_h): 4: tst \my, #(0x7f << 14) add \xmx, x11, \mx, uxtw #3 - b.ne L(\type\()_8tap_hv) + b.ne L(\type\()_\taps\()_hv) - adr x10, L(\type\()_8tap_h_tbl) + adr x10, L(\type\()_\taps\()_h_tbl) dup v30.4s, w12 // 6 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v30.4s, v30.4s // -(6-intermediate_bits) @@ -1682,6 +1688,22 @@ L(\type\()_8tap_h): mov \mx, \w 8: +.ifc \taps, 6tap + ext v24.16b, v16.16b, v17.16b, #2 + ext v25.16b, v20.16b, v21.16b, #2 + smull v18.4s, v24.4h, v0.h[1] + smull2 v19.4s, v24.8h, v0.h[1] + smull v22.4s, v25.4h, v0.h[1] + smull2 v23.4s, v25.8h, v0.h[1] +.irpc i, 23456 + ext v24.16b, v16.16b, v17.16b, #(2*\i) + ext v25.16b, v20.16b, v21.16b, #(2*\i) + smlal v18.4s, v24.4h, v0.h[\i] + smlal2 v19.4s, v24.8h, v0.h[\i] + smlal v22.4s, v25.4h, v0.h[\i] + smlal2 v23.4s, v25.8h, v0.h[\i] +.endr +.else // 8tap smull v18.4s, v16.4h, v0.h[0] smull2 v19.4s, v16.8h, v0.h[0] smull v22.4s, v20.4h, v0.h[0] @@ -1694,6 +1716,7 @@ L(\type\()_8tap_h): smlal v22.4s, v25.4h, v0.h[\i] smlal2 v23.4s, v25.8h, v0.h[\i] .endr +.endif subs \mx, \mx, #8 srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits) srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits) @@ -1734,18 +1757,18 @@ L(\type\()_8tap_h): b.gt 81b ret -L(\type\()_8tap_h_tbl): - .hword L(\type\()_8tap_h_tbl) - 1280b - .hword L(\type\()_8tap_h_tbl) - 640b - .hword L(\type\()_8tap_h_tbl) - 320b - .hword L(\type\()_8tap_h_tbl) - 160b - .hword 
L(\type\()_8tap_h_tbl) - 80b - .hword L(\type\()_8tap_h_tbl) - 40b - .hword L(\type\()_8tap_h_tbl) - 20b +L(\type\()_\taps\()_h_tbl): + .hword L(\type\()_\taps\()_h_tbl) - 1280b + .hword L(\type\()_\taps\()_h_tbl) - 640b + .hword L(\type\()_\taps\()_h_tbl) - 320b + .hword L(\type\()_\taps\()_h_tbl) - 160b + .hword L(\type\()_\taps\()_h_tbl) - 80b + .hword L(\type\()_\taps\()_h_tbl) - 40b + .hword L(\type\()_\taps\()_h_tbl) - 20b .hword 0 -L(\type\()_8tap_v): +L(\type\()_\taps\()_v): cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f @@ -1758,7 +1781,7 @@ L(\type\()_8tap_v): dup v30.4s, w12 // 6 - intermediate_bits movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif - adr x10, L(\type\()_8tap_v_tbl) + adr x10, L(\type\()_\taps\()_v_tbl) ldrh w9, [x10, x9, lsl #1] .ifc \type, prep neg v30.4s, v30.4s // -(6-intermediate_bits) @@ -1785,7 +1808,7 @@ L(\type\()_8tap_v): load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 b.gt 24f - smull_smlal_4 v6, v1, v2, v3, v4 + smull_smlal_4tap v6, v1, v2, v3, v4 sqrshrun_h 6, v6 umin_h v31, .8h, v6 st_s \d_strd, v6, 2 @@ -1794,8 +1817,8 @@ L(\type\()_8tap_v): 24: // 2x4 v load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 - smull_smlal_4 v16, v1, v2, v3, v4 - smull_smlal_4 v17, v3, v4, v5, v6 + smull_smlal_4tap v16, v1, v2, v3, v4 + smull_smlal_4tap v17, v3, v4, v5, v6 sqrshrun_h 6, v16, v17 umin_h v31, .8h, v16 st_s \d_strd, v16, 4 @@ -1817,8 +1840,8 @@ L(\type\()_8tap_v): subs \h, \h, #4 load_s \sr2, \src, \s_strd, v16, v17, v18, v19 interleave_1_s v7, v16, v17, v18, v19 - smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 - smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18 + smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16 + smull_smlal_\taps v25, v3, v4, v5, v6, v7, v16, v17, v18 sqrshrun_h 6, v24, v25 umin_h v31, .8h, v24 st_s \d_strd, v24, 4 @@ -1836,7 +1859,7 @@ L(\type\()_8tap_v): 26: load_s \sr2, \src, \s_strd, v16, v17 interleave_1_s v7, v16, v17 - smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 + smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_h 6, v24 umin_h v31, .4h, v24 st_s \d_strd, v24, 2 @@ -1860,13 +1883,13 @@ L(\type\()_8tap_v): sxtl v0.8h, v0.8b load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 - smull_smlal_4 v6, v1, v2, v3, v4 - smull_smlal_4 v7, v2, v3, v4, v5 + smull_smlal_4tap v6, v1, v2, v3, v4 + smull_smlal_4tap v7, v2, v3, v4, v5 shift_store_4 \type, \d_strd, v6, v7 b.le 0f load_4h \sr2, \src, \s_strd, v6, v7 - smull_smlal_4 v1, v3, v4, v5, v6 - smull_smlal_4 v2, v4, v5, v6, v7 + smull_smlal_4tap v1, v3, v4, v5, v6 + smull_smlal_4tap v2, v4, v5, v6, v7 shift_store_4 \type, \d_strd, v1, v2 0: ret @@ -1885,10 +1908,10 @@ L(\type\()_8tap_v): 48: subs \h, \h, #4 load_4h \sr2, \src, \s_strd, v23, v24, v25, v26 - smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 - smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 - smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25 - smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 + smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24 + smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25 + smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_4 \type, \d_strd, v1, v2, v3, v4 b.le 0f cmp \h, #2 @@ -1903,8 +1926,8 @@ L(\type\()_8tap_v): b 48b 46: load_4h \sr2, \src, \s_strd, v23, v24 - smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 - smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 + 
smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_4 \type, \d_strd, v1, v2 0: ret @@ -1925,17 +1948,17 @@ L(\type\()_8tap_v): sxtl v0.8h, v0.8b load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 - smull_smlal_4 v16, v1, v2, v3, v4 - smull2_smlal2_4 v17, v1, v2, v3, v4 - smull_smlal_4 v18, v2, v3, v4, v5 - smull2_smlal2_4 v19, v2, v3, v4, v5 + smull_smlal_4tap v16, v1, v2, v3, v4 + smull2_smlal2_4tap v17, v1, v2, v3, v4 + smull_smlal_4tap v18, v2, v3, v4, v5 + smull2_smlal2_4tap v19, v2, v3, v4, v5 shift_store_8 \type, \d_strd, v16, v17, v18, v19 b.le 0f load_8h \sr2, \src, \s_strd, v6, v7 - smull_smlal_4 v16, v3, v4, v5, v6 - smull2_smlal2_4 v17, v3, v4, v5, v6 - smull_smlal_4 v18, v4, v5, v6, v7 - smull2_smlal2_4 v19, v4, v5, v6, v7 + smull_smlal_4tap v16, v3, v4, v5, v6 + smull2_smlal2_4tap v17, v3, v4, v5, v6 + smull_smlal_4tap v18, v4, v5, v6, v7 + smull2_smlal2_4tap v19, v4, v5, v6, v7 shift_store_8 \type, \d_strd, v16, v17, v18, v19 0: ret @@ -1962,18 +1985,18 @@ L(\type\()_8tap_v): 88: subs \h, \h, #2 load_8h \sr2, \src, \s_strd, v23, v24 - smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 - smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23 - smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24 - smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24 + smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_\taps v3, v17, v18, v19, v20, v21, v22, v23, v24 + smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.le 9f subs \h, \h, #2 load_8h \sr2, \src, \s_strd, v25, v26 - smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25 - smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25 - smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26 - smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 + smull_smlal_\taps v1, v18, v19, v20, v21, v22, v23, v24, v25 + smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25 + smull_smlal_\taps v3, v19, v20, v21, v22, v23, v24, v25, v26 + smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.le 9f mov v16.16b, v20.16b @@ -2013,10 +2036,10 @@ L(\type\()_8tap_v): 16: load_16h \src, \src, \s_strd, v22, v23 subs \h, \h, #1 - smull_smlal_4 v1, v16, v18, v20, v22 - smull2_smlal2_4 v2, v16, v18, v20, v22 - smull_smlal_4 v3, v17, v19, v21, v23 - smull2_smlal2_4 v4, v17, v19, v21, v23 + smull_smlal_4tap v1, v16, v18, v20, v22 + smull2_smlal2_4tap v2, v16, v18, v20, v22 + smull_smlal_4tap v3, v17, v19, v21, v23 + smull2_smlal2_4tap v4, v17, v19, v21, v23 shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4 b.le 0f mov v16.16b, v18.16b @@ -2029,17 +2052,17 @@ L(\type\()_8tap_v): 0: ret -L(\type\()_8tap_v_tbl): - .hword L(\type\()_8tap_v_tbl) - 1280b - .hword L(\type\()_8tap_v_tbl) - 640b - .hword L(\type\()_8tap_v_tbl) - 320b - .hword L(\type\()_8tap_v_tbl) - 160b - .hword L(\type\()_8tap_v_tbl) - 80b - .hword L(\type\()_8tap_v_tbl) - 40b - .hword L(\type\()_8tap_v_tbl) - 20b +L(\type\()_\taps\()_v_tbl): + .hword L(\type\()_\taps\()_v_tbl) - 1280b + .hword L(\type\()_\taps\()_v_tbl) - 640b + .hword L(\type\()_\taps\()_v_tbl) - 320b + .hword L(\type\()_\taps\()_v_tbl) - 160b + .hword L(\type\()_\taps\()_v_tbl) - 80b + .hword L(\type\()_\taps\()_v_tbl) - 40b + .hword L(\type\()_\taps\()_v_tbl) - 20b .hword 0 -L(\type\()_8tap_hv): 
+L(\type\()_\taps\()_hv): cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f @@ -2048,7 +2071,7 @@ L(\type\()_8tap_hv): 4: add \xmy, x11, \my, uxtw #3 - adr x10, L(\type\()_8tap_hv_tbl) + adr x10, L(\type\()_\taps\()_hv_tbl) dup v30.4s, w12 // 6 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v30.4s, v30.4s // -(6-intermediate_bits) @@ -2089,7 +2112,7 @@ L(\type\()_8tap_hv): addp v27.4s, v27.4s, v28.4s addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores @@ -2100,7 +2123,7 @@ L(\type\()_8tap_hv): mov v17.8b, v24.8b 2: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v24.8b, #4 smull v2.4s, v16.4h, v1.h[0] @@ -2143,20 +2166,28 @@ L(\type\()_8tap_hv): // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) xtn v16.4h, v16.4s trn1 v16.2s, v16.2s, v24.2s mov v17.8b, v24.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v24.8b, #4 mov v19.8b, v24.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v20.8b, v19.8b, v24.8b, #4 mov v21.8b, v24.8b 28: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v22.8b, v21.8b, v24.8b, #4 +.ifc \taps, 6tap + smull v3.4s, v17.4h, v1.h[1] + smlal v3.4s, v18.4h, v1.h[2] + smlal v3.4s, v19.4h, v1.h[3] + smlal v3.4s, v20.4h, v1.h[4] + smlal v3.4s, v21.4h, v1.h[5] + smlal v3.4s, v22.4h, v1.h[6] +.else // 8tap smull v3.4s, v16.4h, v1.h[0] smlal v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] @@ -2165,6 +2196,7 @@ L(\type\()_8tap_hv): smlal v3.4s, v21.4h, v1.h[5] smlal v3.4s, v22.4h, v1.h[6] smlal v3.4s, v24.4h, v1.h[7] +.endif srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) sqxtun v3.4h, v3.4s @@ -2184,7 +2216,7 @@ L(\type\()_8tap_hv): 0: ret x15 -L(\type\()_8tap_filter_2): +L(\type\()_\taps\()_filter_2): ld1 {v25.8h}, [\sr2], \s_strd ld1 {v27.8h}, [\src], \s_strd ext v26.16b, v25.16b, v25.16b, #2 @@ -2234,12 +2266,12 @@ L(\type\()_8tap_filter_2): // (at the cost of a smaller slowdown on in-order cores such as A53). xtn v16.4h, v16.4s - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v24.8b mov v18.8b, v25.8b 4: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -2272,8 +2304,13 @@ L(\type\()_8tap_filter_2): 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #2 +.ifc \taps, 6tap + sub \sr2, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 +.else sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd +.endif add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 @@ -2294,20 +2331,38 @@ L(\type\()_8tap_filter_2): // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). 
+.ifc \taps, 6tap + xtn v18.4h, v16.4s +.else xtn v16.4h, v16.4s - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v24.8b mov v18.8b, v25.8b - bl L(\type\()_8tap_filter_4) +.endif + bl L(\type\()_\taps\()_filter_4) mov v19.8b, v24.8b mov v20.8b, v25.8b - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v21.8b, v24.8b mov v22.8b, v25.8b 48: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) +.ifc \taps, 6tap + smull v3.4s, v18.4h, v1.h[1] + smlal v3.4s, v19.4h, v1.h[2] + smlal v3.4s, v20.4h, v1.h[3] + smlal v3.4s, v21.4h, v1.h[4] + smlal v3.4s, v22.4h, v1.h[5] + smlal v3.4s, v24.4h, v1.h[6] + smull v4.4s, v19.4h, v1.h[1] + smlal v4.4s, v20.4h, v1.h[2] + smlal v4.4s, v21.4h, v1.h[3] + smlal v4.4s, v22.4h, v1.h[4] + smlal v4.4s, v24.4h, v1.h[5] + smlal v4.4s, v25.4h, v1.h[6] +.else // 8tap smull v3.4s, v16.4h, v1.h[0] smlal v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] @@ -2324,6 +2379,7 @@ L(\type\()_8tap_filter_2): smlal v4.4s, v22.4h, v1.h[5] smlal v4.4s, v24.4h, v1.h[6] smlal v4.4s, v25.4h, v1.h[7] +.endif .ifc \type, put srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) @@ -2339,8 +2395,10 @@ L(\type\()_8tap_filter_2): st1 {v3.d}[0], [\dst], \d_strd st1 {v3.d}[1], [\ds2], \d_strd b.le 0f +.ifc \taps, 8tap mov v16.8b, v18.8b mov v17.8b, v19.8b +.endif mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b @@ -2350,7 +2408,7 @@ L(\type\()_8tap_filter_2): 0: ret x15 -L(\type\()_8tap_filter_4): +L(\type\()_\taps\()_filter_4): ld1 {v24.8h}, [\sr2], \s_strd ld1 {v25.8h}, [\src], \s_strd ext v26.16b, v24.16b, v24.16b, #2 @@ -2411,14 +2469,14 @@ L(\type\()_8tap_filter_4): // and conserves register space (no need to clobber v8-v15). uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v23.16b mov v18.16b, v24.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2480,7 +2538,9 @@ L(\type\()_8tap_filter_4): ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #6 +.ifc \taps, 8tap sub \src, \src, \s_strd +.endif sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b @@ -2494,6 +2554,16 @@ L(\type\()_8tap_filter_4): lsl \s_strd, \s_strd, #1 ld1 {v27.8h, v28.8h}, [\src], \s_strd +.ifc \taps, 6tap + ext v26.16b, v27.16b, v28.16b, #2 + smull v24.4s, v26.4h, v0.h[1] + smull2 v25.4s, v26.8h, v0.h[1] +.irpc i, 23456 + ext v26.16b, v27.16b, v28.16b, #(2*\i) + smlal v24.4s, v26.4h, v0.h[\i] + smlal2 v25.4s, v26.8h, v0.h[\i] +.endr +.else // 8tap smull v24.4s, v27.4h, v0.h[0] smull2 v25.4s, v27.8h, v0.h[0] .irpc i, 1234567 @@ -2501,6 +2571,7 @@ L(\type\()_8tap_filter_4): smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] .endr +.endif srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without @@ -2508,22 +2579,53 @@ L(\type\()_8tap_filter_4): // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53), // and conserves register space (no need to clobber v8-v15). 
+.ifc \taps, 6tap + uzp1 v18.8h, v24.8h, v25.8h // Same as xtn, xtn2 +.else uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v23.16b mov v18.16b, v24.16b - bl L(\type\()_8tap_filter_8) +.endif + bl L(\type\()_\taps\()_filter_8) mov v19.16b, v23.16b mov v20.16b, v24.16b - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v21.16b, v23.16b mov v22.16b, v24.16b 88: +.ifc \taps, 6tap + smull v2.4s, v18.4h, v1.h[1] + smull2 v3.4s, v18.8h, v1.h[1] + bl L(\type\()_\taps\()_filter_8) + smull v4.4s, v19.4h, v1.h[1] + smull2 v5.4s, v19.8h, v1.h[1] + smlal v2.4s, v19.4h, v1.h[2] + smlal2 v3.4s, v19.8h, v1.h[2] + smlal v4.4s, v20.4h, v1.h[2] + smlal2 v5.4s, v20.8h, v1.h[2] + smlal v2.4s, v20.4h, v1.h[3] + smlal2 v3.4s, v20.8h, v1.h[3] + smlal v4.4s, v21.4h, v1.h[3] + smlal2 v5.4s, v21.8h, v1.h[3] + smlal v2.4s, v21.4h, v1.h[4] + smlal2 v3.4s, v21.8h, v1.h[4] + smlal v4.4s, v22.4h, v1.h[4] + smlal2 v5.4s, v22.8h, v1.h[4] + smlal v2.4s, v22.4h, v1.h[5] + smlal2 v3.4s, v22.8h, v1.h[5] + smlal v4.4s, v23.4h, v1.h[5] + smlal2 v5.4s, v23.8h, v1.h[5] + smlal v2.4s, v23.4h, v1.h[6] + smlal2 v3.4s, v23.8h, v1.h[6] + smlal v4.4s, v24.4h, v1.h[6] + smlal2 v5.4s, v24.8h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2554,6 +2656,7 @@ L(\type\()_8tap_filter_4): smlal2 v3.4s, v23.8h, v1.h[7] smlal v4.4s, v24.4h, v1.h[7] smlal2 v5.4s, v24.8h, v1.h[7] +.endif .ifc \type, put srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) @@ -2577,8 +2680,10 @@ L(\type\()_8tap_filter_4): st1 {v2.8h}, [\dst], \d_strd st1 {v3.8h}, [\ds2], \d_strd b.le 9f +.ifc \taps, 8tap mov v16.16b, v18.16b mov v17.16b, v19.16b +.endif mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b @@ -2596,13 +2701,32 @@ L(\type\()_8tap_filter_4): mov \h, \my add \src, \src, #16 add \dst, \dst, #16 +.ifc \taps, 6tap + add \src, \src, \s_strd, lsl #1 +.endif b 168b 0: ret x15 -L(\type\()_8tap_filter_8): +L(\type\()_\taps\()_filter_8): ld1 {v4.8h, v5.8h}, [\sr2], \s_strd ld1 {v6.8h, v7.8h}, [\src], \s_strd +.ifc \taps, 6tap + ext v23.16b, v4.16b, v5.16b, #2 + ext v24.16b, v6.16b, v7.16b, #2 + smull v25.4s, v23.4h, v0.h[1] + smull2 v26.4s, v23.8h, v0.h[1] + smull v27.4s, v24.4h, v0.h[1] + smull2 v28.4s, v24.8h, v0.h[1] +.irpc i, 23456 + ext v23.16b, v4.16b, v5.16b, #(2*\i) + ext v24.16b, v6.16b, v7.16b, #(2*\i) + smlal v25.4s, v23.4h, v0.h[\i] + smlal2 v26.4s, v23.8h, v0.h[\i] + smlal v27.4s, v24.4h, v0.h[\i] + smlal2 v28.4s, v24.8h, v0.h[\i] +.endr +.else // 8tap smull v25.4s, v4.4h, v0.h[0] smull2 v26.4s, v4.8h, v0.h[0] smull v27.4s, v6.4h, v0.h[0] @@ -2615,6 +2739,7 @@ L(\type\()_8tap_filter_8): smlal v27.4s, v24.4h, v0.h[\i] smlal2 v28.4s, v24.8h, v0.h[\i] .endr +.endif srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits) srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits) @@ -2623,18 +2748,20 @@ L(\type\()_8tap_filter_8): uzp1 v24.8h, v27.8h, v28.8h // Ditto ret -L(\type\()_8tap_hv_tbl): - .hword L(\type\()_8tap_hv_tbl) - 1280b - .hword L(\type\()_8tap_hv_tbl) - 640b - .hword L(\type\()_8tap_hv_tbl) - 320b - .hword L(\type\()_8tap_hv_tbl) - 160b - .hword L(\type\()_8tap_hv_tbl) - 80b - .hword L(\type\()_8tap_hv_tbl) - 40b - .hword 
L(\type\()_8tap_hv_tbl) - 20b +L(\type\()_\taps\()_hv_tbl): + .hword L(\type\()_\taps\()_hv_tbl) - 1280b + .hword L(\type\()_\taps\()_hv_tbl) - 640b + .hword L(\type\()_\taps\()_hv_tbl) - 320b + .hword L(\type\()_\taps\()_hv_tbl) - 160b + .hword L(\type\()_\taps\()_hv_tbl) - 80b + .hword L(\type\()_\taps\()_hv_tbl) - 40b + .hword L(\type\()_\taps\()_hv_tbl) - 20b .hword 0 endfunc +.endm +.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2 function \type\()_bilin_16bpc_neon, export=1 .ifc \bdmax, w8 ldr w8, [sp] @@ -3236,8 +3363,34 @@ L(\type\()_bilin_hv_tbl): endfunc .endm -filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10 -filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10 +make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn put, sharp, SHARP, SHARP, 8tap +make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap + +make_8tap_fn put, regular, REGULAR, REGULAR, 6tap +make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap +filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10 + +make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn prep, sharp, SHARP, SHARP, 8tap +make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap + +make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap +make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap +filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10 + .macro load_filter_row dst, src, inc asr w13, \src, #10 diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S index 3a6cf900a9..7bef9243fb 100644 --- a/third_party/dav1d/src/arm/64/msac.S +++ b/third_party/dav1d/src/arm/64/msac.S @@ -208,60 +208,66 @@ L(renorm): sub w4, w4, w3 // rng = u - v clz w5, w4 // clz(rng) eor w5, w5, #16 // d = clz(rng) ^ 16 - mvn x7, x7 // ~dif - add x7, x7, x3, lsl #48 // ~dif + (v << 48) + sub x7, x7, x3, lsl #48 // dif - (v << 48) L(renorm2): lsl w4, w4, w5 // rng << d subs w6, w6, w5 // cnt -= d - lsl x7, x7, x5 // (~dif + (v << 48)) << d + lsl x7, x7, x5 // (dif - (v << 48)) << d str w4, [x0, #RNG] - mvn x7, x7 // ~dif - b.hs 9f + b.hs 4f // refill ldp x3, x4, [x0] // BUF_POS, BUF_END add x5, x3, #8 - cmp x5, x4 - b.gt 2f - - ldr x3, [x3] // next_bits - add w8, w6, #23 // shift_bits = cnt + 23 - add w6, w6, #16 // cnt += 16 - rev x3, x3 // next_bits = bswap(next_bits) - sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 - and w8, w8, #24 // shift_bits &= 24 - lsr x3, x3, x8 // next_bits >>= shift_bits - sub w8, w8, w6 // shift_bits -= 16 + cnt - str x5, [x0, #BUF_POS] - lsl x3, x3, x8 // next_bits <<= shift_bits - mov w4, #48 - sub w6, w4, w8 // cnt = cnt + 64 - shift_bits - eor x7, x7, x3 // dif ^= next_bits - b 9f - -2: // refill_eob - mov w14, #40 - sub w5, w14, w6 // c = 40 - cnt -3: - cmp 
x3, x4 - b.ge 4f - ldrb w8, [x3], #1 - lsl x8, x8, x5 - eor x7, x7, x8 - subs w5, w5, #8 - b.ge 3b - -4: // refill_eob_end + subs x5, x5, x4 + b.hi 6f + + ldr x8, [x3] // next_bits + add w4, w6, #-48 // shift_bits = cnt + 16 (- 64) + mvn x8, x8 + neg w5, w4 + rev x8, x8 // next_bits = bswap(next_bits) + lsr w5, w5, #3 // num_bytes_read + lsr x8, x8, x4 // next_bits >>= (shift_bits & 63) + +2: // refill_end + add x3, x3, x5 + add w6, w6, w5, lsl #3 // cnt += num_bits_read str x3, [x0, #BUF_POS] - sub w6, w14, w5 // cnt = 40 - c -9: +3: // refill_end2 + orr x7, x7, x8 // dif |= next_bits + +4: // end str w6, [x0, #CNT] str x7, [x0, #DIF] mov w0, w15 add sp, sp, #48 ret + +5: // pad_with_ones + add w8, w6, #-16 + ror x8, x8, x8 + b 3b + +6: // refill_eob + cmp x3, x4 + b.hs 5b + + ldr x8, [x4, #-8] + lsl w5, w5, #3 + lsr x8, x8, x5 + add w5, w6, #-48 + mvn x8, x8 + sub w4, w4, w3 // num_bytes_left + rev x8, x8 + lsr x8, x8, x5 + neg w5, w5 + lsr w5, w5, #3 + cmp w5, w4 + csel w5, w5, w4, lo // num_bytes_read + b 2b endfunc function msac_decode_symbol_adapt8_neon, export=1 @@ -334,54 +340,37 @@ function msac_decode_hi_tok_neon, export=1 sub w4, w4, w3 // rng = u - v clz w5, w4 // clz(rng) eor w5, w5, #16 // d = clz(rng) ^ 16 - mvn x7, x7 // ~dif - add x7, x7, x3, lsl #48 // ~dif + (v << 48) + sub x7, x7, x3, lsl #48 // dif - (v << 48) lsl w4, w4, w5 // rng << d subs w6, w6, w5 // cnt -= d - lsl x7, x7, x5 // (~dif + (v << 48)) << d + lsl x7, x7, x5 // (dif - (v << 48)) << d str w4, [x0, #RNG] dup v3.4h, w4 - mvn x7, x7 // ~dif - b.hs 9f + b.hs 5f // refill ldp x3, x4, [x0] // BUF_POS, BUF_END add x5, x3, #8 - cmp x5, x4 - b.gt 2f - - ldr x3, [x3] // next_bits - add w8, w6, #23 // shift_bits = cnt + 23 - add w6, w6, #16 // cnt += 16 - rev x3, x3 // next_bits = bswap(next_bits) - sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 - and w8, w8, #24 // shift_bits &= 24 - lsr x3, x3, x8 // next_bits >>= shift_bits - sub w8, w8, w6 // shift_bits -= 16 + cnt - str x5, [x0, #BUF_POS] - lsl x3, x3, x8 // next_bits <<= shift_bits - mov w4, #48 - sub w6, w4, w8 // cnt = cnt + 64 - shift_bits - eor x7, x7, x3 // dif ^= next_bits - b 9f - -2: // refill_eob - mov w14, #40 - sub w5, w14, w6 // c = 40 - cnt -3: - cmp x3, x4 - b.ge 4f - ldrb w8, [x3], #1 - lsl x8, x8, x5 - eor x7, x7, x8 - subs w5, w5, #8 - b.ge 3b - -4: // refill_eob_end + subs x5, x5, x4 + b.hi 7f + + ldr x8, [x3] // next_bits + add w4, w6, #-48 // shift_bits = cnt + 16 (- 64) + mvn x8, x8 + neg w5, w4 + rev x8, x8 // next_bits = bswap(next_bits) + lsr w5, w5, #3 // num_bytes_read + lsr x8, x8, x4 // next_bits >>= (shift_bits & 63) + +3: // refill_end + add x3, x3, x5 + add w6, w6, w5, lsl #3 // cnt += num_bits_read str x3, [x0, #BUF_POS] - sub w6, w14, w5 // cnt = 40 - c -9: +4: // refill_end2 + orr x7, x7, x8 // dif |= next_bits + +5: // end lsl w15, w15, #1 sub w15, w15, #5 lsr x12, x7, #48 @@ -394,6 +383,29 @@ function msac_decode_hi_tok_neon, export=1 str x7, [x0, #DIF] lsr w0, w13, #1 ret + +6: // pad_with_ones + add w8, w6, #-16 + ror x8, x8, x8 + b 4b + +7: // refill_eob + cmp x3, x4 + b.hs 6b + + ldr x8, [x4, #-8] + lsl w5, w5, #3 + lsr x8, x8, x5 + add w5, w6, #-48 + mvn x8, x8 + sub w4, w4, w3 // num_bytes_left + rev x8, x8 + lsr x8, x8, x5 + neg w5, w5 + lsr w5, w5, #3 + cmp w5, w4 + csel w5, w5, w4, lo // num_bytes_read + b 3b endfunc function msac_decode_bool_equi_neon, export=1 @@ -410,7 +422,6 @@ function msac_decode_bool_equi_neon, export=1 csel x7, x8, x7, hs // if (ret) dif = dif - vw; clz w5, w4 // clz(rng) - mvn x7, x7 
// ~dif eor w5, w5, #16 // d = clz(rng) ^ 16 b L(renorm2) endfunc @@ -431,7 +442,6 @@ function msac_decode_bool_neon, export=1 csel x7, x8, x7, hs // if (ret) dif = dif - vw; clz w5, w4 // clz(rng) - mvn x7, x7 // ~dif eor w5, w5, #16 // d = clz(rng) ^ 16 b L(renorm2) endfunc @@ -455,7 +465,6 @@ function msac_decode_bool_adapt_neon, export=1 ldr w10, [x0, #ALLOW_UPDATE_CDF] clz w5, w4 // clz(rng) - mvn x7, x7 // ~dif eor w5, w5, #16 // d = clz(rng) ^ 16 cbz w10, L(renorm2) diff --git a/third_party/dav1d/src/arm/64/util.S b/third_party/dav1d/src/arm/64/util.S index 9013fd4b1e..1b3f319ce5 100644 --- a/third_party/dav1d/src/arm/64/util.S +++ b/third_party/dav1d/src/arm/64/util.S @@ -32,6 +32,10 @@ #include "config.h" #include "src/arm/asm.S" +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + .macro movrel rd, val, offset=0 #if defined(__APPLE__) .if \offset < 0 @@ -51,6 +55,10 @@ adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) .endif +#elif __has_feature(hwaddress_sanitizer) + adrp \rd, :pg_hi21_nc:\val+(\offset) + movk \rd, #:prel_g3:\val+0x100000000 + add \rd, \rd, :lo12:\val+(\offset) #elif defined(PIC) adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) @@ -149,6 +157,35 @@ trn2 \r7\().2d, \t9\().2d, \r7\().2d .endm +.macro transpose_8x8h_mov r0, r1, r2, r3, r4, r5, r6, r7, t8, t9, o0, o1, o2, o3, o4, o5, o6, o7 + trn1 \t8\().8h, \r0\().8h, \r1\().8h + trn2 \t9\().8h, \r0\().8h, \r1\().8h + trn1 \r1\().8h, \r2\().8h, \r3\().8h + trn2 \r3\().8h, \r2\().8h, \r3\().8h + trn1 \r0\().8h, \r4\().8h, \r5\().8h + trn2 \r5\().8h, \r4\().8h, \r5\().8h + trn1 \r2\().8h, \r6\().8h, \r7\().8h + trn2 \r7\().8h, \r6\().8h, \r7\().8h + + trn1 \r4\().4s, \r0\().4s, \r2\().4s + trn2 \r2\().4s, \r0\().4s, \r2\().4s + trn1 \r6\().4s, \r5\().4s, \r7\().4s + trn2 \r7\().4s, \r5\().4s, \r7\().4s + trn1 \r5\().4s, \t9\().4s, \r3\().4s + trn2 \t9\().4s, \t9\().4s, \r3\().4s + trn1 \r3\().4s, \t8\().4s, \r1\().4s + trn2 \t8\().4s, \t8\().4s, \r1\().4s + + trn1 \o0\().2d, \r3\().2d, \r4\().2d + trn2 \o4\().2d, \r3\().2d, \r4\().2d + trn1 \o1\().2d, \r5\().2d, \r6\().2d + trn2 \o5\().2d, \r5\().2d, \r6\().2d + trn2 \o6\().2d, \t8\().2d, \r2\().2d + trn1 \o2\().2d, \t8\().2d, \r2\().2d + trn1 \o3\().2d, \t9\().2d, \r7\().2d + trn2 \o7\().2d, \t9\().2d, \r7\().2d +.endm + .macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 trn1 \t8\().16b, \r0\().16b, \r1\().16b trn2 \t9\().16b, \r0\().16b, \r1\().16b @@ -226,4 +263,16 @@ trn2 \r3\().4s, \t5\().4s, \t7\().4s .endm +.macro transpose_4x8h_mov r0, r1, r2, r3, t4, t5, t6, t7, o0, o1, o2, o3 + trn1 \t4\().8h, \r0\().8h, \r1\().8h + trn2 \t5\().8h, \r0\().8h, \r1\().8h + trn1 \t6\().8h, \r2\().8h, \r3\().8h + trn2 \t7\().8h, \r2\().8h, \r3\().8h + + trn1 \o0\().4s, \t4\().4s, \t6\().4s + trn2 \o2\().4s, \t4\().4s, \t6\().4s + trn1 \o1\().4s, \t5\().4s, \t7\().4s + trn2 \o3\().4s, \t5\().4s, \t7\().4s +.endm + #endif /* DAV1D_SRC_ARM_64_UTIL_S */ diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S index dc50415f1f..fed73b3048 100644 --- a/third_party/dav1d/src/arm/asm.S +++ b/third_party/dav1d/src/arm/asm.S @@ -34,6 +34,50 @@ #define x18 do_not_use_x18 #define w18 do_not_use_w18 +#if HAVE_AS_ARCH_DIRECTIVE + .arch AS_ARCH_LEVEL +#endif + +#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE +#define ENABLE_DOTPROD .arch_extension dotprod +#define DISABLE_DOTPROD .arch_extension nodotprod +#else +#define ENABLE_DOTPROD +#define DISABLE_DOTPROD +#endif +#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE +#define ENABLE_I8MM .arch_extension 
i8mm +#define DISABLE_I8MM .arch_extension noi8mm +#else +#define ENABLE_I8MM +#define DISABLE_I8MM +#endif +#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE +#define ENABLE_SVE .arch_extension sve +#define DISABLE_SVE .arch_extension nosve +#else +#define ENABLE_SVE +#define DISABLE_SVE +#endif +#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE +#define ENABLE_SVE2 .arch_extension sve2 +#define DISABLE_SVE2 .arch_extension nosve2 +#else +#define ENABLE_SVE2 +#define DISABLE_SVE2 +#endif + +/* If we do support the .arch_extension directives, disable support for all + * the extensions that we may use, in case they were implicitly enabled by + * the .arch level. This makes it clear if we try to assemble an instruction + * from an unintended extension set; we only allow assmbling such instructions + * within regions where we explicitly enable those extensions. */ +DISABLE_DOTPROD +DISABLE_I8MM +DISABLE_SVE +DISABLE_SVE2 + + /* Support macros for * - Armv8.3-A Pointer Authentication and * - Armv8.5-A Branch Target Identification diff --git a/third_party/dav1d/src/arm/cpu.c b/third_party/dav1d/src/arm/cpu.c index b7a0d3adbc..d9b1751a6a 100644 --- a/third_party/dav1d/src/arm/cpu.c +++ b/third_party/dav1d/src/arm/cpu.c @@ -31,22 +31,95 @@ #include "src/arm/cpu.h" -#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 -// NEON is always available; runtime tests are not needed. -#elif defined(HAVE_GETAUXVAL) && ARCH_ARM +#if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO) #include +#if ARCH_AARCH64 + +#define HWCAP_AARCH64_ASIMDDP (1 << 20) +#define HWCAP_AARCH64_SVE (1 << 22) +#define HWCAP2_AARCH64_SVE2 (1 << 1) +#define HWCAP2_AARCH64_I8MM (1 << 13) + +COLD unsigned dav1d_get_cpu_flags_arm(void) { +#ifdef HAVE_GETAUXVAL + unsigned long hw_cap = getauxval(AT_HWCAP); + unsigned long hw_cap2 = getauxval(AT_HWCAP2); +#else + unsigned long hw_cap = 0; + unsigned long hw_cap2 = 0; + elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)); + elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2)); +#endif + + unsigned flags = DAV1D_ARM_CPU_FLAG_NEON; + flags |= (hw_cap & HWCAP_AARCH64_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0; + flags |= (hw_cap2 & HWCAP2_AARCH64_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0; + flags |= (hw_cap & HWCAP_AARCH64_SVE) ? DAV1D_ARM_CPU_FLAG_SVE : 0; + flags |= (hw_cap2 & HWCAP2_AARCH64_SVE2) ? DAV1D_ARM_CPU_FLAG_SVE2 : 0; + return flags; +} +#else /* !ARCH_AARCH64 */ + #ifndef HWCAP_ARM_NEON -#define HWCAP_ARM_NEON (1 << 12) +#define HWCAP_ARM_NEON (1 << 12) #endif -#define NEON_HWCAP HWCAP_ARM_NEON +#define HWCAP_ARM_ASIMDDP (1 << 24) +#define HWCAP_ARM_I8MM (1 << 27) -#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM -#include +COLD unsigned dav1d_get_cpu_flags_arm(void) { +#ifdef HAVE_GETAUXVAL + unsigned long hw_cap = getauxval(AT_HWCAP); +#else + unsigned long hw_cap = 0; + elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)); +#endif + + unsigned flags = (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0; + flags |= (hw_cap & HWCAP_ARM_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0; + flags |= (hw_cap & HWCAP_ARM_I8MM) ? 
DAV1D_ARM_CPU_FLAG_I8MM : 0; + return flags; +} +#endif /* ARCH_AARCH64 */ + +#elif defined(__APPLE__) +#include + +static int have_feature(const char *feature) { + int supported = 0; + size_t size = sizeof(supported); + if (sysctlbyname(feature, &supported, &size, NULL, 0) != 0) { + return 0; + } + return supported; +} + +COLD unsigned dav1d_get_cpu_flags_arm(void) { + unsigned flags = DAV1D_ARM_CPU_FLAG_NEON; + if (have_feature("hw.optional.arm.FEAT_DotProd")) + flags |= DAV1D_ARM_CPU_FLAG_DOTPROD; + if (have_feature("hw.optional.arm.FEAT_I8MM")) + flags |= DAV1D_ARM_CPU_FLAG_I8MM; + /* No SVE and SVE2 feature detection available on Apple platforms. */ + return flags; +} + +#elif defined(_WIN32) +#include -#define NEON_HWCAP HWCAP_NEON +COLD unsigned dav1d_get_cpu_flags_arm(void) { + unsigned flags = DAV1D_ARM_CPU_FLAG_NEON; +#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE + if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) + flags |= DAV1D_ARM_CPU_FLAG_DOTPROD; +#endif + /* No I8MM or SVE feature detection available on Windows at the time of + * writing. */ + return flags; +} #elif defined(__ANDROID__) +#include #include #include @@ -58,18 +131,25 @@ static unsigned parse_proc_cpuinfo(const char *flag) { char line_buffer[120]; const char *line; + size_t flaglen = strlen(flag); while ((line = fgets(line_buffer, sizeof(line_buffer), file))) { - if (strstr(line, flag)) { - fclose(file); - return 1; + // check all occurances as whole words + const char *found = line; + while ((found = strstr(found, flag))) { + if ((found == line_buffer || !isgraph(found[-1])) && + (isspace(found[flaglen]) || feof(file))) { + fclose(file); + return 1; + } + found += flaglen; } // if line is incomplete seek back to avoid splitting the search // string into two buffers - if (!strchr(line, '\n') && strlen(line) > strlen(flag)) { + if (!strchr(line, '\n') && strlen(line) > flaglen) { // use fseek since the 64 bit fseeko is only available since // Android API level 24 and meson defines _FILE_OFFSET_BITS // by default 64 - if (fseek(file, -strlen(flag), SEEK_CUR)) + if (fseek(file, -flaglen, SEEK_CUR)) break; } } @@ -78,22 +158,23 @@ static unsigned parse_proc_cpuinfo(const char *flag) { return 0; } -#endif COLD unsigned dav1d_get_cpu_flags_arm(void) { - unsigned flags = 0; -#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 - flags |= DAV1D_ARM_CPU_FLAG_NEON; -#elif defined(HAVE_GETAUXVAL) && ARCH_ARM - unsigned long hw_cap = getauxval(AT_HWCAP); - flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0; -#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM - unsigned long hw_cap = 0; - elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)); - flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0; -#elif defined(__ANDROID__) - flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0; -#endif - + unsigned flags = parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0; + flags |= parse_proc_cpuinfo("asimd") ? DAV1D_ARM_CPU_FLAG_NEON : 0; + flags |= parse_proc_cpuinfo("asimddp") ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0; + flags |= parse_proc_cpuinfo("i8mm") ? DAV1D_ARM_CPU_FLAG_I8MM : 0; +#if ARCH_AARCH64 + flags |= parse_proc_cpuinfo("sve") ? DAV1D_ARM_CPU_FLAG_SVE : 0; + flags |= parse_proc_cpuinfo("sve2") ? 
DAV1D_ARM_CPU_FLAG_SVE2 : 0; +#endif /* ARCH_AARCH64 */ return flags; } + +#else /* Unsupported OS */ + +COLD unsigned dav1d_get_cpu_flags_arm(void) { + return 0; +} + +#endif diff --git a/third_party/dav1d/src/arm/cpu.h b/third_party/dav1d/src/arm/cpu.h index 8c10a1b6b0..de9bde6ccf 100644 --- a/third_party/dav1d/src/arm/cpu.h +++ b/third_party/dav1d/src/arm/cpu.h @@ -30,6 +30,10 @@ enum CpuFlags { DAV1D_ARM_CPU_FLAG_NEON = 1 << 0, + DAV1D_ARM_CPU_FLAG_DOTPROD = 1 << 1, + DAV1D_ARM_CPU_FLAG_I8MM = 1 << 2, + DAV1D_ARM_CPU_FLAG_SVE = 1 << 3, + DAV1D_ARM_CPU_FLAG_SVE2 = 1 << 4, }; unsigned dav1d_get_cpu_flags_arm(void); diff --git a/third_party/dav1d/src/arm/itx.h b/third_party/dav1d/src/arm/itx.h index 2ecd086b3b..17234e027a 100644 --- a/third_party/dav1d/src/arm/itx.h +++ b/third_party/dav1d/src/arm/itx.h @@ -117,9 +117,11 @@ static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + assign_itx_fn( , 4, 4, wht_wht, WHT_WHT, neon); + if (BITDEPTH == 16 && bpc != 10) return; - assign_itx17_fn( , 4, 4, neon); + assign_itx16_fn( , 4, 4, neon); assign_itx16_fn(R, 4, 8, neon); assign_itx16_fn(R, 4, 16, neon); assign_itx16_fn(R, 8, 4, neon); diff --git a/third_party/dav1d/src/arm/msac.h b/third_party/dav1d/src/arm/msac.h index 9db0bf86ae..6eee0da424 100644 --- a/third_party/dav1d/src/arm/msac.h +++ b/third_party/dav1d/src/arm/msac.h @@ -39,7 +39,7 @@ unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf); unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s); unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f); -#if ARCH_AARCH64 || defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 #define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon #define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon #define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon -- cgit v1.2.3
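
Note (not part of the patch): the new aarch64 Linux path in src/arm/cpu.c derives DAV1D_ARM_CPU_FLAG_DOTPROD/I8MM/SVE/SVE2 from AT_HWCAP/AT_HWCAP2 bits. Below is a minimal standalone sketch of that probing, reusing the HWCAP bit positions defined in the cpu.c hunk above; the program itself (its name, layout and output format) is purely illustrative and is not dav1d code. Built with any C compiler on an aarch64 Linux machine, it prints 0/1 per feature, which is handy for sanity-checking what the runtime detection in this patch would report.

    /* Illustrative only: mirrors the getauxval() path added to
     * dav1d_get_cpu_flags_arm() for aarch64 Linux. The HWCAP bit
     * positions are copied from the cpu.c hunk in this patch. */
    #include <stdio.h>
    #include <sys/auxv.h>

    #define HWCAP_AARCH64_ASIMDDP (1 << 20)
    #define HWCAP_AARCH64_SVE     (1 << 22)
    #define HWCAP2_AARCH64_SVE2   (1 << 1)
    #define HWCAP2_AARCH64_I8MM   (1 << 13)

    int main(void) {
        unsigned long hw_cap  = getauxval(AT_HWCAP);
        unsigned long hw_cap2 = getauxval(AT_HWCAP2);

        printf("dotprod: %d\n", !!(hw_cap  & HWCAP_AARCH64_ASIMDDP));
        printf("i8mm:    %d\n", !!(hw_cap2 & HWCAP2_AARCH64_I8MM));
        printf("sve:     %d\n", !!(hw_cap  & HWCAP_AARCH64_SVE));
        printf("sve2:    %d\n", !!(hw_cap2 & HWCAP2_AARCH64_SVE2));
        return 0;
    }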
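
Note (not part of the patch): the Android parse_proc_cpuinfo() rework above stops accepting bare strstr() hits and instead requires whole-word matches, so that probing for "sve" no longer fires on a /proc/cpuinfo line that only lists "sve2". A minimal sketch of that word-boundary test follows; the helper name is invented, and the real function additionally handles flags split across fgets() buffers. For example, with the line "Features : fp asimd asimddp sve2", probing "sve" returns 0 while probing "sve2" returns 1.

    /* Illustrative only: word-boundary matching in the spirit of the
     * updated parse_proc_cpuinfo(). The helper name is invented; the
     * end-of-word test here accepts whitespace or the terminating NUL,
     * whereas the patched code checks isspace()/feof() while streaming. */
    #include <ctype.h>
    #include <string.h>

    static int has_cpuinfo_word(const char *line, const char *flag) {
        const size_t flaglen = strlen(flag);
        const char *found = line;
        while ((found = strstr(found, flag))) {
            const int starts_word = (found == line) ||
                                    !isgraph((unsigned char)found[-1]);
            const int ends_word = !isgraph((unsigned char)found[flaglen]);
            if (starts_word && ends_word)
                return 1;
            found += flaglen;
        }
        return 0;
    }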