From d8bbc7858622b6d9c278469aab701ca0b609cddf Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 15 May 2024 05:35:49 +0200 Subject: Merging upstream version 126.0. Signed-off-by: Daniel Baumann --- third_party/dav1d/src/arm/32/itx.S | 79 +++++++++++------ third_party/dav1d/src/arm/32/itx16.S | 19 ++-- third_party/dav1d/src/arm/32/msac.S | 167 +++++++++++++++++++---------------- 3 files changed, 155 insertions(+), 110 deletions(-) (limited to 'third_party/dav1d/src/arm/32') diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S index ceea025e45..9ba1df7a68 100644 --- a/third_party/dav1d/src/arm/32/itx.S +++ b/third_party/dav1d/src/arm/32/itx.S @@ -965,6 +965,8 @@ function inv_txfm_\variant\()add_8x8_neon .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out + + b L(itx_8x8_epilog) .else blx r4 @@ -976,8 +978,8 @@ function inv_txfm_\variant\()add_8x8_neon vrshr.s16 q13, q13, #1 vrshr.s16 q14, q14, #1 vrshr.s16 q15, q15, #1 -.endif +L(itx_8x8_epilog): transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 blx r5 @@ -985,11 +987,12 @@ function inv_txfm_\variant\()add_8x8_neon load_add_store_8x8 r0, r7 vpop {q4-q7} pop {r4-r5,r7,pc} +.endif endfunc .endm -def_fn_8x8_base def_fn_8x8_base identity_ +def_fn_8x8_base .macro def_fn_8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 @@ -1444,14 +1447,16 @@ function inv_txfm_horz\suffix\()_16x4_neon .else identity_4x16_shift1 d0[0] .endif + b L(horz_16x4_epilog) .else blx r4 -.endif -.if \shift > 0 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vrshr.s16 \i, \i, #\shift .endr -.endif +.if \shift == 1 + b L(horz_16x4_epilog) +.else +L(horz_16x4_epilog): transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 @@ -1462,13 +1467,15 @@ function inv_txfm_horz\suffix\()_16x4_neon .endr pop {pc} +.endif +.endif endfunc .endm -def_horz_16 scale=0, identity=0, shift=2 -def_horz_16 scale=1, identity=0, shift=1, suffix=_scale -def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity +def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity +def_horz_16 scale=1, identity=0, shift=1, suffix=_scale +def_horz_16 scale=0, identity=0, shift=2 function inv_txfm_add_vert_4x16_neon push {lr} @@ -1597,6 +1604,8 @@ function inv_txfm_\variant\()add_16x4_neon .endr identity_4x16_shift1 d0[0] + + b L(itx_16x4_epilog) .else vmov.i16 q2, #0 vmov.i16 q3, #0 @@ -1615,30 +1624,25 @@ function inv_txfm_\variant\()add_16x4_neon vswp d19, d22 vswp d18, d20 vswp d19, d21 -.irp i, q8, q9, q10, q11 + vswp d25, d28 + vswp d27, d30 + vswp d26, d28 + vswp d27, d29 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 vrshr.s16 \i, \i, #1 .endr -.endif + +L(itx_16x4_epilog): transpose_4x8h q8, q9, q10, q11 blx r5 mov r6, r0 load_add_store_8x4 r6, r7 -.ifc \variant, identity_ vmov q8, q12 vmov q9, q13 vmov q10, q14 vmov q11, q15 -.else - vswp d25, d28 - vswp d27, d30 - vswp d26, d28 - vswp d27, d29 - vrshr.s16 q8, q12, #1 - vrshr.s16 q9, q13, #1 - vrshr.s16 q10, q14, #1 - vrshr.s16 q11, q15, #1 -.endif + transpose_4x8h q8, q9, q10, q11 blx r5 add r6, r0, #8 @@ -1646,6 +1650,7 @@ function inv_txfm_\variant\()add_16x4_neon vpop {q4-q7} pop {r4-r11,pc} +.endif endfunc function inv_txfm_\variant\()add_4x16_neon @@ -1696,12 +1701,14 @@ function inv_txfm_\variant\()add_4x16_neon movw r12, #(5793-4096)*8 vdup.16 d0, r12 identity_8x4_shift1 q8, q9, q10, q11, d0[0] + + b L(itx_4x16_epilog) .else blx r4 .irp i, q8, q9, q10, q11 vrshr.s16 \i, \i, #1 .endr -.endif +L(itx_4x16_epilog): transpose_4x8h q8, q9, q10, q11 vswp d19, d21 vswp d18, d20 @@ -1714,11 +1721,12 @@ function inv_txfm_\variant\()add_4x16_neon vpop {q4-q7} pop {r4-r11,pc} +.endif endfunc .endm -def_fn_416_base def_fn_416_base identity_ +def_fn_416_base .macro def_fn_416 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 @@ -1728,11 +1736,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} .if \w == 4 +.ifnc \txfm1, identity movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon +.endif movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon mov r10, #\eob_half .else +.ifnc \txfm1, identity movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon +.endif movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon .endif .ifc \txfm1, identity @@ -1765,8 +1777,7 @@ def_fn_416 \w, \h, identity, flipadst, 32 def_fns_416 4, 16 def_fns_416 16, 4 -.macro def_fn_816_base variant -function inv_txfm_\variant\()add_16x8_neon +function inv_txfm_add_16x8_neon sub_sp_align 256 .irp i, 0, 4 @@ -1805,6 +1816,7 @@ function inv_txfm_\variant\()add_16x8_neon pop {r4-r11,pc} endfunc +.macro def_fn_816_base variant function inv_txfm_\variant\()add_8x16_neon sub_sp_align 256 @@ -1849,6 +1861,10 @@ function inv_txfm_\variant\()add_8x16_neon .endr 2: +.ifc \variant, identity_ + b L(itx_8x16_epilog) +.else +L(itx_8x16_epilog): .irp i, 0, 4 add r6, r0, #(\i) add r7, sp, #(\i*2) @@ -1859,11 +1875,18 @@ function inv_txfm_\variant\()add_8x16_neon add_sp_align 256 vpop {q4-q7} pop {r4-r11,pc} +.endif endfunc .endm -def_fn_816_base def_fn_816_base identity_ +def_fn_816_base + +/* Define symbols used in .if statement */ +.equ dct, 1 +.equ identity, 2 +.equ adst, 3 +.equ flipadst, 4 .macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 @@ -1873,7 +1896,9 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} .if \w == 8 +.ifnc \txfm1, identity movrel_local r4, inv_\txfm1\()_8h_x8_neon +.endif movrel_local r5, inv_\txfm2\()_4h_x16_neon .else .ifc \txfm1, identity @@ -1889,7 +1914,7 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .else mov r10, #\eob_4x4 .endif -.ifc \txfm1, identity +.if \w == 8 && \txfm1 == identity b inv_txfm_identity_add_\w\()x\h\()_neon .else b inv_txfm_add_\w\()x\h\()_neon diff --git a/third_party/dav1d/src/arm/32/itx16.S b/third_party/dav1d/src/arm/32/itx16.S index aa6c272e71..7691272517 100644 --- a/third_party/dav1d/src/arm/32/itx16.S +++ b/third_party/dav1d/src/arm/32/itx16.S @@ -547,11 +547,11 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 vmov.i16 q15, #0 vld1.32 {q8, q9}, [r2, :128] vst1.32 {q14, q15}, [r2, :128]! - vshr.s16 q8, q8, #2 + vshr.s32 q8, q8, #2 vld1.32 {q10, q11}, [r2, :128] - vshr.s16 q9, q9, #2 - vshr.s16 q10, q10, #2 - vshr.s16 q11, q11, #2 + vshr.s32 q9, q9, #2 + vshr.s32 q10, q10, #2 + vshr.s32 q11, q11, #2 iwht4 @@ -598,7 +598,9 @@ function inv_txfm_add_4x4_neon vld1.16 {d3}, [r0, :64], r1 L(itx_4x4_end): - vmvn.i16 q15, #0xfc00 // 0x3ff + // read bitdepth_max from the callers stack + ldr r4, [sp, #44] + vdup.i16 q15, r4 sub r0, r0, r1, lsl #2 vqadd.s16 q8, q8, q0 vqadd.s16 q9, q9, q1 @@ -1487,6 +1489,10 @@ function inv_txfm_horz\suffix\()_16x2_neon vqrshrn.s32 d21, q13, #\shift vqrshrn.s32 d22, q14, #\shift vqrshrn.s32 d23, q15, #\shift +.if \scale + b L(horz_16x2_epilog) +.else +L(horz_16x2_epilog): vuzp.16 q8, q9 vuzp.16 q10, q11 @@ -1495,11 +1501,12 @@ function inv_txfm_horz\suffix\()_16x2_neon .endr pop {pc} +.endif endfunc .endm -def_horz_16 scale=0, shift=2 def_horz_16 scale=1, shift=1, suffix=_scale +def_horz_16 scale=0, shift=2 function inv_txfm_add_vert_4x16_neon push {lr} diff --git a/third_party/dav1d/src/arm/32/msac.S b/third_party/dav1d/src/arm/32/msac.S index b06e109dda..b16957fb7e 100644 --- a/third_party/dav1d/src/arm/32/msac.S +++ b/third_party/dav1d/src/arm/32/msac.S @@ -279,60 +279,67 @@ L(renorm): sub r4, r4, r3 // rng = u - v clz r5, r4 // clz(rng) eor r5, r5, #16 // d = clz(rng) ^ 16 - mvn r7, r7 // ~dif - add r7, r7, r3, lsl #16 // ~dif + (v << 16) + sub r7, r7, r3, lsl #16 // dif - (v << 16) L(renorm2): lsl r4, r4, r5 // rng << d subs r6, r6, r5 // cnt -= d - lsl r7, r7, r5 // (~dif + (v << 16)) << d + lsl r7, r7, r5 // (dif - (v << 16)) << d str r4, [r0, #RNG] - mvn r7, r7 // ~dif - bhs 9f + bhs 4f // refill ldr r3, [r0, #BUF_POS] // BUF_POS ldr r4, [r0, #BUF_END] // BUF_END add r5, r3, #4 - cmp r5, r4 - bgt 2f - - ldr r3, [r3] // next_bits - add r8, r6, #23 // shift_bits = cnt + 23 - add r6, r6, #16 // cnt += 16 - rev r3, r3 // next_bits = bswap(next_bits) - sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 - and r8, r8, #24 // shift_bits &= 24 - lsr r3, r3, r8 // next_bits >>= shift_bits - sub r8, r8, r6 // shift_bits -= 16 + cnt - str r5, [r0, #BUF_POS] - lsl r3, r3, r8 // next_bits <<= shift_bits - rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits - eor r7, r7, r3 // dif ^= next_bits - b 9f - -2: // refill_eob - rsb r5, r6, #8 // c = 8 - cnt -3: - cmp r3, r4 - bge 4f - ldrb r8, [r3], #1 - lsl r8, r8, r5 - eor r7, r7, r8 - subs r5, r5, #8 - bge 3b - -4: // refill_eob_end + subs r5, r5, r4 + bhi 6f + + ldr r8, [r3] // next_bits + rsb r5, r6, #16 + add r4, r6, #16 // shift_bits = cnt + 16 + mvn r8, r8 + lsr r5, r5, #3 // num_bytes_read + rev r8, r8 // next_bits = bswap(next_bits) + lsr r8, r8, r4 // next_bits >>= shift_bits + +2: // refill_end + add r3, r3, r5 + add r6, r6, r5, lsl #3 // cnt += num_bits_read str r3, [r0, #BUF_POS] - rsb r6, r5, #8 // cnt = 8 - c -9: +3: // refill_end2 + orr r7, r7, r8 // dif |= next_bits + +4: // end str r6, [r0, #CNT] str r7, [r0, #DIF] - mov r0, lr add sp, sp, #48 - pop {r4-r10,pc} + +5: // pad_with_ones + add r8, r6, #-240 + lsr r8, r8, r8 + b 3b + +6: // refill_eob + cmp r3, r4 + bhs 5b + + ldr r8, [r4, #-4] + lsl r5, r5, #3 + lsr r8, r8, r5 + add r5, r6, #16 + mvn r8, r8 + sub r4, r4, r3 // num_bytes_left + rev r8, r8 + lsr r8, r8, r5 + rsb r5, r6, #16 + lsr r5, r5, #3 + cmp r5, r4 + it hs + movhs r5, r4 + b 2b endfunc function msac_decode_symbol_adapt8_neon, export=1 @@ -414,53 +421,38 @@ function msac_decode_hi_tok_neon, export=1 sub r4, r4, r3 // rng = u - v clz r5, r4 // clz(rng) eor r5, r5, #16 // d = clz(rng) ^ 16 - mvn r7, r7 // ~dif - add r7, r7, r3, lsl #16 // ~dif + (v << 16) + sub r7, r7, r3, lsl #16 // dif - (v << 16) lsl r4, r4, r5 // rng << d subs r6, r6, r5 // cnt -= d - lsl r7, r7, r5 // (~dif + (v << 16)) << d + lsl r7, r7, r5 // (dif - (v << 16)) << d str r4, [r0, #RNG] vdup.16 d1, r4 - mvn r7, r7 // ~dif - bhs 9f + bhs 5f // refill ldr r3, [r0, #BUF_POS] // BUF_POS ldr r4, [r0, #BUF_END] // BUF_END add r5, r3, #4 - cmp r5, r4 - bgt 2f - - ldr r3, [r3] // next_bits - add r8, r6, #23 // shift_bits = cnt + 23 - add r6, r6, #16 // cnt += 16 - rev r3, r3 // next_bits = bswap(next_bits) - sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 - and r8, r8, #24 // shift_bits &= 24 - lsr r3, r3, r8 // next_bits >>= shift_bits - sub r8, r8, r6 // shift_bits -= 16 + cnt - str r5, [r0, #BUF_POS] - lsl r3, r3, r8 // next_bits <<= shift_bits - rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits - eor r7, r7, r3 // dif ^= next_bits - b 9f - -2: // refill_eob - rsb r5, r6, #8 // c = 40 - cnt -3: - cmp r3, r4 - bge 4f - ldrb r8, [r3], #1 - lsl r8, r8, r5 - eor r7, r7, r8 - subs r5, r5, #8 - bge 3b - -4: // refill_eob_end + subs r5, r5, r4 + bhi 7f + + ldr r8, [r3] // next_bits + rsb r5, r6, #16 + add r4, r6, #16 // shift_bits = cnt + 16 + mvn r8, r8 + lsr r5, r5, #3 // num_bytes_read + rev r8, r8 // next_bits = bswap(next_bits) + lsr r8, r8, r4 // next_bits >>= shift_bits + +3: // refill_end + add r3, r3, r5 + add r6, r6, r5, lsl #3 // cnt += num_bits_read str r3, [r0, #BUF_POS] - rsb r6, r5, #8 // cnt = 40 - c -9: +4: // refill_end2 + orr r7, r7, r8 // dif |= next_bits + +5: // end lsl lr, lr, #1 sub lr, lr, #5 lsr r12, r7, #16 @@ -473,6 +465,30 @@ function msac_decode_hi_tok_neon, export=1 str r7, [r0, #DIF] lsr r0, r2, #1 pop {r4-r10,pc} + +6: // pad_with_ones + add r8, r6, #-240 + lsr r8, r8, r8 + b 4b + +7: // refill_eob + cmp r3, r4 + bhs 6b + + ldr r8, [r4, #-4] + lsl r5, r5, #3 + lsr r8, r8, r5 + add r5, r6, #16 + mvn r8, r8 + sub r4, r4, r3 // num_bytes_left + rev r8, r8 + lsr r8, r8, r5 + rsb r5, r6, #16 + lsr r5, r5, #3 + cmp r5, r4 + it hs + movhs r5, r4 + b 3b endfunc function msac_decode_bool_equi_neon, export=1 @@ -493,7 +509,6 @@ function msac_decode_bool_equi_neon, export=1 movhs r7, r8 // if (ret) dif = dif - vw; clz r5, r4 // clz(rng) - mvn r7, r7 // ~dif eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 b L(renorm2) @@ -519,7 +534,6 @@ function msac_decode_bool_neon, export=1 movhs r7, r8 // if (ret) dif = dif - vw; clz r5, r4 // clz(rng) - mvn r7, r7 // ~dif eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 b L(renorm2) @@ -549,7 +563,6 @@ function msac_decode_bool_adapt_neon, export=1 cmp r10, #0 clz r5, r4 // clz(rng) - mvn r7, r7 // ~dif eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 -- cgit v1.2.3