Diffstat (limited to 'third_party/dav1d')
44 files changed, 2867 insertions, 1293 deletions
diff --git a/third_party/dav1d/NEWS b/third_party/dav1d/NEWS index 3645474a04..88b1eea00e 100644 --- a/third_party/dav1d/NEWS +++ b/third_party/dav1d/NEWS @@ -1,3 +1,15 @@ +Changes for 1.4.1 'Road Runner': +-------------------------------- + +1.4.1 is a small release of dav1d, improving notably ARM and RISC-V speed + +- Optimizations for 6tap filters for NEON (ARM) +- More RISC-V optimizations for itx (4x8, 8x4, 4x16, 16x4, 8x16, 16x8) +- Reduction of binary size on ARM64, ARM32 and RISC-V +- Fix out-of-bounds read in 8bpc SSE2/SSSE3 wiener_filter +- Msac optimizations + + Changes for 1.4.0 'Road Runner': -------------------------------- @@ -26,7 +38,7 @@ Changes for 1.3.0 'Tundra Peregrine Falcon (Calidus)': Changes for 1.2.1 'Arctic Peregrine Falcon': -------------------------------------------- +-------------------------------------------- 1.2.1 is a small release of dav1d, adding more SIMD and fixes @@ -42,7 +54,7 @@ Changes for 1.2.1 'Arctic Peregrine Falcon': Changes for 1.2.0 'Arctic Peregrine Falcon': -------------------------------------------- +-------------------------------------------- 1.2.0 is a small release of dav1d, adding more SIMD and fixes @@ -55,7 +67,7 @@ Changes for 1.2.0 'Arctic Peregrine Falcon': Changes for 1.1.0 'Arctic Peregrine Falcon': -------------------------------------------- +-------------------------------------------- 1.1.0 is an important release of dav1d, fixing numerous bugs, and adding SIMD diff --git a/third_party/dav1d/THANKS.md b/third_party/dav1d/THANKS.md index 4fc8d27f14..b7aa200d0e 100644 --- a/third_party/dav1d/THANKS.md +++ b/third_party/dav1d/THANKS.md @@ -16,19 +16,20 @@ The Alliance for Open Media (AOM) for partially funding this project. And all the dav1d Authors (git shortlog -sn), including: -Martin Storsjö, Henrik Gramner, Ronald S. Bultje, Janne Grunau, James Almer, -Victorien Le Couviour--Tuffet, Matthias Dressel, Marvin Scholz, -Jean-Baptiste Kempf, Luc Trudeau, Hugo Beauzée-Luyssen, Konstantin Pavlov, -Niklas Haas, David Michael Barr, Steve Lhomme, Nathan E. Egge, Wan-Teh Chang, -Kyle Siefring, B Krishnan Iyer, Francois Cartegnie, Liwei Wang, Luca Barbato, -David Conrad, Derek Buitenhuis, Jan Beich, Michael Bradshaw, Raphaël Zumer, -Xuefeng Jiang, Christophe Gisquet, Justin Bull, Boyuan Xiao, Dale Curtis, -Emmanuel Gil Peyrot, Raphael Zumer, Rupert Swarbrick, Thierry Foucu, -Thomas Daede, Colin Lee, Jonathan Wright, Lynne, Michail Alvanos, Nico Weber, -Salome Thirot, SmilingWolf, Tristan Laurent, Vittorio Giovara, Yannis Guyon, -André Kempe, Anisse Astier, Anton Mitrofanov, Charlie Hayden, Dmitriy Sychov, -Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, Joe Drago, Mark Shuttleworth, -Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli, Pablo Stebler, Rostislav -Pehlivanov, Sebastian Dröge, Shiz, Steinar Midtskogen, Sylvain BERTRAND, -Sylvestre Ledru, Timo Gurr, Tristan Matthews, Vibhoothi, Xavier Claessens, -Xu Guangxin, kossh1 and skal. +Henrik Gramner, Martin Storsjö, Ronald S. Bultje, Janne Grunau, James Almer, +Victorien Le Couviour--Tuffet, Matthias Dressel, Nathan E. 
Egge, +Jean-Baptiste Kempf, Marvin Scholz, Luc Trudeau, Niklas Haas, +Hugo Beauzée-Luyssen, Konstantin Pavlov, David Michael Barr, Steve Lhomme, +yuanhecai, Luca Barbato, Wan-Teh Chang, Kyle Siefring, B Krishnan Iyer, +Francois Cartegnie, Liwei Wang, David Conrad, Derek Buitenhuis, Jan Beich, +Michael Bradshaw, Raphaël Zumer, Xuefeng Jiang, Arpad Panyik, Christophe Gisquet, +Justin Bull, Boyuan Xiao, Dale Curtis, Emmanuel Gil Peyrot, Raphael Zumer, +Rupert Swarbrick, Thierry Foucu, Thomas Daede, jinbo, André Kempe, Colin Lee, +Jonathan Wright, Lynne, Michail Alvanos, Nico Weber, Salome Thirot, SmilingWolf, +Tristan Laurent, Tristan Matthews, Vittorio Giovara, Yannis Guyon, +Andrey Semashev, Anisse Astier, Anton Mitrofanov, Charlie Hayden, Dmitriy Sychov, +Ewout ter Hoeven, Fred Barbier, Hao Chen, Jean-Yves Avenard, Joe Drago, +Mark Shuttleworth, Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli, +Pablo Stebler, Rostislav Pehlivanov, Sebastian Dröge, Shiz, Steinar Midtskogen, +Sylvain BERTRAND, Sylvestre Ledru, Timo Gurr, Vibhoothi, +Vignesh Venkatasubramanian, Xavier Claessens, Xu Guangxin, kossh1 and skal. diff --git a/third_party/dav1d/gcovr.cfg b/third_party/dav1d/gcovr.cfg index d09a0ecab5..e02ae33c33 100644 --- a/third_party/dav1d/gcovr.cfg +++ b/third_party/dav1d/gcovr.cfg @@ -1,4 +1,4 @@ exclude = .*/tests/.* exclude = .*/tools/.* exclude = .*/include/common/dump.h -gcov-ignore-parse-errors = yes +gcov-ignore-parse-errors = negative_hits.warn diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build index 6e49852103..e371415d53 100644 --- a/third_party/dav1d/meson.build +++ b/third_party/dav1d/meson.build @@ -23,7 +23,7 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. project('dav1d', ['c'], - version: '1.4.0', + version: '1.4.1', default_options: ['c_std=c99', 'warning_level=2', 'buildtype=release', @@ -309,6 +309,10 @@ if (host_machine.system() in ['darwin', 'ios', 'tvos'] and cc.get_id() == 'clang optional_arguments += '-fno-stack-check' endif +if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm')) + optional_arguments += '-fno-align-functions' +endif + add_project_arguments(cc.get_supported_arguments(optional_arguments), language : 'c') add_project_link_arguments(cc.get_supported_link_arguments(optional_link_arguments), language : 'c') @@ -365,6 +369,66 @@ if (is_asm_enabled and if cc.compiles(check_pic_code) cdata.set('PIC', '3') endif + + if host_machine.cpu_family() == 'aarch64' + have_as_arch = cc.compiles('''__asm__ (".arch armv8-a");''') + cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch) + as_arch_str = '' + if have_as_arch + as_arch_level = 'armv8-a' + # Check what .arch levels are supported. In principle, we only + # want to detect up to armv8.2-a here (binutils requires that + # in order to enable i8mm). However, older Clang versions + # (before Clang 17, and Xcode versions up to and including 15.0) + # didn't support controlling dotprod/i8mm extensions via + # .arch_extension, therefore try to enable a high enough .arch + # level as well, to implicitly make them available via that. 
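(Aside: each cc.compiles() probe in this hunk simply hands the C compiler a tiny translation unit made of file-scope inline asm and checks whether it assembles; the per-extension checks further down in the hunk, driven by the extensions table, work the same way. A minimal standalone sketch of what such a probe amounts to follows; the file name, build command and the combined dotprod check are illustrative only, while the directives and the udot instruction are the ones used by the checks themselves.)

/* probe.c - rough standalone equivalent of the cc.compiles() checks above.
 * Build (cross) with e.g.:  aarch64-linux-gnu-gcc -c probe.c
 * A snippet that fails to assemble just means meson settles for a lower
 * .arch level or records the matching HAVE_* config value as 0. */

/* Does the assembler accept the .arch directive at all? */
__asm__ (".arch armv8-a\n");

/* Does it accept a higher .arch level? (Needed so Clang < 17 and older
 * Xcode, which lack a usable .arch_extension for dotprod/i8mm, can still
 * have those instructions enabled implicitly via the base level.) */
__asm__ (".arch armv8.2-a\n");

/* Can a representative dotprod instruction be assembled under that level? */
__asm__ (".arch armv8.2-a\n"
         ".arch_extension dotprod\n"
         "udot v0.4s, v0.16b, v0.16b\n");

int main(void) { return 0; }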
+ foreach arch : ['armv8.2-a', 'armv8.4-a', 'armv8.6-a'] + if cc.compiles('__asm__ (".arch ' + arch + '\\n");') + as_arch_level = arch + endif + endforeach + # Clang versions before 17 also had a bug + # (https://github.com/llvm/llvm-project/issues/32220) + # causing a plain ".arch <level>" to not have any effect unless it + # had an extra "+<feature>" included - but it was activated on the + # next ".arch_extension" directive instead. Check if we can include + # "+crc" as dummy feature to make the .arch directive behave as + # expected and take effect right away. + if cc.compiles('__asm__ (".arch ' + as_arch_level + '+crc\\n");') + as_arch_level = as_arch_level + '+crc' + endif + cdata.set('AS_ARCH_LEVEL', as_arch_level) + as_arch_str = '".arch ' + as_arch_level + '\\n"' + endif + extensions = { + 'dotprod': 'udot v0.4s, v0.16b, v0.16b', + 'i8mm': 'usdot v0.4s, v0.16b, v0.16b', + 'sve': 'whilelt p0.s, x0, x1', + 'sve2': 'sqrdmulh z0.s, z0.s, z0.s', + } + foreach name, instr : extensions + # Test for support for the various extensions. First test if + # the assembler supports the .arch_extension directive for + # enabling/disabling the extension, then separately check whether + # the instructions themselves are supported. Even if .arch_extension + # isn't supported, we may be able to assemble the instructions + # if the .arch level includes support for them. + code = '__asm__ (' + as_arch_str + code += '".arch_extension ' + name + '\\n"' + code += ');' + supports_archext = cc.compiles(code) + cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', supports_archext) + code = '__asm__ (' + as_arch_str + if supports_archext + code += '".arch_extension ' + name + '\\n"' + endif + code += '"' + instr + '\\n"' + code += ');' + supports_instr = cc.compiles(code, name: name.to_upper()) + cdata.set10('HAVE_' + name.to_upper(), supports_instr) + endforeach + endif endif cdata.set10('ARCH_X86', host_machine.cpu_family().startswith('x86')) @@ -477,6 +541,17 @@ if (is_asm_enabled and ]) endif +if is_asm_enabled and host_machine.cpu_family().startswith('riscv') + as_option_code = '''__asm__ ( +".option arch, +v\n" +"vsetivli zero, 0, e8, m1, ta, ma" +); +''' + if not cc.compiles(as_option_code, name : 'RISC-V Vector') + error('Compiler doesn\'t support \'.option arch\' asm directive. 
Update to binutils>=2.38 or clang>=17 or use \'-Denable_asm=false\'.') + endif +endif + # Generate config.h config_h_target = configure_file(output: 'config.h', configuration: cdata) diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S index ceea025e45..9ba1df7a68 100644 --- a/third_party/dav1d/src/arm/32/itx.S +++ b/third_party/dav1d/src/arm/32/itx.S @@ -965,6 +965,8 @@ function inv_txfm_\variant\()add_8x8_neon .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out + + b L(itx_8x8_epilog) .else blx r4 @@ -976,8 +978,8 @@ function inv_txfm_\variant\()add_8x8_neon vrshr.s16 q13, q13, #1 vrshr.s16 q14, q14, #1 vrshr.s16 q15, q15, #1 -.endif +L(itx_8x8_epilog): transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 blx r5 @@ -985,11 +987,12 @@ function inv_txfm_\variant\()add_8x8_neon load_add_store_8x8 r0, r7 vpop {q4-q7} pop {r4-r5,r7,pc} +.endif endfunc .endm -def_fn_8x8_base def_fn_8x8_base identity_ +def_fn_8x8_base .macro def_fn_8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 @@ -1444,14 +1447,16 @@ function inv_txfm_horz\suffix\()_16x4_neon .else identity_4x16_shift1 d0[0] .endif + b L(horz_16x4_epilog) .else blx r4 -.endif -.if \shift > 0 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vrshr.s16 \i, \i, #\shift .endr -.endif +.if \shift == 1 + b L(horz_16x4_epilog) +.else +L(horz_16x4_epilog): transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 @@ -1462,13 +1467,15 @@ function inv_txfm_horz\suffix\()_16x4_neon .endr pop {pc} +.endif +.endif endfunc .endm -def_horz_16 scale=0, identity=0, shift=2 -def_horz_16 scale=1, identity=0, shift=1, suffix=_scale -def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity +def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity +def_horz_16 scale=1, identity=0, shift=1, suffix=_scale +def_horz_16 scale=0, identity=0, shift=2 function inv_txfm_add_vert_4x16_neon push {lr} @@ -1597,6 +1604,8 @@ function inv_txfm_\variant\()add_16x4_neon .endr identity_4x16_shift1 d0[0] + + b L(itx_16x4_epilog) .else vmov.i16 q2, #0 vmov.i16 q3, #0 @@ -1615,30 +1624,25 @@ function inv_txfm_\variant\()add_16x4_neon vswp d19, d22 vswp d18, d20 vswp d19, d21 -.irp i, q8, q9, q10, q11 + vswp d25, d28 + vswp d27, d30 + vswp d26, d28 + vswp d27, d29 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 vrshr.s16 \i, \i, #1 .endr -.endif + +L(itx_16x4_epilog): transpose_4x8h q8, q9, q10, q11 blx r5 mov r6, r0 load_add_store_8x4 r6, r7 -.ifc \variant, identity_ vmov q8, q12 vmov q9, q13 vmov q10, q14 vmov q11, q15 -.else - vswp d25, d28 - vswp d27, d30 - vswp d26, d28 - vswp d27, d29 - vrshr.s16 q8, q12, #1 - vrshr.s16 q9, q13, #1 - vrshr.s16 q10, q14, #1 - vrshr.s16 q11, q15, #1 -.endif + transpose_4x8h q8, q9, q10, q11 blx r5 add r6, r0, #8 @@ -1646,6 +1650,7 @@ function inv_txfm_\variant\()add_16x4_neon vpop {q4-q7} pop {r4-r11,pc} +.endif endfunc function inv_txfm_\variant\()add_4x16_neon @@ -1696,12 +1701,14 @@ function inv_txfm_\variant\()add_4x16_neon movw r12, #(5793-4096)*8 vdup.16 d0, r12 identity_8x4_shift1 q8, q9, q10, q11, d0[0] + + b L(itx_4x16_epilog) .else blx r4 .irp i, q8, q9, q10, q11 vrshr.s16 \i, \i, #1 .endr -.endif +L(itx_4x16_epilog): transpose_4x8h q8, q9, q10, q11 vswp d19, d21 vswp d18, d20 @@ -1714,11 +1721,12 @@ function inv_txfm_\variant\()add_4x16_neon vpop {q4-q7} pop 
{r4-r11,pc} +.endif endfunc .endm -def_fn_416_base def_fn_416_base identity_ +def_fn_416_base .macro def_fn_416 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 @@ -1728,11 +1736,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} .if \w == 4 +.ifnc \txfm1, identity movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon +.endif movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon mov r10, #\eob_half .else +.ifnc \txfm1, identity movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon +.endif movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon .endif .ifc \txfm1, identity @@ -1765,8 +1777,7 @@ def_fn_416 \w, \h, identity, flipadst, 32 def_fns_416 4, 16 def_fns_416 16, 4 -.macro def_fn_816_base variant -function inv_txfm_\variant\()add_16x8_neon +function inv_txfm_add_16x8_neon sub_sp_align 256 .irp i, 0, 4 @@ -1805,6 +1816,7 @@ function inv_txfm_\variant\()add_16x8_neon pop {r4-r11,pc} endfunc +.macro def_fn_816_base variant function inv_txfm_\variant\()add_8x16_neon sub_sp_align 256 @@ -1849,6 +1861,10 @@ function inv_txfm_\variant\()add_8x16_neon .endr 2: +.ifc \variant, identity_ + b L(itx_8x16_epilog) +.else +L(itx_8x16_epilog): .irp i, 0, 4 add r6, r0, #(\i) add r7, sp, #(\i*2) @@ -1859,11 +1875,18 @@ function inv_txfm_\variant\()add_8x16_neon add_sp_align 256 vpop {q4-q7} pop {r4-r11,pc} +.endif endfunc .endm -def_fn_816_base def_fn_816_base identity_ +def_fn_816_base + +/* Define symbols used in .if statement */ +.equ dct, 1 +.equ identity, 2 +.equ adst, 3 +.equ flipadst, 4 .macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 @@ -1873,7 +1896,9 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} .if \w == 8 +.ifnc \txfm1, identity movrel_local r4, inv_\txfm1\()_8h_x8_neon +.endif movrel_local r5, inv_\txfm2\()_4h_x16_neon .else .ifc \txfm1, identity @@ -1889,7 +1914,7 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .else mov r10, #\eob_4x4 .endif -.ifc \txfm1, identity +.if \w == 8 && \txfm1 == identity b inv_txfm_identity_add_\w\()x\h\()_neon .else b inv_txfm_add_\w\()x\h\()_neon diff --git a/third_party/dav1d/src/arm/32/itx16.S b/third_party/dav1d/src/arm/32/itx16.S index aa6c272e71..7691272517 100644 --- a/third_party/dav1d/src/arm/32/itx16.S +++ b/third_party/dav1d/src/arm/32/itx16.S @@ -547,11 +547,11 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 vmov.i16 q15, #0 vld1.32 {q8, q9}, [r2, :128] vst1.32 {q14, q15}, [r2, :128]! 
- vshr.s16 q8, q8, #2 + vshr.s32 q8, q8, #2 vld1.32 {q10, q11}, [r2, :128] - vshr.s16 q9, q9, #2 - vshr.s16 q10, q10, #2 - vshr.s16 q11, q11, #2 + vshr.s32 q9, q9, #2 + vshr.s32 q10, q10, #2 + vshr.s32 q11, q11, #2 iwht4 @@ -598,7 +598,9 @@ function inv_txfm_add_4x4_neon vld1.16 {d3}, [r0, :64], r1 L(itx_4x4_end): - vmvn.i16 q15, #0xfc00 // 0x3ff + // read bitdepth_max from the callers stack + ldr r4, [sp, #44] + vdup.i16 q15, r4 sub r0, r0, r1, lsl #2 vqadd.s16 q8, q8, q0 vqadd.s16 q9, q9, q1 @@ -1487,6 +1489,10 @@ function inv_txfm_horz\suffix\()_16x2_neon vqrshrn.s32 d21, q13, #\shift vqrshrn.s32 d22, q14, #\shift vqrshrn.s32 d23, q15, #\shift +.if \scale + b L(horz_16x2_epilog) +.else +L(horz_16x2_epilog): vuzp.16 q8, q9 vuzp.16 q10, q11 @@ -1495,11 +1501,12 @@ function inv_txfm_horz\suffix\()_16x2_neon .endr pop {pc} +.endif endfunc .endm -def_horz_16 scale=0, shift=2 def_horz_16 scale=1, shift=1, suffix=_scale +def_horz_16 scale=0, shift=2 function inv_txfm_add_vert_4x16_neon push {lr} diff --git a/third_party/dav1d/src/arm/32/msac.S b/third_party/dav1d/src/arm/32/msac.S index b06e109dda..b16957fb7e 100644 --- a/third_party/dav1d/src/arm/32/msac.S +++ b/third_party/dav1d/src/arm/32/msac.S @@ -279,60 +279,67 @@ L(renorm): sub r4, r4, r3 // rng = u - v clz r5, r4 // clz(rng) eor r5, r5, #16 // d = clz(rng) ^ 16 - mvn r7, r7 // ~dif - add r7, r7, r3, lsl #16 // ~dif + (v << 16) + sub r7, r7, r3, lsl #16 // dif - (v << 16) L(renorm2): lsl r4, r4, r5 // rng << d subs r6, r6, r5 // cnt -= d - lsl r7, r7, r5 // (~dif + (v << 16)) << d + lsl r7, r7, r5 // (dif - (v << 16)) << d str r4, [r0, #RNG] - mvn r7, r7 // ~dif - bhs 9f + bhs 4f // refill ldr r3, [r0, #BUF_POS] // BUF_POS ldr r4, [r0, #BUF_END] // BUF_END add r5, r3, #4 - cmp r5, r4 - bgt 2f - - ldr r3, [r3] // next_bits - add r8, r6, #23 // shift_bits = cnt + 23 - add r6, r6, #16 // cnt += 16 - rev r3, r3 // next_bits = bswap(next_bits) - sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 - and r8, r8, #24 // shift_bits &= 24 - lsr r3, r3, r8 // next_bits >>= shift_bits - sub r8, r8, r6 // shift_bits -= 16 + cnt - str r5, [r0, #BUF_POS] - lsl r3, r3, r8 // next_bits <<= shift_bits - rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits - eor r7, r7, r3 // dif ^= next_bits - b 9f - -2: // refill_eob - rsb r5, r6, #8 // c = 8 - cnt -3: - cmp r3, r4 - bge 4f - ldrb r8, [r3], #1 - lsl r8, r8, r5 - eor r7, r7, r8 - subs r5, r5, #8 - bge 3b - -4: // refill_eob_end + subs r5, r5, r4 + bhi 6f + + ldr r8, [r3] // next_bits + rsb r5, r6, #16 + add r4, r6, #16 // shift_bits = cnt + 16 + mvn r8, r8 + lsr r5, r5, #3 // num_bytes_read + rev r8, r8 // next_bits = bswap(next_bits) + lsr r8, r8, r4 // next_bits >>= shift_bits + +2: // refill_end + add r3, r3, r5 + add r6, r6, r5, lsl #3 // cnt += num_bits_read str r3, [r0, #BUF_POS] - rsb r6, r5, #8 // cnt = 8 - c -9: +3: // refill_end2 + orr r7, r7, r8 // dif |= next_bits + +4: // end str r6, [r0, #CNT] str r7, [r0, #DIF] - mov r0, lr add sp, sp, #48 - pop {r4-r10,pc} + +5: // pad_with_ones + add r8, r6, #-240 + lsr r8, r8, r8 + b 3b + +6: // refill_eob + cmp r3, r4 + bhs 5b + + ldr r8, [r4, #-4] + lsl r5, r5, #3 + lsr r8, r8, r5 + add r5, r6, #16 + mvn r8, r8 + sub r4, r4, r3 // num_bytes_left + rev r8, r8 + lsr r8, r8, r5 + rsb r5, r6, #16 + lsr r5, r5, #3 + cmp r5, r4 + it hs + movhs r5, r4 + b 2b endfunc function msac_decode_symbol_adapt8_neon, export=1 @@ -414,53 +421,38 @@ function msac_decode_hi_tok_neon, export=1 sub r4, r4, r3 // rng = u - v clz r5, r4 // clz(rng) eor r5, r5, #16 // d = clz(rng) 
^ 16 - mvn r7, r7 // ~dif - add r7, r7, r3, lsl #16 // ~dif + (v << 16) + sub r7, r7, r3, lsl #16 // dif - (v << 16) lsl r4, r4, r5 // rng << d subs r6, r6, r5 // cnt -= d - lsl r7, r7, r5 // (~dif + (v << 16)) << d + lsl r7, r7, r5 // (dif - (v << 16)) << d str r4, [r0, #RNG] vdup.16 d1, r4 - mvn r7, r7 // ~dif - bhs 9f + bhs 5f // refill ldr r3, [r0, #BUF_POS] // BUF_POS ldr r4, [r0, #BUF_END] // BUF_END add r5, r3, #4 - cmp r5, r4 - bgt 2f - - ldr r3, [r3] // next_bits - add r8, r6, #23 // shift_bits = cnt + 23 - add r6, r6, #16 // cnt += 16 - rev r3, r3 // next_bits = bswap(next_bits) - sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 - and r8, r8, #24 // shift_bits &= 24 - lsr r3, r3, r8 // next_bits >>= shift_bits - sub r8, r8, r6 // shift_bits -= 16 + cnt - str r5, [r0, #BUF_POS] - lsl r3, r3, r8 // next_bits <<= shift_bits - rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits - eor r7, r7, r3 // dif ^= next_bits - b 9f - -2: // refill_eob - rsb r5, r6, #8 // c = 40 - cnt -3: - cmp r3, r4 - bge 4f - ldrb r8, [r3], #1 - lsl r8, r8, r5 - eor r7, r7, r8 - subs r5, r5, #8 - bge 3b - -4: // refill_eob_end + subs r5, r5, r4 + bhi 7f + + ldr r8, [r3] // next_bits + rsb r5, r6, #16 + add r4, r6, #16 // shift_bits = cnt + 16 + mvn r8, r8 + lsr r5, r5, #3 // num_bytes_read + rev r8, r8 // next_bits = bswap(next_bits) + lsr r8, r8, r4 // next_bits >>= shift_bits + +3: // refill_end + add r3, r3, r5 + add r6, r6, r5, lsl #3 // cnt += num_bits_read str r3, [r0, #BUF_POS] - rsb r6, r5, #8 // cnt = 40 - c -9: +4: // refill_end2 + orr r7, r7, r8 // dif |= next_bits + +5: // end lsl lr, lr, #1 sub lr, lr, #5 lsr r12, r7, #16 @@ -473,6 +465,30 @@ function msac_decode_hi_tok_neon, export=1 str r7, [r0, #DIF] lsr r0, r2, #1 pop {r4-r10,pc} + +6: // pad_with_ones + add r8, r6, #-240 + lsr r8, r8, r8 + b 4b + +7: // refill_eob + cmp r3, r4 + bhs 6b + + ldr r8, [r4, #-4] + lsl r5, r5, #3 + lsr r8, r8, r5 + add r5, r6, #16 + mvn r8, r8 + sub r4, r4, r3 // num_bytes_left + rev r8, r8 + lsr r8, r8, r5 + rsb r5, r6, #16 + lsr r5, r5, #3 + cmp r5, r4 + it hs + movhs r5, r4 + b 3b endfunc function msac_decode_bool_equi_neon, export=1 @@ -493,7 +509,6 @@ function msac_decode_bool_equi_neon, export=1 movhs r7, r8 // if (ret) dif = dif - vw; clz r5, r4 // clz(rng) - mvn r7, r7 // ~dif eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 b L(renorm2) @@ -519,7 +534,6 @@ function msac_decode_bool_neon, export=1 movhs r7, r8 // if (ret) dif = dif - vw; clz r5, r4 // clz(rng) - mvn r7, r7 // ~dif eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 b L(renorm2) @@ -549,7 +563,6 @@ function msac_decode_bool_adapt_neon, export=1 cmp r10, #0 clz r5, r4 // clz(rng) - mvn r7, r7 // ~dif eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S index 53490cd677..7063cbde1d 100644 --- a/third_party/dav1d/src/arm/64/itx.S +++ b/third_party/dav1d/src/arm/64/itx.S @@ -879,6 +879,8 @@ function inv_txfm_\variant\()add_8x8_neon .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out + + b L(itx_8x8_epilog) .else blr x4 @@ -890,19 +892,20 @@ function inv_txfm_\variant\()add_8x8_neon srshr v21.8h, v21.8h, #1 srshr v22.8h, v22.8h, #1 srshr v23.8h, v23.8h, #1 -.endif +L(itx_8x8_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 blr x5 load_add_store_8x8 x0, x7 ret x15 +.endif endfunc .endm -def_fn_8x8_base def_fn_8x8_base identity_ +def_fn_8x8_base .macro def_fn_8x8 txfm1, txfm2 function 
inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 @@ -1390,14 +1393,16 @@ function inv_txfm_horz\suffix\()_16x8_neon .endif .if \identity identity_8x16_shift2 v0.h[0] + b L(horz_16x8_epilog) .else blr x4 -.endif -.if \shift > 0 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h srshr \i, \i, #\shift .endr -.endif +.if \shift == 1 + b L(horz_16x8_epilog) +.else +L(horz_16x8_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 @@ -1406,12 +1411,14 @@ function inv_txfm_horz\suffix\()_16x8_neon .endr ret x14 +.endif +.endif endfunc .endm -def_horz_16 scale=0, identity=0, shift=2 def_horz_16 scale=1, identity=0, shift=1, suffix=_scale def_horz_16 scale=0, identity=1, shift=0, suffix=_identity +def_horz_16 scale=0, identity=0, shift=2 function inv_txfm_add_vert_8x16_neon mov x14, x30 @@ -1512,6 +1519,8 @@ function inv_txfm_\variant\()add_16x4_neon .endr identity_8x16_shift1 v0.h[0] + + b L(itx_16x4_epilog) .else .irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h ld1 {\i}, [x2] @@ -1527,33 +1536,29 @@ function inv_txfm_\variant\()add_16x4_neon .irp i, v16.8h, v17.8h, v18.8h, v19.8h srshr \i, \i, #1 .endr -.endif - transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 - blr x5 - mov x6, x0 - load_add_store_8x4 x6, x7 -.ifc \variant, identity_ - mov v16.16b, v20.16b - mov v17.16b, v21.16b - mov v18.16b, v22.16b - mov v19.16b, v23.16b -.else ins v24.d[1], v28.d[0] ins v25.d[1], v29.d[0] ins v26.d[1], v30.d[0] ins v27.d[1], v31.d[0] - srshr v16.8h, v24.8h, #1 - srshr v17.8h, v25.8h, #1 - srshr v18.8h, v26.8h, #1 - srshr v19.8h, v27.8h, #1 -.endif + srshr v20.8h, v24.8h, #1 + srshr v21.8h, v25.8h, #1 + srshr v22.8h, v26.8h, #1 + srshr v23.8h, v27.8h, #1 + +L(itx_16x4_epilog): transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 blr x5 + mov x6, x0 + load_add_store_8x4 x6, x7 + + transpose_4x8h_mov v20, v21, v22, v23, v2, v3, v4, v5, v16, v17, v18, v19 + blr x5 add x6, x0, #8 load_add_store_8x4 x6, x7 ret x15 +.endif endfunc function inv_txfm_\variant\()add_4x16_neon @@ -1605,12 +1610,14 @@ function inv_txfm_\variant\()add_4x16_neon mov w16, #(5793-4096)*8 dup v0.4h, w16 identity_8x4_shift1 v16, v17, v18, v19, v0.h[0] + + b L(itx_4x16_epilog) .else blr x4 .irp i, v16.8h, v17.8h, v18.8h, v19.8h srshr \i, \i, #1 .endr -.endif +L(itx_4x16_epilog): transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 ins v20.d[0], v16.d[1] ins v21.d[0], v17.d[1] @@ -1622,11 +1629,12 @@ function inv_txfm_\variant\()add_4x16_neon load_add_store_4x16 x0, x6 ret x15 +.endif endfunc .endm -def_fn_416_base def_fn_416_base identity_ +def_fn_416_base .macro def_fn_416 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 @@ -1634,11 +1642,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 idct_dc \w, \h, 1 .endif .if \w == 4 +.ifnc \txfm1, identity adr x4, inv_\txfm1\()_8h_x\w\()_neon +.endif adr x5, inv_\txfm2\()_4h_x\h\()_neon mov w13, #\eob_half .else +.ifnc \txfm1, identity adr x4, inv_\txfm1\()_4h_x\w\()_neon +.endif adr x5, inv_\txfm2\()_8h_x\h\()_neon .endif .ifc \txfm1, identity @@ -1690,13 +1702,16 @@ function inv_txfm_\variant\()add_16x8_neon mov w16, #2*(5793-4096)*8 dup v0.4h, w16 identity_8x16_shift1 v0.h[0] + + b L(itx_16x8_epilog) .else blr x4 -.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, 
v22.8h, v23.8h +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h srshr \i, \i, #1 .endr -.endif + +L(itx_16x8_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 blr x5 @@ -1704,27 +1719,7 @@ function inv_txfm_\variant\()add_16x8_neon mov x6, x0 load_add_store_8x8 x6, x7 -.ifc \variant, identity_ - mov v16.16b, v24.16b - mov v17.16b, v25.16b - mov v18.16b, v26.16b - mov v19.16b, v27.16b - mov v20.16b, v28.16b - mov v21.16b, v29.16b - mov v22.16b, v30.16b - mov v23.16b, v31.16b -.else - srshr v16.8h, v24.8h, #1 - srshr v17.8h, v25.8h, #1 - srshr v18.8h, v26.8h, #1 - srshr v19.8h, v27.8h, #1 - srshr v20.8h, v28.8h, #1 - srshr v21.8h, v29.8h, #1 - srshr v22.8h, v30.8h, #1 - srshr v23.8h, v31.8h, #1 -.endif - - transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8h_mov v24, v25, v26, v27, v28, v29, v30, v31, v2, v3, v16, v17, v18, v19, v20, v21, v22, v23 blr x5 @@ -1732,6 +1727,7 @@ function inv_txfm_\variant\()add_16x8_neon load_add_store_8x8 x0, x7 ret x15 +.endif endfunc function inv_txfm_\variant\()add_8x16_neon @@ -1790,14 +1786,16 @@ function inv_txfm_\variant\()add_8x16_neon scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out + + b L(itx_8x16_epilog) .else blr x4 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h srshr \i, \i, #1 .endr -.endif +L(itx_8x16_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 blr x5 @@ -1805,18 +1803,21 @@ function inv_txfm_\variant\()add_8x16_neon load_add_store_8x16 x0, x6 ret x15 +.endif endfunc .endm -def_fn_816_base def_fn_816_base identity_ +def_fn_816_base .macro def_fn_816 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif +.ifnc \txfm1, identity adr x4, inv_\txfm1\()_8h_x\w\()_neon +.endif adr x5, inv_\txfm2\()_8h_x\h\()_neon .if \w == 8 mov x13, #\eob_half diff --git a/third_party/dav1d/src/arm/64/itx16.S b/third_party/dav1d/src/arm/64/itx16.S index eee3a9636d..31ee9be1b4 100644 --- a/third_party/dav1d/src/arm/64/itx16.S +++ b/third_party/dav1d/src/arm/64/itx16.S @@ -514,13 +514,17 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 b L(itx_4x4_end) endfunc +// HBD inv_txfm_add_4x4_neon deviates from the common pattern with registers +// x0-x4 external parameters +// x5 function pointer to first transform +// x6 function pointer to second transform function inv_txfm_add_4x4_neon movi v30.4s, #0 movi v31.4s, #0 ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] st1 {v30.4s, v31.4s}, [x2], #32 - blr x4 + blr x5 st1 {v30.4s, v31.4s}, [x2], #32 sqxtn v16.4h, v16.4s @@ -529,7 +533,7 @@ function inv_txfm_add_4x4_neon sqxtn v19.4h, v19.4s transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 - blr x5 + blr x6 ld1 {v0.d}[0], [x0], x1 ld1 {v0.d}[1], [x0], x1 @@ -541,7 +545,7 @@ function inv_txfm_add_4x4_neon srshr v18.8h, v18.8h, #4 L(itx_4x4_end): - mvni v31.8h, #0xfc, lsl #8 // 0x3ff + dup v31.8h, w4 sub x0, x0, x1, lsl #2 usqadd v0.8h, v16.8h usqadd v1.8h, v18.8h @@ -579,8 +583,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 b L(itx_4x4_end) 1: .endif - adr x4, inv_\txfm1\()_4s_x4_neon - movrel x5, X(inv_\txfm2\()_4h_x4_neon) + adr x5, inv_\txfm1\()_4s_x4_neon + movrel x6, X(inv_\txfm2\()_4h_x4_neon) b inv_txfm_add_4x4_neon endfunc .endm @@ -1381,6 +1385,10 @@ function 
inv_txfm_horz\suffix\()_16x4_neon sqrshrn2 v21.8h, v29.4s, #\shift sqrshrn2 v22.8h, v30.4s, #\shift sqrshrn2 v23.8h, v31.4s, #\shift +.if \scale + b L(horz_16x4_epilog) +.else +L(horz_16x4_epilog): transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7 @@ -1389,11 +1397,12 @@ function inv_txfm_horz\suffix\()_16x4_neon .endr ret x14 +.endif endfunc .endm -def_horz_16 scale=0, shift=2 def_horz_16 scale=1, shift=1, suffix=_scale +def_horz_16 scale=0, shift=2 function inv_txfm_add_vert_8x16_neon mov x14, x30 diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S index 9f7b4e7a89..3df0393c3a 100644 --- a/third_party/dav1d/src/arm/64/mc.S +++ b/third_party/dav1d/src/arm/64/mc.S @@ -1154,7 +1154,7 @@ endfunc uxtl \r6\().8h, \r6\().8b .endif .endm -.macro mul_mla_4 d, s0, s1, s2, s3, wd +.macro mul_mla_4tap d, s0, s1, s2, s3, wd mul \d\wd, \s0\wd, v0.h[0] mla \d\wd, \s1\wd, v0.h[1] mla \d\wd, \s2\wd, v0.h[2] @@ -1163,7 +1163,51 @@ endfunc // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. -.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 +.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 + mul \d0\().4h, \s1\().4h, v0.h[1] + mla \d0\().4h, \s2\().4h, v0.h[2] + mla \d0\().4h, \s3\().4h, v0.h[3] + mla \d0\().4h, \s4\().4h, v0.h[4] + mla \d0\().4h, \s5\().4h, v0.h[5] + mla \d0\().4h, \s6\().4h, v0.h[6] +.endm +.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 + mul \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] +.endm +.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + mul \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mul \d1\().8h, \s2\().8h, v0.h[1] + mla \d1\().8h, \s3\().8h, v0.h[2] + mla \d1\().8h, \s4\().8h, v0.h[3] + mla \d1\().8h, \s5\().8h, v0.h[4] + mla \d1\().8h, \s6\().8h, v0.h[5] + mla \d1\().8h, \s7\().8h, v0.h[6] +.endm +.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 + mul \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mul \d1\().8h, \s3\().8h, v0.h[1] + mla \d1\().8h, \s4\().8h, v0.h[2] + mla \d1\().8h, \s5\().8h, v0.h[3] + mla \d1\().8h, \s6\().8h, v0.h[4] + mla \d1\().8h, \s7\().8h, v0.h[5] + mla \d1\().8h, \s8\().8h, v0.h[6] +.endm +.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().4h, \s0\().4h, v0.h[0] mla \d0\().4h, \s1\().4h, v0.h[1] mla \d0\().4h, \s2\().4h, v0.h[2] @@ -1173,7 +1217,7 @@ endfunc mla \d0\().4h, \s6\().4h, v0.h[6] mla \d0\().4h, \s7\().4h, v0.h[7] .endm -.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 +.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] @@ -1183,7 +1227,7 @@ endfunc mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s7\().8h, v0.h[7] .endm -.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 +.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 mul \d0\().8h, \s0\().8h, v0.h[0] mla 
\d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] @@ -1201,7 +1245,7 @@ endfunc mla \d1\().8h, \s7\().8h, v0.h[6] mla \d1\().8h, \s8\().8h, v0.h[7] .endm -.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 +.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] @@ -1315,11 +1359,11 @@ endfunc .endif .endm -.macro make_8tap_fn op, type, type_h, type_v +.macro make_8tap_fn op, type, type_h, type_v, taps function \op\()_8tap_\type\()_8bpc_neon, export=1 mov x8, \type_h mov x9, \type_v - b \op\()_8tap_neon + b \op\()_\taps\()_neon endfunc .endm @@ -1328,18 +1372,8 @@ endfunc #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) -.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv -make_8tap_fn \type, regular, REGULAR, REGULAR -make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH -make_8tap_fn \type, regular_sharp, REGULAR, SHARP -make_8tap_fn \type, smooth, SMOOTH, SMOOTH -make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR -make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP -make_8tap_fn \type, sharp, SHARP, SHARP -make_8tap_fn \type, sharp_regular, SHARP, REGULAR -make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH - -function \type\()_8tap_neon +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps +function \type\()_\taps\()_neon mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, w10 mul \my, \my, w10 @@ -1354,12 +1388,12 @@ function \type\()_8tap_neon tst \mx, #(0x7f << 14) sub w8, w8, #24 movrel x10, X(mc_subpel_filters), -8 - b.ne L(\type\()_8tap_h) + b.ne L(\type\()_\taps\()_h) tst \my, #(0x7f << 14) - b.ne L(\type\()_8tap_v) + b.ne L(\type\()_\taps\()_v) b \type\()_neon -L(\type\()_8tap_h): +L(\type\()_\taps\()_h): cmp \w, #4 ubfx w9, \mx, #7, #7 and \mx, \mx, #0x7f @@ -1368,9 +1402,9 @@ L(\type\()_8tap_h): 4: tst \my, #(0x7f << 14) add \xmx, x10, \mx, uxtw #3 - b.ne L(\type\()_8tap_hv) + b.ne L(\type\()_\taps\()_hv) - adr x9, L(\type\()_8tap_h_tbl) + adr x9, L(\type\()_\taps\()_h_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 @@ -1471,6 +1505,18 @@ L(\type\()_8tap_h): uxtl v20.8h, v20.8b uxtl v21.8h, v21.8b +.ifc \taps, 6tap + ext v19.16b, v16.16b, v17.16b, #2 + ext v23.16b, v20.16b, v21.16b, #2 + mul v18.8h, v19.8h, v0.h[1] + mul v22.8h, v23.8h, v0.h[1] +.irpc i, 23456 + ext v19.16b, v16.16b, v17.16b, #(2*\i) + ext v23.16b, v20.16b, v21.16b, #(2*\i) + mla v18.8h, v19.8h, v0.h[\i] + mla v22.8h, v23.8h, v0.h[\i] +.endr +.else // 8tap mul v18.8h, v16.8h, v0.h[0] mul v22.8h, v20.8h, v0.h[0] .irpc i, 1234567 @@ -1479,6 +1525,7 @@ L(\type\()_8tap_h): mla v18.8h, v19.8h, v0.h[\i] mla v22.8h, v23.8h, v0.h[\i] .endr +.endif subs \h, \h, #2 srshr v18.8h, v18.8h, #2 srshr v22.8h, v22.8h, #2 @@ -1523,6 +1570,26 @@ L(\type\()_8tap_h): uxtl v22.8h, v22.8b 16: +.ifc \taps, 6tap + ext v28.16b, v16.16b, v17.16b, #2 + ext v29.16b, v17.16b, v18.16b, #2 + ext v30.16b, v20.16b, v21.16b, #2 + ext v31.16b, v21.16b, v22.16b, #2 + mul v24.8h, v28.8h, v0.h[1] + mul v25.8h, v29.8h, v0.h[1] + mul v26.8h, v30.8h, v0.h[1] + mul v27.8h, v31.8h, v0.h[1] +.irpc i, 23456 + ext v28.16b, v16.16b, v17.16b, #(2*\i) + ext v29.16b, v17.16b, v18.16b, #(2*\i) + ext v30.16b, v20.16b, v21.16b, #(2*\i) + ext v31.16b, v21.16b, v22.16b, #(2*\i) + mla v24.8h, v28.8h, v0.h[\i] + mla v25.8h, v29.8h, v0.h[\i] + mla v26.8h, v30.8h, v0.h[\i] + mla v27.8h, v31.8h, v0.h[\i] +.endr 
+.else // 8tap mul v24.8h, v16.8h, v0.h[0] mul v25.8h, v17.8h, v0.h[0] mul v26.8h, v20.8h, v0.h[0] @@ -1537,6 +1604,7 @@ L(\type\()_8tap_h): mla v26.8h, v30.8h, v0.h[\i] mla v27.8h, v31.8h, v0.h[\i] .endr +.endif srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 srshr v26.8h, v26.8h, #2 @@ -1575,18 +1643,18 @@ L(\type\()_8tap_h): b.gt 161b ret -L(\type\()_8tap_h_tbl): - .hword L(\type\()_8tap_h_tbl) - 1280b - .hword L(\type\()_8tap_h_tbl) - 640b - .hword L(\type\()_8tap_h_tbl) - 320b - .hword L(\type\()_8tap_h_tbl) - 160b - .hword L(\type\()_8tap_h_tbl) - 80b - .hword L(\type\()_8tap_h_tbl) - 40b - .hword L(\type\()_8tap_h_tbl) - 20b +L(\type\()_\taps\()_h_tbl): + .hword L(\type\()_\taps\()_h_tbl) - 1280b + .hword L(\type\()_\taps\()_h_tbl) - 640b + .hword L(\type\()_\taps\()_h_tbl) - 320b + .hword L(\type\()_\taps\()_h_tbl) - 160b + .hword L(\type\()_\taps\()_h_tbl) - 80b + .hword L(\type\()_\taps\()_h_tbl) - 40b + .hword L(\type\()_\taps\()_h_tbl) - 20b .hword 0 -L(\type\()_8tap_v): +L(\type\()_\taps\()_v): cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f @@ -1595,7 +1663,7 @@ L(\type\()_8tap_v): 4: add \xmy, x10, \my, uxtw #3 - adr x9, L(\type\()_8tap_v_tbl) + adr x9, L(\type\()_\taps\()_v_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 @@ -1620,7 +1688,7 @@ L(\type\()_8tap_v): interleave_1_h v1, v2, v3, v4, v5 b.gt 24f uxtl_b v1, v2, v3, v4 - mul_mla_4 v6, v1, v2, v3, v4, .4h + mul_mla_4tap v6, v1, v2, v3, v4, .4h sqrshrun_b 6, v6 st_h \d_strd, v6, 2 ret @@ -1630,7 +1698,7 @@ L(\type\()_8tap_v): interleave_1_h v5, v6, v7 interleave_2_s v1, v2, v3, v4, v5, v6 uxtl_b v1, v2, v3, v4 - mul_mla_4 v6, v1, v2, v3, v4, .8h + mul_mla_4tap v6, v1, v2, v3, v4, .8h sqrshrun_b 6, v6 st_h \d_strd, v6, 4 ret @@ -1655,7 +1723,7 @@ L(\type\()_8tap_v): interleave_1_h v7, v16, v17, v18, v19 interleave_2_s v5, v6, v7, v16, v17, v18 uxtl_b v5, v6, v7, v16 - mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 + mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 4 b.le 0f @@ -1673,7 +1741,7 @@ L(\type\()_8tap_v): load_h \sr2, \src, \s_strd, v16, v17 interleave_1_h v7, v16, v17 uxtl_b v5, v6, v7, v16 - mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 + mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 2 0: @@ -1698,13 +1766,13 @@ L(\type\()_8tap_v): load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4 - mul_mla_4 v6, v1, v2, v3, v4, .8h + mul_mla_4tap v6, v1, v2, v3, v4, .8h shift_store_4 \type, \d_strd, v6 b.le 0f load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 uxtl_b v5, v6 - mul_mla_4 v7, v3, v4, v5, v6, .8h + mul_mla_4tap v7, v3, v4, v5, v6, .8h shift_store_4 \type, \d_strd, v7 0: ret @@ -1729,28 +1797,28 @@ L(\type\()_8tap_v): load_s \sr2, \src, \s_strd, v23, v24, v25, v26 interleave_1_s v22, v23, v24, v25, v26 uxtl_b v22, v23, v24, v25 - mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 shift_store_4 \type, \d_strd, v1, v2 b.le 0f load_s \sr2, \src, \s_strd, v27, v16 subs \h, \h, #2 interleave_1_s v26, v27, v16 uxtl_b v26, v27 - mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 + mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 shift_store_4 \type, \d_strd, v1 b.le 0f load_s \sr2, \src, \s_strd, v17, v18 subs \h, \h, #2 interleave_1_s v16, v17, v18 uxtl_b v16, v17 - mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 + 
mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 shift_store_4 \type, \d_strd, v2 b.le 0f subs \h, \h, #4 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 interleave_1_s v18, v19, v20, v21, v22 uxtl_b v18, v19, v20, v21 - mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 + mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 shift_store_4 \type, \d_strd, v1, v2 b.gt 48b 0: @@ -1773,14 +1841,14 @@ L(\type\()_8tap_v): load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4, v5 - mul_mla_4 v6, v1, v2, v3, v4, .8h - mul_mla_4 v7, v2, v3, v4, v5, .8h + mul_mla_4tap v6, v1, v2, v3, v4, .8h + mul_mla_4tap v7, v2, v3, v4, v5, .8h shift_store_8 \type, \d_strd, v6, v7 b.le 0f load_8b \sr2, \src, \s_strd, v6, v7 uxtl_b v6, v7 - mul_mla_4 v1, v3, v4, v5, v6, .8h - mul_mla_4 v2, v4, v5, v6, v7, .8h + mul_mla_4tap v1, v3, v4, v5, v6, .8h + mul_mla_4tap v2, v4, v5, v6, v7, .8h shift_store_8 \type, \d_strd, v1, v2 0: ret @@ -1809,32 +1877,32 @@ L(\type\()_8tap_v): subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v23, v24 uxtl_b v23, v24 - mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 + mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v25, v26 uxtl_b v25, v26 - mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 + mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v27, v16 uxtl_b v27, v16 - mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 + mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v17, v18 uxtl_b v17, v18 - mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 + mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #4 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 uxtl_b v19, v20, v21, v22 - mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 - mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 + mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 + mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.gt 88b 9: @@ -1882,10 +1950,10 @@ L(\type\()_8tap_v): uxtl2 v25.8h, v3.16b uxtl2 v26.8h, v4.16b uxtl2 v27.8h, v5.16b - mul_mla_4 v1, v16, v17, v18, v19, .8h - mul_mla_4 v16, v17, v18, v19, v20, .8h - mul_mla_4 v2, v23, v24, v25, v26, .8h - mul_mla_4 v17, v24, v25, v26, v27, .8h + mul_mla_4tap v1, v16, v17, v18, v19, .8h + mul_mla_4tap v16, v17, v18, v19, v20, .8h + mul_mla_4tap v2, v23, v24, v25, v26, .8h + mul_mla_4tap v17, v24, v25, v26, v27, .8h shift_store_16 \type, \d_strd, v1, v2, v16, v17 b.le 0f load_16b \sr2, \src, \s_strd, v6, v7 @@ -1893,25 +1961,25 @@ L(\type\()_8tap_v): uxtl v22.8h, v7.8b uxtl2 v28.8h, v6.16b uxtl2 v29.8h, v7.16b - mul_mla_4 v1, v18, v19, v20, v21, .8h - mul_mla_4 v3, v19, v20, v21, v22, .8h - mul_mla_4 v2, v25, v26, v27, v28, .8h - mul_mla_4 v4, v26, v27, v28, v29, .8h + mul_mla_4tap v1, v18, v19, v20, v21, .8h + mul_mla_4tap v3, v19, v20, v21, v22, .8h + mul_mla_4tap v2, v25, v26, v27, v28, .8h + mul_mla_4tap v4, v26, v27, v28, v29, .8h shift_store_16 \type, \d_strd, v1, v2, v3, v4 0: ret -L(\type\()_8tap_v_tbl): - .hword L(\type\()_8tap_v_tbl) 
- 1280b - .hword L(\type\()_8tap_v_tbl) - 640b - .hword L(\type\()_8tap_v_tbl) - 320b - .hword L(\type\()_8tap_v_tbl) - 160b - .hword L(\type\()_8tap_v_tbl) - 80b - .hword L(\type\()_8tap_v_tbl) - 40b - .hword L(\type\()_8tap_v_tbl) - 20b +L(\type\()_\taps\()_v_tbl): + .hword L(\type\()_\taps\()_v_tbl) - 1280b + .hword L(\type\()_\taps\()_v_tbl) - 640b + .hword L(\type\()_\taps\()_v_tbl) - 320b + .hword L(\type\()_\taps\()_v_tbl) - 160b + .hword L(\type\()_\taps\()_v_tbl) - 80b + .hword L(\type\()_\taps\()_v_tbl) - 40b + .hword L(\type\()_\taps\()_v_tbl) - 20b .hword 0 -L(\type\()_8tap_hv): +L(\type\()_\taps\()_hv): cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f @@ -1920,7 +1988,7 @@ L(\type\()_8tap_hv): 4: add \xmy, x10, \my, uxtw #3 - adr x9, L(\type\()_8tap_hv_tbl) + adr x9, L(\type\()_\taps\()_hv_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 @@ -1952,13 +2020,13 @@ L(\type\()_8tap_hv): addp v28.4h, v28.4h, v29.4h addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b 2: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v28.8b, #4 smull v2.4s, v16.4h, v1.h[0] @@ -1997,19 +2065,27 @@ L(\type\()_8tap_hv): addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v28.8b, #4 mov v19.8b, v28.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v20.8b, v19.8b, v28.8b, #4 mov v21.8b, v28.8b 28: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v22.8b, v21.8b, v28.8b, #4 +.ifc \taps, 6tap + smull v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -2018,6 +2094,7 @@ L(\type\()_8tap_hv): smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal v2.4s, v28.4h, v1.h[7] +.endif sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h @@ -2036,7 +2113,7 @@ L(\type\()_8tap_hv): 0: ret x15 -L(\type\()_8tap_filter_2): +L(\type\()_\taps\()_filter_2): ld1 {v28.8b}, [\sr2], \s_strd ld1 {v30.8b}, [\src], \s_strd uxtl v28.8h, v28.8b @@ -2083,12 +2160,12 @@ L(\type\()_8tap_filter_2): mla v31.4h, v30.4h, v0.h[3] srshr v16.4h, v31.4h, #2 - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b 4: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
@@ -2121,8 +2198,13 @@ L(\type\()_8tap_filter_2): 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #1 +.ifc \taps, 6tap + sub \sr2, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 +.else sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd +.endif add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 @@ -2139,20 +2221,38 @@ L(\type\()_8tap_filter_2): mla v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] +.ifc \taps, 6tap + srshr v18.4h, v31.4h, #2 +.else srshr v16.4h, v31.4h, #2 - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b - bl L(\type\()_8tap_filter_4) +.endif + bl L(\type\()_\taps\()_filter_4) mov v19.8b, v28.8b mov v20.8b, v29.8b - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v21.8b, v28.8b mov v22.8b, v29.8b 48: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) +.ifc \taps, 6tap + smull v2.4s, v18.4h, v1.h[1] + smlal v2.4s, v19.4h, v1.h[2] + smlal v2.4s, v20.4h, v1.h[3] + smlal v2.4s, v21.4h, v1.h[4] + smlal v2.4s, v22.4h, v1.h[5] + smlal v2.4s, v28.4h, v1.h[6] + smull v3.4s, v19.4h, v1.h[1] + smlal v3.4s, v20.4h, v1.h[2] + smlal v3.4s, v21.4h, v1.h[3] + smlal v3.4s, v22.4h, v1.h[4] + smlal v3.4s, v28.4h, v1.h[5] + smlal v3.4s, v29.4h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -2169,6 +2269,7 @@ L(\type\()_8tap_filter_2): smlal v3.4s, v22.4h, v1.h[5] smlal v3.4s, v28.4h, v1.h[6] smlal v3.4s, v29.4h, v1.h[7] +.endif sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn v3.4h, v3.4s, #\shift_hv subs \h, \h, #2 @@ -2182,8 +2283,10 @@ L(\type\()_8tap_filter_2): st1 {v3.4h}, [\ds2], \d_strd .endif b.le 0f +.ifc \taps, 8tap mov v16.8b, v18.8b mov v17.8b, v19.8b +.endif mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b @@ -2193,7 +2296,7 @@ L(\type\()_8tap_filter_2): 0: ret x15 -L(\type\()_8tap_filter_4): +L(\type\()_\taps\()_filter_4): ld1 {v26.8b}, [\sr2], \s_strd ld1 {v27.8b}, [\src], \s_strd uxtl v26.8h, v26.8b @@ -2237,15 +2340,15 @@ L(\type\()_8tap_filter_4): lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 - bl L(\type\()_8tap_filter_8_first) - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8_first) + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2303,7 +2406,9 @@ L(\type\()_8tap_filter_4): ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #3 +.ifc \taps, 8tap sub \src, \src, \s_strd +.endif sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b @@ -2316,21 +2421,52 @@ L(\type\()_8tap_filter_4): lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 - bl L(\type\()_8tap_filter_8_first) - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8_first) +.ifc \taps, 6tap + mov v18.16b, v16.16b +.else + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b - bl L(\type\()_8tap_filter_8) +.endif + bl L(\type\()_\taps\()_filter_8) mov v19.16b, v24.16b mov v20.16b, v25.16b - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v21.16b, v24.16b mov v22.16b, v25.16b 88: +.ifc \taps, 6tap + smull v2.4s, v18.4h, v1.h[1] + smull2 v3.4s, v18.8h, v1.h[1] + bl L(\type\()_\taps\()_filter_8) + smull v4.4s, v19.4h, v1.h[1] + smull2 v5.4s, v19.8h, v1.h[1] + smlal 
v2.4s, v19.4h, v1.h[2] + smlal2 v3.4s, v19.8h, v1.h[2] + smlal v4.4s, v20.4h, v1.h[2] + smlal2 v5.4s, v20.8h, v1.h[2] + smlal v2.4s, v20.4h, v1.h[3] + smlal2 v3.4s, v20.8h, v1.h[3] + smlal v4.4s, v21.4h, v1.h[3] + smlal2 v5.4s, v21.8h, v1.h[3] + smlal v2.4s, v21.4h, v1.h[4] + smlal2 v3.4s, v21.8h, v1.h[4] + smlal v4.4s, v22.4h, v1.h[4] + smlal2 v5.4s, v22.8h, v1.h[4] + smlal v2.4s, v22.4h, v1.h[5] + smlal2 v3.4s, v22.8h, v1.h[5] + smlal v4.4s, v24.4h, v1.h[5] + smlal2 v5.4s, v24.8h, v1.h[5] + smlal v2.4s, v24.4h, v1.h[6] + smlal2 v3.4s, v24.8h, v1.h[6] + smlal v4.4s, v25.4h, v1.h[6] + smlal2 v5.4s, v25.8h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2361,6 +2497,7 @@ L(\type\()_8tap_filter_4): smlal2 v3.4s, v24.8h, v1.h[7] smlal v4.4s, v25.4h, v1.h[7] smlal2 v5.4s, v25.8h, v1.h[7] +.endif sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn2 v2.8h, v3.4s, #\shift_hv sqrshrn v4.4h, v4.4s, #\shift_hv @@ -2376,8 +2513,10 @@ L(\type\()_8tap_filter_4): st1 {v4.8h}, [\ds2], \d_strd .endif b.le 9f +.ifc \taps, 8tap mov v16.16b, v18.16b mov v17.16b, v19.16b +.endif mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b @@ -2399,14 +2538,32 @@ L(\type\()_8tap_filter_4): .else add \dst, \dst, #16 .endif +.ifc \taps, 6tap + add \src, \src, \s_strd, lsl #1 +.endif b 168b 0: ret x15 -L(\type\()_8tap_filter_8_first): +L(\type\()_\taps\()_filter_8_first): ld1 {v28.8b, v29.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b +.ifc \taps, 6tap + ext v24.16b, v28.16b, v29.16b, #(2*1) + ext v25.16b, v28.16b, v29.16b, #(2*2) + ext v26.16b, v28.16b, v29.16b, #(2*3) + ext v27.16b, v28.16b, v29.16b, #(2*4) + mul v16.8h, v24.8h, v0.h[1] + mla v16.8h, v25.8h, v0.h[2] + mla v16.8h, v26.8h, v0.h[3] + mla v16.8h, v27.8h, v0.h[4] + ext v24.16b, v28.16b, v29.16b, #(2*5) + ext v25.16b, v28.16b, v29.16b, #(2*6) + ext v26.16b, v28.16b, v29.16b, #(2*7) + mla v16.8h, v24.8h, v0.h[5] + mla v16.8h, v25.8h, v0.h[6] +.else // 8tap mul v16.8h, v28.8h, v0.h[0] ext v24.16b, v28.16b, v29.16b, #(2*1) ext v25.16b, v28.16b, v29.16b, #(2*2) @@ -2422,16 +2579,29 @@ L(\type\()_8tap_filter_8_first): mla v16.8h, v24.8h, v0.h[5] mla v16.8h, v25.8h, v0.h[6] mla v16.8h, v26.8h, v0.h[7] +.endif srshr v16.8h, v16.8h, #2 ret -L(\type\()_8tap_filter_8): +L(\type\()_\taps\()_filter_8): ld1 {v28.8b, v29.8b}, [\sr2], \s_strd ld1 {v30.8b, v31.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b uxtl v30.8h, v30.8b uxtl v31.8h, v31.8b +.ifc \taps, 6tap + ext v26.16b, v28.16b, v29.16b, #2 + ext v27.16b, v30.16b, v31.16b, #2 + mul v24.8h, v26.8h, v0.h[1] + mul v25.8h, v27.8h, v0.h[1] +.irpc i, 23456 + ext v26.16b, v28.16b, v29.16b, #(2*\i) + ext v27.16b, v30.16b, v31.16b, #(2*\i) + mla v24.8h, v26.8h, v0.h[\i] + mla v25.8h, v27.8h, v0.h[\i] +.endr +.else // 8tap mul v24.8h, v28.8h, v0.h[0] mul v25.8h, v30.8h, v0.h[0] .irpc i, 1234567 @@ -2440,22 +2610,25 @@ L(\type\()_8tap_filter_8): mla v24.8h, v26.8h, v0.h[\i] mla v25.8h, v27.8h, v0.h[\i] .endr +.endif srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 ret -L(\type\()_8tap_hv_tbl): - .hword L(\type\()_8tap_hv_tbl) - 1280b - .hword L(\type\()_8tap_hv_tbl) - 640b - .hword L(\type\()_8tap_hv_tbl) - 320b - .hword L(\type\()_8tap_hv_tbl) - 160b - .hword L(\type\()_8tap_hv_tbl) - 80b - .hword L(\type\()_8tap_hv_tbl) - 40b - .hword L(\type\()_8tap_hv_tbl) - 20b +L(\type\()_\taps\()_hv_tbl): + .hword 
L(\type\()_\taps\()_hv_tbl) - 1280b + .hword L(\type\()_\taps\()_hv_tbl) - 640b + .hword L(\type\()_\taps\()_hv_tbl) - 320b + .hword L(\type\()_\taps\()_hv_tbl) - 160b + .hword L(\type\()_\taps\()_hv_tbl) - 80b + .hword L(\type\()_\taps\()_hv_tbl) - 40b + .hword L(\type\()_\taps\()_hv_tbl) - 20b .hword 0 endfunc +.endm +.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv function \type\()_bilin_8bpc_neon, export=1 dup v1.16b, \mx dup v3.16b, \my @@ -2987,8 +3160,34 @@ L(\type\()_bilin_hv_tbl): endfunc .endm -filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 -filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 +make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn put, sharp, SHARP, SHARP, 8tap +make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap + +make_8tap_fn put, regular, REGULAR, REGULAR, 6tap +make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap +filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 + +make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn prep, sharp, SHARP, SHARP, 8tap +make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 8tap + +make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap +make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 6tap +filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 + .macro load_filter_row dst, src, inc asr w13, \src, #10 diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S index 1bfb12ebb3..576fab158a 100644 --- a/third_party/dav1d/src/arm/64/mc16.S +++ b/third_party/dav1d/src/arm/64/mc16.S @@ -1374,19 +1374,35 @@ endfunc sub \r3\wd, \r3\wd, \c\wd .endif .endm -.macro smull_smlal_4 d, s0, s1, s2, s3 +.macro smull_smlal_4tap d, s0, s1, s2, s3 smull \d\().4s, \s0\().4h, v0.h[0] smlal \d\().4s, \s1\().4h, v0.h[1] smlal \d\().4s, \s2\().4h, v0.h[2] smlal \d\().4s, \s3\().4h, v0.h[3] .endm -.macro smull2_smlal2_4 d, s0, s1, s2, s3 +.macro smull2_smlal2_4tap d, s0, s1, s2, s3 smull2 \d\().4s, \s0\().8h, v0.h[0] smlal2 \d\().4s, \s1\().8h, v0.h[1] smlal2 \d\().4s, \s2\().8h, v0.h[2] smlal2 \d\().4s, \s3\().8h, v0.h[3] .endm -.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 +.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7 + smull \d\().4s, \s1\().4h, v0.h[1] + smlal \d\().4s, \s2\().4h, v0.h[2] + smlal \d\().4s, \s3\().4h, v0.h[3] + smlal \d\().4s, \s4\().4h, v0.h[4] + smlal \d\().4s, \s5\().4h, v0.h[5] + smlal \d\().4s, \s6\().4h, v0.h[6] +.endm +.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7 + smull2 \d\().4s, \s1\().8h, v0.h[1] + smlal2 \d\().4s, \s2\().8h, v0.h[2] + smlal2 \d\().4s, \s3\().8h, v0.h[3] + smlal2 \d\().4s, \s4\().8h, v0.h[4] + smlal2 \d\().4s, \s5\().8h, v0.h[5] + smlal2 \d\().4s, \s6\().8h, v0.h[6] +.endm +.macro 
smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7 smull \d\().4s, \s0\().4h, v0.h[0] smlal \d\().4s, \s1\().4h, v0.h[1] smlal \d\().4s, \s2\().4h, v0.h[2] @@ -1396,7 +1412,7 @@ endfunc smlal \d\().4s, \s6\().4h, v0.h[6] smlal \d\().4s, \s7\().4h, v0.h[7] .endm -.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7 +.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7 smull2 \d\().4s, \s0\().8h, v0.h[0] smlal2 \d\().4s, \s1\().8h, v0.h[1] smlal2 \d\().4s, \s2\().8h, v0.h[2] @@ -1499,11 +1515,11 @@ endfunc st1 {\r0\().8h, \r1\().8h}, [\dst], \strd .endm -.macro make_8tap_fn op, type, type_h, type_v +.macro make_8tap_fn op, type, type_h, type_v, taps function \op\()_8tap_\type\()_16bpc_neon, export=1 mov w9, \type_h mov w10, \type_v - b \op\()_8tap_neon + b \op\()_\taps\()_neon endfunc .endm @@ -1512,18 +1528,8 @@ endfunc #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) -.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2 -make_8tap_fn \type, regular, REGULAR, REGULAR -make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH -make_8tap_fn \type, regular_sharp, REGULAR, SHARP -make_8tap_fn \type, smooth, SMOOTH, SMOOTH -make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR -make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP -make_8tap_fn \type, sharp, SHARP, SHARP -make_8tap_fn \type, sharp_regular, SHARP, REGULAR -make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH - -function \type\()_8tap_neon +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps +function \type\()_\taps\()_neon .ifc \bdmax, w8 ldr w8, [sp] .endif @@ -1547,12 +1553,12 @@ function \type\()_8tap_neon add w13, w12, \bdmax // 6 + intermediate_bits sub w12, w12, \bdmax // 6 - intermediate_bits movrel x11, X(mc_subpel_filters), -8 - b.ne L(\type\()_8tap_h) + b.ne L(\type\()_\taps\()_h) tst \my, #(0x7f << 14) - b.ne L(\type\()_8tap_v) + b.ne L(\type\()_\taps\()_v) b \type\()_neon -L(\type\()_8tap_h): +L(\type\()_\taps\()_h): cmp \w, #4 ubfx w10, \mx, #7, #7 and \mx, \mx, #0x7f @@ -1561,9 +1567,9 @@ L(\type\()_8tap_h): 4: tst \my, #(0x7f << 14) add \xmx, x11, \mx, uxtw #3 - b.ne L(\type\()_8tap_hv) + b.ne L(\type\()_\taps\()_hv) - adr x10, L(\type\()_8tap_h_tbl) + adr x10, L(\type\()_\taps\()_h_tbl) dup v30.4s, w12 // 6 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v30.4s, v30.4s // -(6-intermediate_bits) @@ -1682,6 +1688,22 @@ L(\type\()_8tap_h): mov \mx, \w 8: +.ifc \taps, 6tap + ext v24.16b, v16.16b, v17.16b, #2 + ext v25.16b, v20.16b, v21.16b, #2 + smull v18.4s, v24.4h, v0.h[1] + smull2 v19.4s, v24.8h, v0.h[1] + smull v22.4s, v25.4h, v0.h[1] + smull2 v23.4s, v25.8h, v0.h[1] +.irpc i, 23456 + ext v24.16b, v16.16b, v17.16b, #(2*\i) + ext v25.16b, v20.16b, v21.16b, #(2*\i) + smlal v18.4s, v24.4h, v0.h[\i] + smlal2 v19.4s, v24.8h, v0.h[\i] + smlal v22.4s, v25.4h, v0.h[\i] + smlal2 v23.4s, v25.8h, v0.h[\i] +.endr +.else // 8tap smull v18.4s, v16.4h, v0.h[0] smull2 v19.4s, v16.8h, v0.h[0] smull v22.4s, v20.4h, v0.h[0] @@ -1694,6 +1716,7 @@ L(\type\()_8tap_h): smlal v22.4s, v25.4h, v0.h[\i] smlal2 v23.4s, v25.8h, v0.h[\i] .endr +.endif subs \mx, \mx, #8 srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits) srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits) @@ -1734,18 +1757,18 @@ L(\type\()_8tap_h): b.gt 81b ret -L(\type\()_8tap_h_tbl): - .hword L(\type\()_8tap_h_tbl) - 1280b - .hword L(\type\()_8tap_h_tbl) - 640b - .hword L(\type\()_8tap_h_tbl) - 320b - .hword L(\type\()_8tap_h_tbl) - 160b - .hword L(\type\()_8tap_h_tbl) - 80b - .hword 
L(\type\()_8tap_h_tbl) - 40b - .hword L(\type\()_8tap_h_tbl) - 20b +L(\type\()_\taps\()_h_tbl): + .hword L(\type\()_\taps\()_h_tbl) - 1280b + .hword L(\type\()_\taps\()_h_tbl) - 640b + .hword L(\type\()_\taps\()_h_tbl) - 320b + .hword L(\type\()_\taps\()_h_tbl) - 160b + .hword L(\type\()_\taps\()_h_tbl) - 80b + .hword L(\type\()_\taps\()_h_tbl) - 40b + .hword L(\type\()_\taps\()_h_tbl) - 20b .hword 0 -L(\type\()_8tap_v): +L(\type\()_\taps\()_v): cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f @@ -1758,7 +1781,7 @@ L(\type\()_8tap_v): dup v30.4s, w12 // 6 - intermediate_bits movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif - adr x10, L(\type\()_8tap_v_tbl) + adr x10, L(\type\()_\taps\()_v_tbl) ldrh w9, [x10, x9, lsl #1] .ifc \type, prep neg v30.4s, v30.4s // -(6-intermediate_bits) @@ -1785,7 +1808,7 @@ L(\type\()_8tap_v): load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 b.gt 24f - smull_smlal_4 v6, v1, v2, v3, v4 + smull_smlal_4tap v6, v1, v2, v3, v4 sqrshrun_h 6, v6 umin_h v31, .8h, v6 st_s \d_strd, v6, 2 @@ -1794,8 +1817,8 @@ L(\type\()_8tap_v): 24: // 2x4 v load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 - smull_smlal_4 v16, v1, v2, v3, v4 - smull_smlal_4 v17, v3, v4, v5, v6 + smull_smlal_4tap v16, v1, v2, v3, v4 + smull_smlal_4tap v17, v3, v4, v5, v6 sqrshrun_h 6, v16, v17 umin_h v31, .8h, v16 st_s \d_strd, v16, 4 @@ -1817,8 +1840,8 @@ L(\type\()_8tap_v): subs \h, \h, #4 load_s \sr2, \src, \s_strd, v16, v17, v18, v19 interleave_1_s v7, v16, v17, v18, v19 - smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 - smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18 + smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16 + smull_smlal_\taps v25, v3, v4, v5, v6, v7, v16, v17, v18 sqrshrun_h 6, v24, v25 umin_h v31, .8h, v24 st_s \d_strd, v24, 4 @@ -1836,7 +1859,7 @@ L(\type\()_8tap_v): 26: load_s \sr2, \src, \s_strd, v16, v17 interleave_1_s v7, v16, v17 - smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 + smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_h 6, v24 umin_h v31, .4h, v24 st_s \d_strd, v24, 2 @@ -1860,13 +1883,13 @@ L(\type\()_8tap_v): sxtl v0.8h, v0.8b load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 - smull_smlal_4 v6, v1, v2, v3, v4 - smull_smlal_4 v7, v2, v3, v4, v5 + smull_smlal_4tap v6, v1, v2, v3, v4 + smull_smlal_4tap v7, v2, v3, v4, v5 shift_store_4 \type, \d_strd, v6, v7 b.le 0f load_4h \sr2, \src, \s_strd, v6, v7 - smull_smlal_4 v1, v3, v4, v5, v6 - smull_smlal_4 v2, v4, v5, v6, v7 + smull_smlal_4tap v1, v3, v4, v5, v6 + smull_smlal_4tap v2, v4, v5, v6, v7 shift_store_4 \type, \d_strd, v1, v2 0: ret @@ -1885,10 +1908,10 @@ L(\type\()_8tap_v): 48: subs \h, \h, #4 load_4h \sr2, \src, \s_strd, v23, v24, v25, v26 - smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 - smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 - smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25 - smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 + smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24 + smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25 + smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_4 \type, \d_strd, v1, v2, v3, v4 b.le 0f cmp \h, #2 @@ -1903,8 +1926,8 @@ L(\type\()_8tap_v): b 48b 46: load_4h \sr2, \src, \s_strd, v23, v24 - smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 - smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 + smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, 
v22, v23 + smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_4 \type, \d_strd, v1, v2 0: ret @@ -1925,17 +1948,17 @@ L(\type\()_8tap_v): sxtl v0.8h, v0.8b load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 - smull_smlal_4 v16, v1, v2, v3, v4 - smull2_smlal2_4 v17, v1, v2, v3, v4 - smull_smlal_4 v18, v2, v3, v4, v5 - smull2_smlal2_4 v19, v2, v3, v4, v5 + smull_smlal_4tap v16, v1, v2, v3, v4 + smull2_smlal2_4tap v17, v1, v2, v3, v4 + smull_smlal_4tap v18, v2, v3, v4, v5 + smull2_smlal2_4tap v19, v2, v3, v4, v5 shift_store_8 \type, \d_strd, v16, v17, v18, v19 b.le 0f load_8h \sr2, \src, \s_strd, v6, v7 - smull_smlal_4 v16, v3, v4, v5, v6 - smull2_smlal2_4 v17, v3, v4, v5, v6 - smull_smlal_4 v18, v4, v5, v6, v7 - smull2_smlal2_4 v19, v4, v5, v6, v7 + smull_smlal_4tap v16, v3, v4, v5, v6 + smull2_smlal2_4tap v17, v3, v4, v5, v6 + smull_smlal_4tap v18, v4, v5, v6, v7 + smull2_smlal2_4tap v19, v4, v5, v6, v7 shift_store_8 \type, \d_strd, v16, v17, v18, v19 0: ret @@ -1962,18 +1985,18 @@ L(\type\()_8tap_v): 88: subs \h, \h, #2 load_8h \sr2, \src, \s_strd, v23, v24 - smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 - smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23 - smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24 - smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24 + smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_\taps v3, v17, v18, v19, v20, v21, v22, v23, v24 + smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.le 9f subs \h, \h, #2 load_8h \sr2, \src, \s_strd, v25, v26 - smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25 - smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25 - smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26 - smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 + smull_smlal_\taps v1, v18, v19, v20, v21, v22, v23, v24, v25 + smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25 + smull_smlal_\taps v3, v19, v20, v21, v22, v23, v24, v25, v26 + smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.le 9f mov v16.16b, v20.16b @@ -2013,10 +2036,10 @@ L(\type\()_8tap_v): 16: load_16h \src, \src, \s_strd, v22, v23 subs \h, \h, #1 - smull_smlal_4 v1, v16, v18, v20, v22 - smull2_smlal2_4 v2, v16, v18, v20, v22 - smull_smlal_4 v3, v17, v19, v21, v23 - smull2_smlal2_4 v4, v17, v19, v21, v23 + smull_smlal_4tap v1, v16, v18, v20, v22 + smull2_smlal2_4tap v2, v16, v18, v20, v22 + smull_smlal_4tap v3, v17, v19, v21, v23 + smull2_smlal2_4tap v4, v17, v19, v21, v23 shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4 b.le 0f mov v16.16b, v18.16b @@ -2029,17 +2052,17 @@ L(\type\()_8tap_v): 0: ret -L(\type\()_8tap_v_tbl): - .hword L(\type\()_8tap_v_tbl) - 1280b - .hword L(\type\()_8tap_v_tbl) - 640b - .hword L(\type\()_8tap_v_tbl) - 320b - .hword L(\type\()_8tap_v_tbl) - 160b - .hword L(\type\()_8tap_v_tbl) - 80b - .hword L(\type\()_8tap_v_tbl) - 40b - .hword L(\type\()_8tap_v_tbl) - 20b +L(\type\()_\taps\()_v_tbl): + .hword L(\type\()_\taps\()_v_tbl) - 1280b + .hword L(\type\()_\taps\()_v_tbl) - 640b + .hword L(\type\()_\taps\()_v_tbl) - 320b + .hword L(\type\()_\taps\()_v_tbl) - 160b + .hword L(\type\()_\taps\()_v_tbl) - 80b + .hword L(\type\()_\taps\()_v_tbl) - 40b + .hword L(\type\()_\taps\()_v_tbl) - 20b .hword 0 -L(\type\()_8tap_hv): +L(\type\()_\taps\()_hv): cmp \h, #4 ubfx w10, \my, #7, #7 and \my, 
\my, #0x7f @@ -2048,7 +2071,7 @@ L(\type\()_8tap_hv): 4: add \xmy, x11, \my, uxtw #3 - adr x10, L(\type\()_8tap_hv_tbl) + adr x10, L(\type\()_\taps\()_hv_tbl) dup v30.4s, w12 // 6 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v30.4s, v30.4s // -(6-intermediate_bits) @@ -2089,7 +2112,7 @@ L(\type\()_8tap_hv): addp v27.4s, v27.4s, v28.4s addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores @@ -2100,7 +2123,7 @@ L(\type\()_8tap_hv): mov v17.8b, v24.8b 2: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v24.8b, #4 smull v2.4s, v16.4h, v1.h[0] @@ -2143,20 +2166,28 @@ L(\type\()_8tap_hv): // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) xtn v16.4h, v16.4s trn1 v16.2s, v16.2s, v24.2s mov v17.8b, v24.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v24.8b, #4 mov v19.8b, v24.8b - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v20.8b, v19.8b, v24.8b, #4 mov v21.8b, v24.8b 28: - bl L(\type\()_8tap_filter_2) + bl L(\type\()_\taps\()_filter_2) ext v22.8b, v21.8b, v24.8b, #4 +.ifc \taps, 6tap + smull v3.4s, v17.4h, v1.h[1] + smlal v3.4s, v18.4h, v1.h[2] + smlal v3.4s, v19.4h, v1.h[3] + smlal v3.4s, v20.4h, v1.h[4] + smlal v3.4s, v21.4h, v1.h[5] + smlal v3.4s, v22.4h, v1.h[6] +.else // 8tap smull v3.4s, v16.4h, v1.h[0] smlal v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] @@ -2165,6 +2196,7 @@ L(\type\()_8tap_hv): smlal v3.4s, v21.4h, v1.h[5] smlal v3.4s, v22.4h, v1.h[6] smlal v3.4s, v24.4h, v1.h[7] +.endif srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) sqxtun v3.4h, v3.4s @@ -2184,7 +2216,7 @@ L(\type\()_8tap_hv): 0: ret x15 -L(\type\()_8tap_filter_2): +L(\type\()_\taps\()_filter_2): ld1 {v25.8h}, [\sr2], \s_strd ld1 {v27.8h}, [\src], \s_strd ext v26.16b, v25.16b, v25.16b, #2 @@ -2234,12 +2266,12 @@ L(\type\()_8tap_filter_2): // (at the cost of a smaller slowdown on in-order cores such as A53). xtn v16.4h, v16.4s - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v24.8b mov v18.8b, v25.8b 4: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -2272,8 +2304,13 @@ L(\type\()_8tap_filter_2): 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #2 +.ifc \taps, 6tap + sub \sr2, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 +.else sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd +.endif add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 @@ -2294,20 +2331,38 @@ L(\type\()_8tap_filter_2): // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). 
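Editor's note on the "fit in 16 bit without any bias" comment above: a scalar C sketch of one horizontal-pass sample, modelling the smull/smlal accumulation, the srshl rounding shift by -(6 - intermediate_bits), and the xtn/uzp1 narrowing. The helper name and scalar form are illustrative only; the range guarantee comes from the subpel filter design, exactly as the comment states.

#include <stdint.h>

/* Hypothetical scalar model of one 16bpc horizontal-pass output sample. */
static int16_t h_pass_sample_sketch(const uint16_t *src, const int8_t *coef,
                                    int intermediate_bits)
{
    int32_t sum = 0;
    for (int i = 0; i < 8; i++)
        sum += coef[i] * src[i];            /* smull/smlal against v0.h[i] */
    const int sh = 6 - intermediate_bits;   /* srshl by a negative amount */
    sum = (sum + ((1 << sh) >> 1)) >> sh;   /* rounding shift right; >> of a negative
                                             * value is assumed to be arithmetic here */
    return (int16_t)sum;                    /* xtn / uzp1: keep the low 16 bits */
}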
+.ifc \taps, 6tap + xtn v18.4h, v16.4s +.else xtn v16.4h, v16.4s - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v17.8b, v24.8b mov v18.8b, v25.8b - bl L(\type\()_8tap_filter_4) +.endif + bl L(\type\()_\taps\()_filter_4) mov v19.8b, v24.8b mov v20.8b, v25.8b - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) mov v21.8b, v24.8b mov v22.8b, v25.8b 48: - bl L(\type\()_8tap_filter_4) + bl L(\type\()_\taps\()_filter_4) +.ifc \taps, 6tap + smull v3.4s, v18.4h, v1.h[1] + smlal v3.4s, v19.4h, v1.h[2] + smlal v3.4s, v20.4h, v1.h[3] + smlal v3.4s, v21.4h, v1.h[4] + smlal v3.4s, v22.4h, v1.h[5] + smlal v3.4s, v24.4h, v1.h[6] + smull v4.4s, v19.4h, v1.h[1] + smlal v4.4s, v20.4h, v1.h[2] + smlal v4.4s, v21.4h, v1.h[3] + smlal v4.4s, v22.4h, v1.h[4] + smlal v4.4s, v24.4h, v1.h[5] + smlal v4.4s, v25.4h, v1.h[6] +.else // 8tap smull v3.4s, v16.4h, v1.h[0] smlal v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] @@ -2324,6 +2379,7 @@ L(\type\()_8tap_filter_2): smlal v4.4s, v22.4h, v1.h[5] smlal v4.4s, v24.4h, v1.h[6] smlal v4.4s, v25.4h, v1.h[7] +.endif .ifc \type, put srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) @@ -2339,8 +2395,10 @@ L(\type\()_8tap_filter_2): st1 {v3.d}[0], [\dst], \d_strd st1 {v3.d}[1], [\ds2], \d_strd b.le 0f +.ifc \taps, 8tap mov v16.8b, v18.8b mov v17.8b, v19.8b +.endif mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b @@ -2350,7 +2408,7 @@ L(\type\()_8tap_filter_2): 0: ret x15 -L(\type\()_8tap_filter_4): +L(\type\()_\taps\()_filter_4): ld1 {v24.8h}, [\sr2], \s_strd ld1 {v25.8h}, [\src], \s_strd ext v26.16b, v24.16b, v24.16b, #2 @@ -2411,14 +2469,14 @@ L(\type\()_8tap_filter_4): // and conserves register space (no need to clobber v8-v15). uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v23.16b mov v18.16b, v24.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2480,7 +2538,9 @@ L(\type\()_8tap_filter_4): ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #6 +.ifc \taps, 8tap sub \src, \src, \s_strd +.endif sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b @@ -2494,6 +2554,16 @@ L(\type\()_8tap_filter_4): lsl \s_strd, \s_strd, #1 ld1 {v27.8h, v28.8h}, [\src], \s_strd +.ifc \taps, 6tap + ext v26.16b, v27.16b, v28.16b, #2 + smull v24.4s, v26.4h, v0.h[1] + smull2 v25.4s, v26.8h, v0.h[1] +.irpc i, 23456 + ext v26.16b, v27.16b, v28.16b, #(2*\i) + smlal v24.4s, v26.4h, v0.h[\i] + smlal2 v25.4s, v26.8h, v0.h[\i] +.endr +.else // 8tap smull v24.4s, v27.4h, v0.h[0] smull2 v25.4s, v27.8h, v0.h[0] .irpc i, 1234567 @@ -2501,6 +2571,7 @@ L(\type\()_8tap_filter_4): smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] .endr +.endif srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without @@ -2508,22 +2579,53 @@ L(\type\()_8tap_filter_4): // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53), // and conserves register space (no need to clobber v8-v15). 
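Editor's note on why this patch introduces a separate 6tap path (inferred from the subpel filter tables and the make_8tap_fn instantiations further down): the regular and smooth filters have zero first and last coefficients, so only combinations involving SHARP need the full 8-tap chain. A scalar C sketch of the vertical MAC, with illustrative names:

#include <stdint.h>

/* Vertical filter sketch: the 6-tap variant starts at coef[1] and stops at
 * coef[6]; for regular/smooth filters coef[0] == coef[7] == 0, so the result
 * matches the 8-tap chain while saving two multiply-accumulates per lane. */
static int32_t v_filter_sketch(const int16_t rows[8], const int8_t coef[8],
                               int six_tap)
{
    int32_t sum = 0;
    const int first = six_tap ? 1 : 0;
    const int last  = six_tap ? 6 : 7;
    for (int i = first; i <= last; i++)
        sum += coef[i] * rows[i];   /* smull_smlal_6tap / smull_smlal_8tap */
    return sum;
}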
+.ifc \taps, 6tap + uzp1 v18.8h, v24.8h, v25.8h // Same as xtn, xtn2 +.else uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v17.16b, v23.16b mov v18.16b, v24.16b - bl L(\type\()_8tap_filter_8) +.endif + bl L(\type\()_\taps\()_filter_8) mov v19.16b, v23.16b mov v20.16b, v24.16b - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) mov v21.16b, v23.16b mov v22.16b, v24.16b 88: +.ifc \taps, 6tap + smull v2.4s, v18.4h, v1.h[1] + smull2 v3.4s, v18.8h, v1.h[1] + bl L(\type\()_\taps\()_filter_8) + smull v4.4s, v19.4h, v1.h[1] + smull2 v5.4s, v19.8h, v1.h[1] + smlal v2.4s, v19.4h, v1.h[2] + smlal2 v3.4s, v19.8h, v1.h[2] + smlal v4.4s, v20.4h, v1.h[2] + smlal2 v5.4s, v20.8h, v1.h[2] + smlal v2.4s, v20.4h, v1.h[3] + smlal2 v3.4s, v20.8h, v1.h[3] + smlal v4.4s, v21.4h, v1.h[3] + smlal2 v5.4s, v21.8h, v1.h[3] + smlal v2.4s, v21.4h, v1.h[4] + smlal2 v3.4s, v21.8h, v1.h[4] + smlal v4.4s, v22.4h, v1.h[4] + smlal2 v5.4s, v22.8h, v1.h[4] + smlal v2.4s, v22.4h, v1.h[5] + smlal2 v3.4s, v22.8h, v1.h[5] + smlal v4.4s, v23.4h, v1.h[5] + smlal2 v5.4s, v23.8h, v1.h[5] + smlal v2.4s, v23.4h, v1.h[6] + smlal2 v3.4s, v23.8h, v1.h[6] + smlal v4.4s, v24.4h, v1.h[6] + smlal2 v5.4s, v24.8h, v1.h[6] +.else // 8tap smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] - bl L(\type\()_8tap_filter_8) + bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] @@ -2554,6 +2656,7 @@ L(\type\()_8tap_filter_4): smlal2 v3.4s, v23.8h, v1.h[7] smlal v4.4s, v24.4h, v1.h[7] smlal2 v5.4s, v24.8h, v1.h[7] +.endif .ifc \type, put srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) @@ -2577,8 +2680,10 @@ L(\type\()_8tap_filter_4): st1 {v2.8h}, [\dst], \d_strd st1 {v3.8h}, [\ds2], \d_strd b.le 9f +.ifc \taps, 8tap mov v16.16b, v18.16b mov v17.16b, v19.16b +.endif mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b @@ -2596,13 +2701,32 @@ L(\type\()_8tap_filter_4): mov \h, \my add \src, \src, #16 add \dst, \dst, #16 +.ifc \taps, 6tap + add \src, \src, \s_strd, lsl #1 +.endif b 168b 0: ret x15 -L(\type\()_8tap_filter_8): +L(\type\()_\taps\()_filter_8): ld1 {v4.8h, v5.8h}, [\sr2], \s_strd ld1 {v6.8h, v7.8h}, [\src], \s_strd +.ifc \taps, 6tap + ext v23.16b, v4.16b, v5.16b, #2 + ext v24.16b, v6.16b, v7.16b, #2 + smull v25.4s, v23.4h, v0.h[1] + smull2 v26.4s, v23.8h, v0.h[1] + smull v27.4s, v24.4h, v0.h[1] + smull2 v28.4s, v24.8h, v0.h[1] +.irpc i, 23456 + ext v23.16b, v4.16b, v5.16b, #(2*\i) + ext v24.16b, v6.16b, v7.16b, #(2*\i) + smlal v25.4s, v23.4h, v0.h[\i] + smlal2 v26.4s, v23.8h, v0.h[\i] + smlal v27.4s, v24.4h, v0.h[\i] + smlal2 v28.4s, v24.8h, v0.h[\i] +.endr +.else // 8tap smull v25.4s, v4.4h, v0.h[0] smull2 v26.4s, v4.8h, v0.h[0] smull v27.4s, v6.4h, v0.h[0] @@ -2615,6 +2739,7 @@ L(\type\()_8tap_filter_8): smlal v27.4s, v24.4h, v0.h[\i] smlal2 v28.4s, v24.8h, v0.h[\i] .endr +.endif srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits) srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits) @@ -2623,18 +2748,20 @@ L(\type\()_8tap_filter_8): uzp1 v24.8h, v27.8h, v28.8h // Ditto ret -L(\type\()_8tap_hv_tbl): - .hword L(\type\()_8tap_hv_tbl) - 1280b - .hword L(\type\()_8tap_hv_tbl) - 640b - .hword L(\type\()_8tap_hv_tbl) - 320b - .hword L(\type\()_8tap_hv_tbl) - 160b - .hword L(\type\()_8tap_hv_tbl) - 80b - .hword L(\type\()_8tap_hv_tbl) - 40b - .hword 
L(\type\()_8tap_hv_tbl) - 20b +L(\type\()_\taps\()_hv_tbl): + .hword L(\type\()_\taps\()_hv_tbl) - 1280b + .hword L(\type\()_\taps\()_hv_tbl) - 640b + .hword L(\type\()_\taps\()_hv_tbl) - 320b + .hword L(\type\()_\taps\()_hv_tbl) - 160b + .hword L(\type\()_\taps\()_hv_tbl) - 80b + .hword L(\type\()_\taps\()_hv_tbl) - 40b + .hword L(\type\()_\taps\()_hv_tbl) - 20b .hword 0 endfunc +.endm +.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2 function \type\()_bilin_16bpc_neon, export=1 .ifc \bdmax, w8 ldr w8, [sp] @@ -3236,8 +3363,34 @@ L(\type\()_bilin_hv_tbl): endfunc .endm -filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10 -filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10 +make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn put, sharp, SHARP, SHARP, 8tap +make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap + +make_8tap_fn put, regular, REGULAR, REGULAR, 6tap +make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap +filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10 + +make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap +make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap +make_8tap_fn prep, sharp, SHARP, SHARP, 8tap +make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap +make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap +filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap + +make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap +make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap +make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap +make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap +filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap +filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10 + .macro load_filter_row dst, src, inc asr w13, \src, #10 diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S index 3a6cf900a9..7bef9243fb 100644 --- a/third_party/dav1d/src/arm/64/msac.S +++ b/third_party/dav1d/src/arm/64/msac.S @@ -208,60 +208,66 @@ L(renorm): sub w4, w4, w3 // rng = u - v clz w5, w4 // clz(rng) eor w5, w5, #16 // d = clz(rng) ^ 16 - mvn x7, x7 // ~dif - add x7, x7, x3, lsl #48 // ~dif + (v << 48) + sub x7, x7, x3, lsl #48 // dif - (v << 48) L(renorm2): lsl w4, w4, w5 // rng << d subs w6, w6, w5 // cnt -= d - lsl x7, x7, x5 // (~dif + (v << 48)) << d + lsl x7, x7, x5 // (dif - (v << 48)) << d str w4, [x0, #RNG] - mvn x7, x7 // ~dif - b.hs 9f + b.hs 4f // refill ldp x3, x4, [x0] // BUF_POS, BUF_END add x5, x3, #8 - cmp x5, x4 - b.gt 2f - - ldr x3, [x3] // next_bits - add w8, w6, #23 // shift_bits = cnt + 23 - add w6, w6, #16 // cnt += 16 - rev x3, x3 // next_bits = bswap(next_bits) - sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 - and w8, w8, #24 // shift_bits &= 24 - lsr x3, x3, x8 // next_bits >>= shift_bits - sub w8, w8, w6 // shift_bits -= 16 + cnt - str x5, [x0, #BUF_POS] - lsl x3, x3, x8 // next_bits <<= shift_bits - mov w4, #48 - sub w6, w4, w8 // cnt = cnt + 64 - shift_bits - eor x7, x7, x3 // dif ^= next_bits - b 9f - -2: // refill_eob - mov w14, #40 - sub w5, w14, w6 // c = 40 - cnt -3: - cmp 
x3, x4 - b.ge 4f - ldrb w8, [x3], #1 - lsl x8, x8, x5 - eor x7, x7, x8 - subs w5, w5, #8 - b.ge 3b - -4: // refill_eob_end + subs x5, x5, x4 + b.hi 6f + + ldr x8, [x3] // next_bits + add w4, w6, #-48 // shift_bits = cnt + 16 (- 64) + mvn x8, x8 + neg w5, w4 + rev x8, x8 // next_bits = bswap(next_bits) + lsr w5, w5, #3 // num_bytes_read + lsr x8, x8, x4 // next_bits >>= (shift_bits & 63) + +2: // refill_end + add x3, x3, x5 + add w6, w6, w5, lsl #3 // cnt += num_bits_read str x3, [x0, #BUF_POS] - sub w6, w14, w5 // cnt = 40 - c -9: +3: // refill_end2 + orr x7, x7, x8 // dif |= next_bits + +4: // end str w6, [x0, #CNT] str x7, [x0, #DIF] mov w0, w15 add sp, sp, #48 ret + +5: // pad_with_ones + add w8, w6, #-16 + ror x8, x8, x8 + b 3b + +6: // refill_eob + cmp x3, x4 + b.hs 5b + + ldr x8, [x4, #-8] + lsl w5, w5, #3 + lsr x8, x8, x5 + add w5, w6, #-48 + mvn x8, x8 + sub w4, w4, w3 // num_bytes_left + rev x8, x8 + lsr x8, x8, x5 + neg w5, w5 + lsr w5, w5, #3 + cmp w5, w4 + csel w5, w5, w4, lo // num_bytes_read + b 2b endfunc function msac_decode_symbol_adapt8_neon, export=1 @@ -334,54 +340,37 @@ function msac_decode_hi_tok_neon, export=1 sub w4, w4, w3 // rng = u - v clz w5, w4 // clz(rng) eor w5, w5, #16 // d = clz(rng) ^ 16 - mvn x7, x7 // ~dif - add x7, x7, x3, lsl #48 // ~dif + (v << 48) + sub x7, x7, x3, lsl #48 // dif - (v << 48) lsl w4, w4, w5 // rng << d subs w6, w6, w5 // cnt -= d - lsl x7, x7, x5 // (~dif + (v << 48)) << d + lsl x7, x7, x5 // (dif - (v << 48)) << d str w4, [x0, #RNG] dup v3.4h, w4 - mvn x7, x7 // ~dif - b.hs 9f + b.hs 5f // refill ldp x3, x4, [x0] // BUF_POS, BUF_END add x5, x3, #8 - cmp x5, x4 - b.gt 2f - - ldr x3, [x3] // next_bits - add w8, w6, #23 // shift_bits = cnt + 23 - add w6, w6, #16 // cnt += 16 - rev x3, x3 // next_bits = bswap(next_bits) - sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 - and w8, w8, #24 // shift_bits &= 24 - lsr x3, x3, x8 // next_bits >>= shift_bits - sub w8, w8, w6 // shift_bits -= 16 + cnt - str x5, [x0, #BUF_POS] - lsl x3, x3, x8 // next_bits <<= shift_bits - mov w4, #48 - sub w6, w4, w8 // cnt = cnt + 64 - shift_bits - eor x7, x7, x3 // dif ^= next_bits - b 9f - -2: // refill_eob - mov w14, #40 - sub w5, w14, w6 // c = 40 - cnt -3: - cmp x3, x4 - b.ge 4f - ldrb w8, [x3], #1 - lsl x8, x8, x5 - eor x7, x7, x8 - subs w5, w5, #8 - b.ge 3b - -4: // refill_eob_end + subs x5, x5, x4 + b.hi 7f + + ldr x8, [x3] // next_bits + add w4, w6, #-48 // shift_bits = cnt + 16 (- 64) + mvn x8, x8 + neg w5, w4 + rev x8, x8 // next_bits = bswap(next_bits) + lsr w5, w5, #3 // num_bytes_read + lsr x8, x8, x4 // next_bits >>= (shift_bits & 63) + +3: // refill_end + add x3, x3, x5 + add w6, w6, w5, lsl #3 // cnt += num_bits_read str x3, [x0, #BUF_POS] - sub w6, w14, w5 // cnt = 40 - c -9: +4: // refill_end2 + orr x7, x7, x8 // dif |= next_bits + +5: // end lsl w15, w15, #1 sub w15, w15, #5 lsr x12, x7, #48 @@ -394,6 +383,29 @@ function msac_decode_hi_tok_neon, export=1 str x7, [x0, #DIF] lsr w0, w13, #1 ret + +6: // pad_with_ones + add w8, w6, #-16 + ror x8, x8, x8 + b 4b + +7: // refill_eob + cmp x3, x4 + b.hs 6b + + ldr x8, [x4, #-8] + lsl w5, w5, #3 + lsr x8, x8, x5 + add w5, w6, #-48 + mvn x8, x8 + sub w4, w4, w3 // num_bytes_left + rev x8, x8 + lsr x8, x8, x5 + neg w5, w5 + lsr w5, w5, #3 + cmp w5, w4 + csel w5, w5, w4, lo // num_bytes_read + b 3b endfunc function msac_decode_bool_equi_neon, export=1 @@ -410,7 +422,6 @@ function msac_decode_bool_equi_neon, export=1 csel x7, x8, x7, hs // if (ret) dif = dif - vw; clz w5, w4 // clz(rng) - mvn x7, x7 
// ~dif eor w5, w5, #16 // d = clz(rng) ^ 16 b L(renorm2) endfunc @@ -431,7 +442,6 @@ function msac_decode_bool_neon, export=1 csel x7, x8, x7, hs // if (ret) dif = dif - vw; clz w5, w4 // clz(rng) - mvn x7, x7 // ~dif eor w5, w5, #16 // d = clz(rng) ^ 16 b L(renorm2) endfunc @@ -455,7 +465,6 @@ function msac_decode_bool_adapt_neon, export=1 ldr w10, [x0, #ALLOW_UPDATE_CDF] clz w5, w4 // clz(rng) - mvn x7, x7 // ~dif eor w5, w5, #16 // d = clz(rng) ^ 16 cbz w10, L(renorm2) diff --git a/third_party/dav1d/src/arm/64/util.S b/third_party/dav1d/src/arm/64/util.S index 9013fd4b1e..1b3f319ce5 100644 --- a/third_party/dav1d/src/arm/64/util.S +++ b/third_party/dav1d/src/arm/64/util.S @@ -32,6 +32,10 @@ #include "config.h" #include "src/arm/asm.S" +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + .macro movrel rd, val, offset=0 #if defined(__APPLE__) .if \offset < 0 @@ -51,6 +55,10 @@ adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) .endif +#elif __has_feature(hwaddress_sanitizer) + adrp \rd, :pg_hi21_nc:\val+(\offset) + movk \rd, #:prel_g3:\val+0x100000000 + add \rd, \rd, :lo12:\val+(\offset) #elif defined(PIC) adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) @@ -149,6 +157,35 @@ trn2 \r7\().2d, \t9\().2d, \r7\().2d .endm +.macro transpose_8x8h_mov r0, r1, r2, r3, r4, r5, r6, r7, t8, t9, o0, o1, o2, o3, o4, o5, o6, o7 + trn1 \t8\().8h, \r0\().8h, \r1\().8h + trn2 \t9\().8h, \r0\().8h, \r1\().8h + trn1 \r1\().8h, \r2\().8h, \r3\().8h + trn2 \r3\().8h, \r2\().8h, \r3\().8h + trn1 \r0\().8h, \r4\().8h, \r5\().8h + trn2 \r5\().8h, \r4\().8h, \r5\().8h + trn1 \r2\().8h, \r6\().8h, \r7\().8h + trn2 \r7\().8h, \r6\().8h, \r7\().8h + + trn1 \r4\().4s, \r0\().4s, \r2\().4s + trn2 \r2\().4s, \r0\().4s, \r2\().4s + trn1 \r6\().4s, \r5\().4s, \r7\().4s + trn2 \r7\().4s, \r5\().4s, \r7\().4s + trn1 \r5\().4s, \t9\().4s, \r3\().4s + trn2 \t9\().4s, \t9\().4s, \r3\().4s + trn1 \r3\().4s, \t8\().4s, \r1\().4s + trn2 \t8\().4s, \t8\().4s, \r1\().4s + + trn1 \o0\().2d, \r3\().2d, \r4\().2d + trn2 \o4\().2d, \r3\().2d, \r4\().2d + trn1 \o1\().2d, \r5\().2d, \r6\().2d + trn2 \o5\().2d, \r5\().2d, \r6\().2d + trn2 \o6\().2d, \t8\().2d, \r2\().2d + trn1 \o2\().2d, \t8\().2d, \r2\().2d + trn1 \o3\().2d, \t9\().2d, \r7\().2d + trn2 \o7\().2d, \t9\().2d, \r7\().2d +.endm + .macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 trn1 \t8\().16b, \r0\().16b, \r1\().16b trn2 \t9\().16b, \r0\().16b, \r1\().16b @@ -226,4 +263,16 @@ trn2 \r3\().4s, \t5\().4s, \t7\().4s .endm +.macro transpose_4x8h_mov r0, r1, r2, r3, t4, t5, t6, t7, o0, o1, o2, o3 + trn1 \t4\().8h, \r0\().8h, \r1\().8h + trn2 \t5\().8h, \r0\().8h, \r1\().8h + trn1 \t6\().8h, \r2\().8h, \r3\().8h + trn2 \t7\().8h, \r2\().8h, \r3\().8h + + trn1 \o0\().4s, \t4\().4s, \t6\().4s + trn2 \o2\().4s, \t4\().4s, \t6\().4s + trn1 \o1\().4s, \t5\().4s, \t7\().4s + trn2 \o3\().4s, \t5\().4s, \t7\().4s +.endm + #endif /* DAV1D_SRC_ARM_64_UTIL_S */ diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S index dc50415f1f..fed73b3048 100644 --- a/third_party/dav1d/src/arm/asm.S +++ b/third_party/dav1d/src/arm/asm.S @@ -34,6 +34,50 @@ #define x18 do_not_use_x18 #define w18 do_not_use_w18 +#if HAVE_AS_ARCH_DIRECTIVE + .arch AS_ARCH_LEVEL +#endif + +#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE +#define ENABLE_DOTPROD .arch_extension dotprod +#define DISABLE_DOTPROD .arch_extension nodotprod +#else +#define ENABLE_DOTPROD +#define DISABLE_DOTPROD +#endif +#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE +#define ENABLE_I8MM .arch_extension 
i8mm +#define DISABLE_I8MM .arch_extension noi8mm +#else +#define ENABLE_I8MM +#define DISABLE_I8MM +#endif +#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE +#define ENABLE_SVE .arch_extension sve +#define DISABLE_SVE .arch_extension nosve +#else +#define ENABLE_SVE +#define DISABLE_SVE +#endif +#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE +#define ENABLE_SVE2 .arch_extension sve2 +#define DISABLE_SVE2 .arch_extension nosve2 +#else +#define ENABLE_SVE2 +#define DISABLE_SVE2 +#endif + +/* If we do support the .arch_extension directives, disable support for all + * the extensions that we may use, in case they were implicitly enabled by + * the .arch level. This makes it clear if we try to assemble an instruction + * from an unintended extension set; we only allow assmbling such instructions + * within regions where we explicitly enable those extensions. */ +DISABLE_DOTPROD +DISABLE_I8MM +DISABLE_SVE +DISABLE_SVE2 + + /* Support macros for * - Armv8.3-A Pointer Authentication and * - Armv8.5-A Branch Target Identification diff --git a/third_party/dav1d/src/arm/cpu.c b/third_party/dav1d/src/arm/cpu.c index b7a0d3adbc..d9b1751a6a 100644 --- a/third_party/dav1d/src/arm/cpu.c +++ b/third_party/dav1d/src/arm/cpu.c @@ -31,22 +31,95 @@ #include "src/arm/cpu.h" -#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 -// NEON is always available; runtime tests are not needed. -#elif defined(HAVE_GETAUXVAL) && ARCH_ARM +#if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO) #include <sys/auxv.h> +#if ARCH_AARCH64 + +#define HWCAP_AARCH64_ASIMDDP (1 << 20) +#define HWCAP_AARCH64_SVE (1 << 22) +#define HWCAP2_AARCH64_SVE2 (1 << 1) +#define HWCAP2_AARCH64_I8MM (1 << 13) + +COLD unsigned dav1d_get_cpu_flags_arm(void) { +#ifdef HAVE_GETAUXVAL + unsigned long hw_cap = getauxval(AT_HWCAP); + unsigned long hw_cap2 = getauxval(AT_HWCAP2); +#else + unsigned long hw_cap = 0; + unsigned long hw_cap2 = 0; + elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)); + elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2)); +#endif + + unsigned flags = DAV1D_ARM_CPU_FLAG_NEON; + flags |= (hw_cap & HWCAP_AARCH64_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0; + flags |= (hw_cap2 & HWCAP2_AARCH64_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0; + flags |= (hw_cap & HWCAP_AARCH64_SVE) ? DAV1D_ARM_CPU_FLAG_SVE : 0; + flags |= (hw_cap2 & HWCAP2_AARCH64_SVE2) ? DAV1D_ARM_CPU_FLAG_SVE2 : 0; + return flags; +} +#else /* !ARCH_AARCH64 */ + #ifndef HWCAP_ARM_NEON -#define HWCAP_ARM_NEON (1 << 12) +#define HWCAP_ARM_NEON (1 << 12) #endif -#define NEON_HWCAP HWCAP_ARM_NEON +#define HWCAP_ARM_ASIMDDP (1 << 24) +#define HWCAP_ARM_I8MM (1 << 27) -#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM -#include <sys/auxv.h> +COLD unsigned dav1d_get_cpu_flags_arm(void) { +#ifdef HAVE_GETAUXVAL + unsigned long hw_cap = getauxval(AT_HWCAP); +#else + unsigned long hw_cap = 0; + elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)); +#endif + + unsigned flags = (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0; + flags |= (hw_cap & HWCAP_ARM_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0; + flags |= (hw_cap & HWCAP_ARM_I8MM) ? 
DAV1D_ARM_CPU_FLAG_I8MM : 0; + return flags; +} +#endif /* ARCH_AARCH64 */ + +#elif defined(__APPLE__) +#include <sys/sysctl.h> + +static int have_feature(const char *feature) { + int supported = 0; + size_t size = sizeof(supported); + if (sysctlbyname(feature, &supported, &size, NULL, 0) != 0) { + return 0; + } + return supported; +} + +COLD unsigned dav1d_get_cpu_flags_arm(void) { + unsigned flags = DAV1D_ARM_CPU_FLAG_NEON; + if (have_feature("hw.optional.arm.FEAT_DotProd")) + flags |= DAV1D_ARM_CPU_FLAG_DOTPROD; + if (have_feature("hw.optional.arm.FEAT_I8MM")) + flags |= DAV1D_ARM_CPU_FLAG_I8MM; + /* No SVE and SVE2 feature detection available on Apple platforms. */ + return flags; +} + +#elif defined(_WIN32) +#include <windows.h> -#define NEON_HWCAP HWCAP_NEON +COLD unsigned dav1d_get_cpu_flags_arm(void) { + unsigned flags = DAV1D_ARM_CPU_FLAG_NEON; +#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE + if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) + flags |= DAV1D_ARM_CPU_FLAG_DOTPROD; +#endif + /* No I8MM or SVE feature detection available on Windows at the time of + * writing. */ + return flags; +} #elif defined(__ANDROID__) +#include <ctype.h> #include <stdio.h> #include <string.h> @@ -58,18 +131,25 @@ static unsigned parse_proc_cpuinfo(const char *flag) { char line_buffer[120]; const char *line; + size_t flaglen = strlen(flag); while ((line = fgets(line_buffer, sizeof(line_buffer), file))) { - if (strstr(line, flag)) { - fclose(file); - return 1; + // check all occurances as whole words + const char *found = line; + while ((found = strstr(found, flag))) { + if ((found == line_buffer || !isgraph(found[-1])) && + (isspace(found[flaglen]) || feof(file))) { + fclose(file); + return 1; + } + found += flaglen; } // if line is incomplete seek back to avoid splitting the search // string into two buffers - if (!strchr(line, '\n') && strlen(line) > strlen(flag)) { + if (!strchr(line, '\n') && strlen(line) > flaglen) { // use fseek since the 64 bit fseeko is only available since // Android API level 24 and meson defines _FILE_OFFSET_BITS // by default 64 - if (fseek(file, -strlen(flag), SEEK_CUR)) + if (fseek(file, -flaglen, SEEK_CUR)) break; } } @@ -78,22 +158,23 @@ static unsigned parse_proc_cpuinfo(const char *flag) { return 0; } -#endif COLD unsigned dav1d_get_cpu_flags_arm(void) { - unsigned flags = 0; -#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 - flags |= DAV1D_ARM_CPU_FLAG_NEON; -#elif defined(HAVE_GETAUXVAL) && ARCH_ARM - unsigned long hw_cap = getauxval(AT_HWCAP); - flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0; -#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM - unsigned long hw_cap = 0; - elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)); - flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0; -#elif defined(__ANDROID__) - flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0; -#endif - + unsigned flags = parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0; + flags |= parse_proc_cpuinfo("asimd") ? DAV1D_ARM_CPU_FLAG_NEON : 0; + flags |= parse_proc_cpuinfo("asimddp") ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0; + flags |= parse_proc_cpuinfo("i8mm") ? DAV1D_ARM_CPU_FLAG_I8MM : 0; +#if ARCH_AARCH64 + flags |= parse_proc_cpuinfo("sve") ? DAV1D_ARM_CPU_FLAG_SVE : 0; + flags |= parse_proc_cpuinfo("sve2") ? 
DAV1D_ARM_CPU_FLAG_SVE2 : 0; +#endif /* ARCH_AARCH64 */ return flags; } + +#else /* Unsupported OS */ + +COLD unsigned dav1d_get_cpu_flags_arm(void) { + return 0; +} + +#endif diff --git a/third_party/dav1d/src/arm/cpu.h b/third_party/dav1d/src/arm/cpu.h index 8c10a1b6b0..de9bde6ccf 100644 --- a/third_party/dav1d/src/arm/cpu.h +++ b/third_party/dav1d/src/arm/cpu.h @@ -30,6 +30,10 @@ enum CpuFlags { DAV1D_ARM_CPU_FLAG_NEON = 1 << 0, + DAV1D_ARM_CPU_FLAG_DOTPROD = 1 << 1, + DAV1D_ARM_CPU_FLAG_I8MM = 1 << 2, + DAV1D_ARM_CPU_FLAG_SVE = 1 << 3, + DAV1D_ARM_CPU_FLAG_SVE2 = 1 << 4, }; unsigned dav1d_get_cpu_flags_arm(void); diff --git a/third_party/dav1d/src/arm/itx.h b/third_party/dav1d/src/arm/itx.h index 2ecd086b3b..17234e027a 100644 --- a/third_party/dav1d/src/arm/itx.h +++ b/third_party/dav1d/src/arm/itx.h @@ -117,9 +117,11 @@ static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + assign_itx_fn( , 4, 4, wht_wht, WHT_WHT, neon); + if (BITDEPTH == 16 && bpc != 10) return; - assign_itx17_fn( , 4, 4, neon); + assign_itx16_fn( , 4, 4, neon); assign_itx16_fn(R, 4, 8, neon); assign_itx16_fn(R, 4, 16, neon); assign_itx16_fn(R, 8, 4, neon); diff --git a/third_party/dav1d/src/arm/msac.h b/third_party/dav1d/src/arm/msac.h index 9db0bf86ae..6eee0da424 100644 --- a/third_party/dav1d/src/arm/msac.h +++ b/third_party/dav1d/src/arm/msac.h @@ -39,7 +39,7 @@ unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf); unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s); unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f); -#if ARCH_AARCH64 || defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 #define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon #define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon #define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon diff --git a/third_party/dav1d/src/cpu.h b/third_party/dav1d/src/cpu.h index c9009c7778..d20c5f0168 100644 --- a/third_party/dav1d/src/cpu.h +++ b/third_party/dav1d/src/cpu.h @@ -64,6 +64,20 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) { #if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 flags |= DAV1D_ARM_CPU_FLAG_NEON; #endif +#ifdef __ARM_FEATURE_DOTPROD + flags |= DAV1D_ARM_CPU_FLAG_DOTPROD; +#endif +#ifdef __ARM_FEATURE_MATMUL_INT8 + flags |= DAV1D_ARM_CPU_FLAG_I8MM; +#endif +#if ARCH_AARCH64 +#ifdef __ARM_FEATURE_SVE + flags |= DAV1D_ARM_CPU_FLAG_SVE; +#endif +#ifdef __ARM_FEATURE_SVE2 + flags |= DAV1D_ARM_CPU_FLAG_SVE2; +#endif +#endif /* ARCH_AARCH64 */ #elif ARCH_PPC64LE #if defined(__VSX__) flags |= DAV1D_PPC_CPU_FLAG_VSX; diff --git a/third_party/dav1d/src/ext/x86/x86inc.asm b/third_party/dav1d/src/ext/x86/x86inc.asm index 68b1f74f4b..d2bd758e67 100644 --- a/third_party/dav1d/src/ext/x86/x86inc.asm +++ b/third_party/dav1d/src/ext/x86/x86inc.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* x86inc.asm: x86 abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2022 x264 project +;* Copyright (C) 2005-2024 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Henrik Gramner <henrik@gramner.com> @@ -104,7 +104,7 @@ %endif %define HAVE_PRIVATE_EXTERN 1 -%ifdef __NASM_VER__ +%ifdef __NASM_VERSION_ID__ %use smartalign %if __NASM_VERSION_ID__ < 0x020e0000 ; 
2.14 %define HAVE_PRIVATE_EXTERN 0 @@ -386,7 +386,24 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %endif %endmacro -%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only) +%macro RESET_STACK_STATE 0 + %ifidn rstk, rsp + %assign stack_offset stack_offset - stack_size_padded + %else + %xdefine rstk rsp + %endif + %assign stack_size 0 + %assign stack_size_padded 0 + %assign xmm_regs_used 0 +%endmacro + +%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs + RESET_STACK_STATE + %ifnum %2 + %if mmsize != 8 + %assign xmm_regs_used %2 + %endif + %endif %ifnum %1 %if %1 != 0 %assign %%pad 0 @@ -396,11 +413,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %endif %if WIN64 %assign %%pad %%pad + 32 ; shadow space - %if mmsize != 8 - %assign xmm_regs_used %2 - %if xmm_regs_used > 8 - %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers - %endif + %if xmm_regs_used > 8 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers %endif %endif %if required_stack_alignment <= STACK_ALIGNMENT @@ -496,35 +510,62 @@ DECLARE_REG 14, R13, 120 %endif %endmacro -%macro WIN64_PUSH_XMM 0 - ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. - %if xmm_regs_used > 6 + high_mm_regs - movaps [rstk + stack_offset + 8], xmm6 - %endif - %if xmm_regs_used > 7 + high_mm_regs - movaps [rstk + stack_offset + 24], xmm7 - %endif - %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 - %if %%xmm_regs_on_stack > 0 - %assign %%i 8 - %rep %%xmm_regs_on_stack - movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i - %assign %%i %%i+1 - %endrep +; Push XMM registers to the stack. If no argument is specified all used register +; will be pushed, otherwise only push previously unpushed registers. +%macro WIN64_PUSH_XMM 0-2 ; new_xmm_regs_used, xmm_regs_pushed + %if mmsize != 8 + %if %0 == 2 + %assign %%pushed %2 + %assign xmm_regs_used %1 + %elif %0 == 1 + %assign %%pushed xmm_regs_used + %assign xmm_regs_used %1 + %else + %assign %%pushed 0 + %endif + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. + %if %%pushed <= 6 + high_mm_regs && xmm_regs_used > 6 + high_mm_regs + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if %%pushed <= 7 + high_mm_regs && xmm_regs_used > 7 + high_mm_regs + movaps [rstk + stack_offset + 24], xmm7 + %endif + %assign %%pushed %%pushed - high_mm_regs - 8 + %if %%pushed < 0 + %assign %%pushed 0 + %endif + %assign %%regs_to_push xmm_regs_used - %%pushed - high_mm_regs - 8 + %if %%regs_to_push > 0 + ASSERT (%%regs_to_push + %%pushed) * 16 <= stack_size_padded - stack_size - 32 + %assign %%i %%pushed + 8 + %rep %%regs_to_push + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif %endif %endmacro -%macro WIN64_SPILL_XMM 1 - %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 + high_mm_regs - %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 - %if %%xmm_regs_on_stack > 0 - ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
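Editor's note on the Win64 convention this macro targets: xmm6 and xmm7 are stored in the caller's 32-byte shadow space, and each additional callee-saved xmm register needs 16 freshly allocated stack bytes. A rough C model of the byte count, with the high_mm_regs (zmm16+) adjustment and the exact stack_offset-based alignment simplified away; not part of x86inc:

static unsigned win64_xmm_spill_bytes(int xmm_regs_used, unsigned stack_alignment)
{
    const int on_stack = xmm_regs_used - 8;  /* xmm6/xmm7 reuse the shadow space */
    if (on_stack <= 0)
        return 0;
    const unsigned pad = (unsigned)on_stack * 16 + 32;  /* registers + shadow space */
    /* the real macro aligns rsp taking stack_offset and the return address into
     * account; rounding pad itself up is a simplification */
    return (pad + stack_alignment - 1) & ~(stack_alignment - 1);
}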
- %assign %%pad %%xmm_regs_on_stack*16 + 32 - %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) - SUB rsp, stack_size_padded +; Allocated stack space for XMM registers and push all, or a subset, of those +%macro WIN64_SPILL_XMM 1-2 ; xmm_regs_used, xmm_regs_reserved + RESET_STACK_STATE + %if mmsize != 8 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + high_mm_regs + %if %0 == 2 + ASSERT %2 >= %1 + %assign %%xmm_regs_on_stack %2 - high_mm_regs - 8 + %else + %assign %%xmm_regs_on_stack %1 - high_mm_regs - 8 + %endif + %if %%xmm_regs_on_stack > 0 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. + %assign %%pad %%xmm_regs_on_stack*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM %endif - WIN64_PUSH_XMM %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 0 @@ -555,9 +596,7 @@ DECLARE_REG 14, R13, 120 %macro WIN64_RESTORE_XMM 0 WIN64_RESTORE_XMM_INTERNAL - %assign stack_offset (stack_offset-stack_size_padded) - %assign stack_size_padded 0 - %assign xmm_regs_used 0 + RESET_STACK_STATE %endmacro %define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs @@ -592,12 +631,11 @@ DECLARE_REG 14, R13, 72 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 - %assign xmm_regs_used %3 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4 + ALLOC_STACK %4, %3 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 %if %0 > 4 %ifnum %4 @@ -661,7 +699,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 SETUP_STACK_POINTER %4 ASSERT regs_used <= 7 PUSH_IF_USED 3, 4, 5, 6 - ALLOC_STACK %4 + ALLOC_STACK %4, %3 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 %if %0 > 4 %ifnum %4 @@ -694,13 +732,19 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %endif ;====================================================================== %if WIN64 == 0 - %macro WIN64_SPILL_XMM 1 - %assign xmm_regs_used %1 + %macro WIN64_SPILL_XMM 1-2 + RESET_STACK_STATE + %if mmsize != 8 + %assign xmm_regs_used %1 + %endif %endmacro %macro WIN64_RESTORE_XMM 0 - %assign xmm_regs_used 0 + RESET_STACK_STATE %endmacro - %macro WIN64_PUSH_XMM 0 + %macro WIN64_PUSH_XMM 0-2 + %if mmsize != 8 && %0 >= 1 + %assign xmm_regs_used %1 + %endif %endmacro %endif @@ -845,9 +889,26 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %1: %2 %endmacro -; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. %if FORMAT_ELF + ; The GNU linker assumes the stack is executable by default. [SECTION .note.GNU-stack noalloc noexec nowrite progbits] + + %ifdef __NASM_VERSION_ID__ + %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03 + %if ARCH_X86_64 + ; Control-flow Enforcement Technology (CET) properties. + [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize] + dd 0x00000004 ; n_namesz + dd gprsize + 8 ; n_descsz + dd 0x00000005 ; n_type = NT_GNU_PROPERTY_TYPE_0 + db "GNU",0 ; n_name + dd 0xc0000002 ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND + dd 0x00000004 ; pr_datasz + dd 0x00000002 ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK + dd 0x00000000 ; pr_padding + %endif + %endif + %endif %endif ; Tell debuggers how large the function was. 
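Editor's note: the .note.gnu.property section added in the hunk above can be read as a plain ELF note. A C struct mirroring its layout on x86-64 (gprsize == 8, hence n_descsz == 16); field names are illustrative, the constants are those emitted by the directives:

#include <stdint.h>

struct gnu_property_note_sketch {
    uint32_t n_namesz;    /* 4 */
    uint32_t n_descsz;    /* gprsize + 8 == 16 on x86-64 */
    uint32_t n_type;      /* 5: NT_GNU_PROPERTY_TYPE_0 */
    char     n_name[4];   /* "GNU\0" */
    uint32_t pr_type;     /* 0xc0000002: GNU_PROPERTY_X86_FEATURE_1_AND */
    uint32_t pr_datasz;   /* 4 */
    uint32_t pr_data;     /* 2: GNU_PROPERTY_X86_FEATURE_1_SHSTK (IBT not claimed) */
    uint32_t pr_padding;  /* 0: pads the descriptor to 8-byte alignment */
};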
@@ -883,21 +944,22 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %assign cpuflags_sse4 (1<<10) | cpuflags_ssse3 %assign cpuflags_sse42 (1<<11) | cpuflags_sse4 %assign cpuflags_aesni (1<<12) | cpuflags_sse42 -%assign cpuflags_gfni (1<<13) | cpuflags_sse42 -%assign cpuflags_avx (1<<14) | cpuflags_sse42 -%assign cpuflags_xop (1<<15) | cpuflags_avx -%assign cpuflags_fma4 (1<<16) | cpuflags_avx -%assign cpuflags_fma3 (1<<17) | cpuflags_avx -%assign cpuflags_bmi1 (1<<18) | cpuflags_avx|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<19) | cpuflags_bmi1 -%assign cpuflags_avx2 (1<<20) | cpuflags_fma3|cpuflags_bmi2 -%assign cpuflags_avx512 (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL -%assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ - -%assign cpuflags_cache32 (1<<23) -%assign cpuflags_cache64 (1<<24) -%assign cpuflags_aligned (1<<25) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<26) +%assign cpuflags_clmul (1<<13) | cpuflags_sse42 +%assign cpuflags_gfni (1<<14) | cpuflags_aesni|cpuflags_clmul +%assign cpuflags_avx (1<<15) | cpuflags_sse42 +%assign cpuflags_xop (1<<16) | cpuflags_avx +%assign cpuflags_fma4 (1<<17) | cpuflags_avx +%assign cpuflags_fma3 (1<<18) | cpuflags_avx +%assign cpuflags_bmi1 (1<<19) | cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<20) | cpuflags_bmi1 +%assign cpuflags_avx2 (1<<21) | cpuflags_fma3|cpuflags_bmi2 +%assign cpuflags_avx512 (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL +%assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ + +%assign cpuflags_cache32 (1<<24) +%assign cpuflags_cache64 (1<<25) +%assign cpuflags_aligned (1<<26) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<27) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. 
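Editor's note: the cpuflag(x) test defined on the next line is branch-free. A C rendering with an illustrative helper name; it relies on no flag occupying bit 31, which holds for the cpuflags_* values above:

#include <stdint.h>

static uint32_t cpuflag_sketch(uint32_t have, uint32_t wanted)
{
    const uint32_t missing = (have & wanted) ^ wanted; /* 0 iff every wanted bit is set */
    return ((missing - 1) >> 31) & 1;                  /* wraps to 0xffffffff only when missing == 0 */
}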
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) @@ -939,13 +1001,13 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %endif %if ARCH_X86_64 || cpuflag(sse2) - %ifdef __NASM_VER__ + %ifdef __NASM_VERSION_ID__ ALIGNMODE p6 %else CPU amdnop %endif %else - %ifdef __NASM_VER__ + %ifdef __NASM_VERSION_ID__ ALIGNMODE nop %else CPU basicnop @@ -1035,6 +1097,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %if WIN64 AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers %endif + %xdefine bcstw 1to8 %xdefine bcstd 1to4 %xdefine bcstq 1to2 %endmacro @@ -1050,6 +1113,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, INIT_CPUFLAGS %1 DEFINE_MMREGS ymm AVX512_MM_PERMUTATION + %xdefine bcstw 1to16 %xdefine bcstd 1to8 %xdefine bcstq 1to4 %endmacro @@ -1065,6 +1129,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, INIT_CPUFLAGS %1 DEFINE_MMREGS zmm AVX512_MM_PERMUTATION + %xdefine bcstw 1to32 %xdefine bcstd 1to16 %xdefine bcstq 1to8 %endmacro @@ -1607,11 +1672,11 @@ AVX_INSTR pavgb, mmx2, 0, 0, 1 AVX_INSTR pavgw, mmx2, 0, 0, 1 AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR pblendw, sse4, 0, 1, 0 -AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 -AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 -AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 -AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 -AVX_INSTR pclmulqdq, fnord, 0, 1, 0 +AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0 +AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0 +AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0 +AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0 +AVX_INSTR pclmulqdq, clmul, 0, 1, 0 AVX_INSTR pcmpeqb, mmx, 0, 0, 1 AVX_INSTR pcmpeqd, mmx, 0, 0, 1 AVX_INSTR pcmpeqq, sse4, 0, 0, 1 @@ -1766,6 +1831,7 @@ GPR_INSTR blsi, bmi1 GPR_INSTR blsmsk, bmi1 GPR_INSTR blsr, bmi1 GPR_INSTR bzhi, bmi2 +GPR_INSTR crc32, sse42 GPR_INSTR mulx, bmi2 GPR_INSTR pdep, bmi2 GPR_INSTR pext, bmi2 diff --git a/third_party/dav1d/src/itx_1d.c b/third_party/dav1d/src/itx_1d.c index ca14fc8c41..8f75c653af 100644 --- a/third_party/dav1d/src/itx_1d.c +++ b/third_party/dav1d/src/itx_1d.c @@ -1016,6 +1016,10 @@ void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride, c[stride * i] *= 4; } +#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \ + ARCH_AARCH64 || \ + (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \ +)) void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) { assert(stride > 0); const int in0 = c[0 * stride], in1 = c[1 * stride]; @@ -1032,3 +1036,4 @@ void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) { c[2 * stride] = t1; c[3 * stride] = t2 + t1; } +#endif diff --git a/third_party/dav1d/src/itx_tmpl.c b/third_party/dav1d/src/itx_tmpl.c index 8ff245a0de..a226223c96 100644 --- a/third_party/dav1d/src/itx_tmpl.c +++ b/third_party/dav1d/src/itx_tmpl.c @@ -159,6 +159,10 @@ inv_txfm_fn64(64, 16, 2) inv_txfm_fn64(64, 32, 1) inv_txfm_fn64(64, 64, 2) +#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \ + ARCH_AARCH64 || \ + (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \ +)) static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, coef *const coeff, const int eob HIGHBD_DECL_SUFFIX) @@ -179,6 +183,7 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, for (int x = 0; x < 4; x++) dst[x] = iclip_pixel(dst[x] + *c++); } +#endif #if HAVE_ASM #if ARCH_AARCH64 || 
ARCH_ARM @@ -236,7 +241,12 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) { c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \ inv_txfm_add_identity_adst_##w##x##h##_c; \ +#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \ + ARCH_AARCH64 || \ + (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \ +)) c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c; +#endif assign_itx_all_fn84( 4, 4, ); assign_itx_all_fn84( 4, 8, R); assign_itx_all_fn84( 4, 16, R); diff --git a/third_party/dav1d/src/loongarch/msac.S b/third_party/dav1d/src/loongarch/msac.S index c371eba4de..5bf18250a5 100644 --- a/third_party/dav1d/src/loongarch/msac.S +++ b/third_party/dav1d/src/loongarch/msac.S @@ -133,55 +133,58 @@ endconst slli.d t4, t4, 48 vpickve2gr.d t6, vr2, 0 sub.d t6, t6, t4 // dif - addi.d t6, t6, 1 clz.w t4, t5 // d xori t4, t4, 16 // d sll.d t6, t6, t4 - addi.d t6, t6, -1 // dif addi.d a5, a0, 28 // cnt - ld.w t7, a5, 0 - sub.w t7, t7, t4 // cnt-d + ld.w t0, a5, 0 sll.w t5, t5, t4 + sub.w t7, t0, t4 // cnt-d st.w t5, a4, 0 // store rng - bge t7, zero, 9f + bgeu t0, t4, 9f // refill ld.d t0, a0, 0 // buf_pos - addi.d t1, a0, 8 - ld.d t1, t1, 0 // buf_end + ld.d t1, a0, 8 // buf_end addi.d t2, t0, 8 - blt t1, t2, 1f + bltu t1, t2, 2f - ld.d t0, t0, 0 // next_bits - addi.w t3, t7, 23 // shift_bits = cnt + 23 - addi.w t7, t7, 16 // cnt += 16 - revb.d t0, t0 // next_bits = bswap(next_bits) - srli.w t4, t3, 3 - sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3 - st.d t2, a0, 0 - andi t3, t3, 24 // shift_bits &= 24 - srl.d t0, t0, t3 // next_bits >>= shift_bits - sub.w t3, t3, t7 // shift_bits -= 16 + cnt - sll.d t0, t0, t3 // next_bits <<= shift_bits - li.w t5, 48 - sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits - xor t6, t6, t0 // dif ^= next_bits - b 9f + ld.d t3, t0, 0 // next_bits + addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) + nor t3, t3, t3 + sub.w t2, zero, t1 + revb.d t3, t3 // next_bits = bswap(next_bits) + srli.w t2, t2, 3 // num_bytes_read + srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) + b 3f 1: - li.w t4, 40 - sub.w t5, t4, t7 // c = 40 - cnt + addi.w t3, t7, -48 + srl.d t3, t3, t3 // pad with ones + b 4f 2: - bge t0, t1, 3f - ld.bu t2, t0, 0 - addi.d t0, t0, 1 - sll.d t2, t2, t5 - xor t6, t6, t2 - addi.w t5, t5, -8 - bge t5, zero, 2b - // refill_eob_end + bgeu t0, t1, 1b + ld.d t3, t1, -8 // next_bits + sub.w t2, t2, t1 + sub.w t1, t1, t0 // num_bytes_left + slli.w t2, t2, 3 + srl.d t3, t3, t2 + addi.w t2, t7, -48 + nor t3, t3, t3 + sub.w t4, zero, t2 + revb.d t3, t3 + srli.w t4, t4, 3 + srl.d t3, t3, t2 + sltu t2, t1, t4 + maskeqz t1, t1, t2 + masknez t2, t4, t2 + or t2, t2, t1 // num_bytes_read 3: - st.d t0, a0, 0 // s->buf_pos = buf_pos - sub.w t7, t4, t5 // cnt = 40 - c + slli.w t1, t2, 3 + add.d t0, t0, t2 + add.w t7, t7, t1 // cnt += num_bits_read + st.d t0, a0, 0 +4: + or t6, t6, t3 // dif |= next_bits 9: st.w t7, a5, 0 // store cnt st.d t6, a6, 0 // store dif @@ -208,7 +211,6 @@ function msac_decode_bool_lsx srli.w t2, t0, 8 // r >> 8 mul.w t2, t2, a1 ld.w a5, a0, 28 // cnt - addi.d t1, t1, 1 // dif + 1 srli.w t2, t2, 1 addi.w t2, t2, 4 // v slli.d t3, t2, 48 // vw @@ -226,49 +228,53 @@ function msac_decode_bool_lsx clz.w t4, t5 // d xori t4, t4, 16 // d sll.d t6, t6, t4 - addi.d t6, t6, -1 // dif - sub.w t7, a5, t4 // cnt-d sll.w t5, t5, t4 + sub.w t7, a5, t4 // cnt-d st.w t5, a0, 24 // store rng - bge t7, zero, 9f + bgeu a5, t4, 9f // refill ld.d t0, a0, 0 // buf_pos - addi.d t1, a0, 8 - ld.d t1, t1, 0 // buf_end + ld.d t1, 
a0, 8 // buf_end addi.d t2, t0, 8 - blt t1, t2, 1f + bltu t1, t2, 2f - ld.d t0, t0, 0 // next_bits - addi.w t3, t7, 23 // shift_bits = cnt + 23 - addi.w t7, t7, 16 // cnt += 16 - revb.d t0, t0 // next_bits = bswap(next_bits) - srli.w t4, t3, 3 - sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3 - st.d t2, a0, 0 - andi t3, t3, 24 // shift_bits &= 24 - srl.d t0, t0, t3 // next_bits >>= shift_bits - sub.w t3, t3, t7 // shift_bits -= 16 + cnt - sll.d t0, t0, t3 // next_bits <<= shift_bits - li.w t5, 48 - sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits - xor t6, t6, t0 // dif ^= next_bits - b 9f + ld.d t3, t0, 0 // next_bits + addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) + nor t3, t3, t3 + sub.w t2, zero, t1 + revb.d t3, t3 // next_bits = bswap(next_bits) + srli.w t2, t2, 3 // num_bytes_read + srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) + b 3f 1: - li.w t4, 40 - sub.w t5, t4, t7 // c = 40 - cnt + addi.w t3, t7, -48 + srl.d t3, t3, t3 // pad with ones + b 4f 2: - bge t0, t1, 3f - ld.bu t2, t0, 0 - addi.d t0, t0, 1 - sll.d t2, t2, t5 - xor t6, t6, t2 - addi.w t5, t5, -8 - bge t5, zero, 2b - // refill_eob_end + bgeu t0, t1, 1b + ld.d t3, t1, -8 // next_bits + sub.w t2, t2, t1 + sub.w t1, t1, t0 // num_bytes_left + slli.w t2, t2, 3 + srl.d t3, t3, t2 + addi.w t2, t7, -48 + nor t3, t3, t3 + sub.w t4, zero, t2 + revb.d t3, t3 + srli.w t4, t4, 3 + srl.d t3, t3, t2 + sltu t2, t1, t4 + maskeqz t1, t1, t2 + masknez t2, t4, t2 + or t2, t2, t1 // num_bytes_read 3: - st.d t0, a0, 0 // s->buf_pos = buf_pos - sub.w t7, t4, t5 // cnt = 40 - c + slli.w t1, t2, 3 + add.d t0, t0, t2 + add.w t7, t7, t1 // cnt += num_bits_read + st.d t0, a0, 0 +4: + or t6, t6, t3 // dif |= next_bits 9: st.w t7, a0, 28 // store cnt st.d t6, a0, 16 // store dif @@ -313,54 +319,56 @@ function msac_decode_bool_adapt_lsx st.h t0, a1, 2 .renorm: - // renorm - addi.d t6, t6, 1 clz.w t4, t5 // d xori t4, t4, 16 // d sll.d t6, t6, t4 - addi.d t6, t6, -1 // dif - sub.w t7, a5, t4 // cnt-d sll.w t5, t5, t4 + sub.w t7, a5, t4 // cnt-d st.w t5, a0, 24 // store rng - bge t7, zero, 9f + bgeu a5, t4, 9f // refill ld.d t0, a0, 0 // buf_pos - addi.d t1, a0, 8 - ld.d t1, t1, 0 // buf_end + ld.d t1, a0, 8 // buf_end addi.d t2, t0, 8 - blt t1, t2, 1f + bltu t1, t2, 2f - ld.d t0, t0, 0 // next_bits - addi.w t3, t7, 23 // shift_bits = cnt + 23 - addi.w t7, t7, 16 // cnt += 16 - revb.d t0, t0 // next_bits = bswap(next_bits) - srli.w t4, t3, 3 - sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3 - st.d t2, a0, 0 - andi t3, t3, 24 // shift_bits &= 24 - srl.d t0, t0, t3 // next_bits >>= shift_bits - sub.w t3, t3, t7 // shift_bits -= 16 + cnt - sll.d t0, t0, t3 // next_bits <<= shift_bits - li.w t5, 48 - sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits - xor t6, t6, t0 // dif ^= next_bits - b 9f + ld.d t3, t0, 0 // next_bits + addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) + nor t3, t3, t3 + sub.w t2, zero, t1 + revb.d t3, t3 // next_bits = bswap(next_bits) + srli.w t2, t2, 3 // num_bytes_read + srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) + b 3f 1: - li.w t4, 40 - sub.w t5, t4, t7 // c = 40 - cnt + addi.w t3, t7, -48 + srl.d t3, t3, t3 // pad with ones + b 4f 2: - bge t0, t1, 3f - ld.bu t2, t0, 0 - addi.d t0, t0, 1 - sll.d t2, t2, t5 - xor t6, t6, t2 - addi.w t5, t5, -8 - bge t5, zero, 2b - // refill_eob_end + bgeu t0, t1, 1b + ld.d t3, t1, -8 // next_bits + sub.w t2, t2, t1 + sub.w t1, t1, t0 // num_bytes_left + slli.w t2, t2, 3 + srl.d t3, t3, t2 + addi.w t2, t7, -48 + nor t3, t3, t3 + sub.w t4, zero, t2 + revb.d t3, t3 + srli.w t4, t4, 3 
+ srl.d t3, t3, t2 + sltu t2, t1, t4 + maskeqz t1, t1, t2 + masknez t2, t4, t2 + or t2, t2, t1 // num_bytes_read 3: - st.d t0, a0, 0 // s->buf_pos = buf_pos - sub.w t7, t4, t5 // cnt = 40 - c + slli.w t1, t2, 3 + add.d t0, t0, t2 + add.w t7, t7, t1 // cnt += num_bits_read + st.d t0, a0, 0 +4: + or t6, t6, t3 // dif |= next_bits 9: st.w t7, a0, 28 // store cnt st.d t6, a0, 16 // store dif diff --git a/third_party/dav1d/src/msac.c b/third_party/dav1d/src/msac.c index 43d8ae5d07..971ba85e29 100644 --- a/third_party/dav1d/src/msac.c +++ b/third_party/dav1d/src/msac.c @@ -43,15 +43,40 @@ static inline void ctx_refill(MsacContext *const s) { const uint8_t *buf_end = s->buf_end; int c = EC_WIN_SIZE - s->cnt - 24; ec_win dif = s->dif; - while (c >= 0 && buf_pos < buf_end) { - dif ^= ((ec_win)*buf_pos++) << c; + do { + if (buf_pos >= buf_end) { + // set remaining bits to 1; + dif |= ~(~(ec_win)0xff << c); + break; + } + dif |= (ec_win)(*buf_pos++ ^ 0xff) << c; c -= 8; - } + } while (c >= 0); s->dif = dif; s->cnt = EC_WIN_SIZE - c - 24; s->buf_pos = buf_pos; } +int dav1d_msac_decode_subexp(MsacContext *const s, const int ref, + const int n, unsigned k) +{ + assert(n >> k == 8); + + unsigned a = 0; + if (dav1d_msac_decode_bool_equi(s)) { + if (dav1d_msac_decode_bool_equi(s)) + k += dav1d_msac_decode_bool_equi(s) + 1; + a = 1 << k; + } + const unsigned v = dav1d_msac_decode_bools(s, k) + a; + return ref * 2 <= n ? inv_recenter(ref, v) : + n - 1 - inv_recenter(n - 1 - ref, v); +} + +#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \ + ARCH_AARCH64 || \ + (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \ +)) /* Takes updated dif and range values, renormalizes them so that * 32768 <= rng < 65536 (reading more bytes from the stream into dif if * necessary), and stores them back in the decoder context. @@ -61,11 +86,13 @@ static inline void ctx_norm(MsacContext *const s, const ec_win dif, const unsigned rng) { const int d = 15 ^ (31 ^ clz(rng)); + const int cnt = s->cnt; assert(rng <= 65535U); - s->cnt -= d; - s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */ + s->dif = dif << d; s->rng = rng << d; - if (s->cnt < 0) + s->cnt = cnt - d; + // unsigned compare avoids redundant refills at eob + if ((unsigned)cnt < (unsigned)d) ctx_refill(s); } @@ -100,22 +127,6 @@ unsigned dav1d_msac_decode_bool_c(MsacContext *const s, const unsigned f) { return !ret; } -int dav1d_msac_decode_subexp(MsacContext *const s, const int ref, - const int n, unsigned k) -{ - assert(n >> k == 8); - - unsigned a = 0; - if (dav1d_msac_decode_bool_equi(s)) { - if (dav1d_msac_decode_bool_equi(s)) - k += dav1d_msac_decode_bool_equi(s) + 1; - a = 1 << k; - } - const unsigned v = dav1d_msac_decode_bools(s, k) + a; - return ref * 2 <= n ? inv_recenter(ref, v) : - n - 1 - inv_recenter(n - 1 - ref, v); -} - /* Decodes a symbol given an inverse cumulative distribution function (CDF) * table in Q15. 
*/ unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s, @@ -188,13 +199,14 @@ unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf) { } return tok; } +#endif void dav1d_msac_init(MsacContext *const s, const uint8_t *const data, const size_t sz, const int disable_cdf_update_flag) { s->buf_pos = data; s->buf_end = data + sz; - s->dif = ((ec_win)1 << (EC_WIN_SIZE - 1)) - 1; + s->dif = 0; s->rng = 0x8000; s->cnt = -15; s->allow_update_cdf = !disable_cdf_update_flag; diff --git a/third_party/dav1d/src/ppc/cdef_tmpl.c b/third_party/dav1d/src/ppc/cdef_tmpl.c index e2e759810f..6ef87ad448 100644 --- a/third_party/dav1d/src/ppc/cdef_tmpl.c +++ b/third_party/dav1d/src/ppc/cdef_tmpl.c @@ -29,11 +29,10 @@ #if BITDEPTH == 8 static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold, - const int damping) + const uint16_t shift) { const i16x8 zero = vec_splat_s16(0); if (!threshold) return zero; - const uint16_t shift = imax(0, damping - ulog2(threshold)); const i16x8 abs_diff = vec_abs(diff); const b16x8 mask = vec_cmplt(diff, zero); const i16x8 thr = vec_splats(threshold); @@ -44,7 +43,7 @@ static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold, return vec_sel(min, neg, mask); } -static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride, +static inline void copy4xN(uint16_t *tmp, const uint8_t *src, const ptrdiff_t src_stride, const uint8_t (*left)[2], const uint8_t *const top, const uint8_t *const bottom, const int w, const int h, @@ -114,7 +113,7 @@ static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride, } } -static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride, +static inline void copy8xN(uint16_t *tmp, const uint8_t *src, const ptrdiff_t src_stride, const uint8_t (*left)[2], const uint8_t *const top, const uint8_t *const bottom, const int w, const int h, @@ -218,16 +217,12 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) { #define LOAD_PIX(addr) \ const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \ - i16x8 max = px; \ - i16x8 min = px; \ i16x8 sum = vec_splat_s16(0); #define LOAD_PIX4(addr) \ const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \ - const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \ + const i16x8 b = (i16x8)vec_vsx_ld(0, addr + 8); \ const i16x8 px = vec_xxpermdi(a, b, 0); \ - i16x8 max = px; \ - i16x8 min = px; \ i16x8 sum = vec_splat_s16(0); #define LOAD_DIR(p, addr, o0, o1) \ @@ -238,22 +233,26 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) { #define LOAD_DIR4(p, addr, o0, o1) \ LOAD_DIR(p ## a, addr, o0, o1) \ - LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \ + LOAD_DIR(p ## b, addr + 8, o0, o1) \ const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \ const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \ const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \ const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0); -#define CONSTRAIN(p, strength) \ +#define CONSTRAIN(p, strength, shift) \ const i16x8 p ## _d0 = vec_sub(p ## 0, px); \ const i16x8 p ## _d1 = vec_sub(p ## 1, px); \ const i16x8 p ## _d2 = vec_sub(p ## 2, px); \ const i16x8 p ## _d3 = vec_sub(p ## 3, px); \ \ - i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \ - i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \ - i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \ - i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping); + i16x8 p ## _c0 = vconstrain(p ## _d0, strength, shift); \ + i16x8 p ## _c1 = vconstrain(p ## _d1, strength, shift); \ + i16x8 
p ## _c2 = vconstrain(p ## _d2, strength, shift); \ + i16x8 p ## _c3 = vconstrain(p ## _d3, strength, shift); + +#define SETUP_MINMAX \ + i16x8 max = px; \ + i16x8 min = px; \ #define MIN_MAX(p) \ max = max_mask(p ## 0, max); \ @@ -265,19 +264,16 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) { max = max_mask(p ## 3, max); \ min = vec_min(p ## 3, min); -#define PRI_0(p) \ - p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \ - p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even))); +#define MAKE_TAPS \ + const int16_t tap_odd = (pri_strength >> bitdepth_min_8) & 1; \ + const i16x8 tap0 = vec_splats((int16_t)(4 - tap_odd)); \ + const i16x8 tap1 = vec_splats((int16_t)(2 + tap_odd)); -#define PRI_1(p) \ - p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \ - p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even))); - -#define SEC_0(p) \ - p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \ - p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \ - p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \ - p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1)); +#define PRI_0_UPDATE_SUM(p) \ + sum = vec_madd(tap0, p ## _c0, sum); \ + sum = vec_madd(tap0, p ## _c1, sum); \ + sum = vec_madd(tap1, p ## _c2, sum); \ + sum = vec_madd(tap1, p ## _c3, sum); #define UPDATE_SUM(p) \ const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \ @@ -285,92 +281,198 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) { sum = vec_add(sum, p ## sum0); \ sum = vec_add(sum, p ## sum1); +#define SEC_0_UPDATE_SUM(p) \ + sum = vec_madd(vec_splat_s16(2), p ## _c0, sum); \ + sum = vec_madd(vec_splat_s16(2), p ## _c1, sum); \ + sum = vec_madd(vec_splat_s16(2), p ## _c2, sum); \ + sum = vec_madd(vec_splat_s16(2), p ## _c3, sum); + +#define BIAS \ + i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); \ + bias = vec_sub(vec_splat_s16(8), bias); \ + +#define STORE4 \ + dst[0] = vdst[0]; \ + dst[1] = vdst[1]; \ + dst[2] = vdst[2]; \ + dst[3] = vdst[3]; \ +\ + tmp += 8; \ + dst += PXSTRIDE(dst_stride); \ + dst[0] = vdst[4]; \ + dst[1] = vdst[5]; \ + dst[2] = vdst[6]; \ + dst[3] = vdst[7]; \ +\ + tmp += 8; \ + dst += PXSTRIDE(dst_stride); + +#define STORE4_CLAMPED \ + BIAS \ + i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \ + i16x8 vdst = vec_max(vec_min(unclamped, max), min); \ + STORE4 + +#define STORE4_UNCLAMPED \ + BIAS \ + i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \ + STORE4 + +#define STORE8 \ + dst[0] = vdst[0]; \ + dst[1] = vdst[1]; \ + dst[2] = vdst[2]; \ + dst[3] = vdst[3]; \ + dst[4] = vdst[4]; \ + dst[5] = vdst[5]; \ + dst[6] = vdst[6]; \ + dst[7] = vdst[7]; \ +\ + tmp += 16; \ + dst += PXSTRIDE(dst_stride); + +#define STORE8_CLAMPED \ + BIAS \ + i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \ + i16x8 vdst = vec_max(vec_min(unclamped, max), min); \ + STORE8 + +#define STORE8_UNCLAMPED \ + BIAS \ + i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \ + STORE8 + +#define DIRECTIONS(w, tmp_stride) \ + static const int8_t cdef_directions##w[8 /* dir */][2 /* pass */] = { \ + { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, \ + { 0 * tmp_stride + 1, -1 * tmp_stride + 2 }, \ + { 0 * tmp_stride + 1, 0 * tmp_stride + 2 }, \ + { 0 * tmp_stride + 1, 1 * tmp_stride + 2 }, \ + { 1 * tmp_stride + 1, 2 * tmp_stride + 2 }, \ + { 1 * tmp_stride + 0, 
2 * tmp_stride + 1 }, \ + { 1 * tmp_stride + 0, 2 * tmp_stride + 0 }, \ + { 1 * tmp_stride + 0, 2 * tmp_stride - 1 } \ + }; + +DIRECTIONS(4, 8) +DIRECTIONS(8, 16) + static inline void filter_4xN(pixel *dst, const ptrdiff_t dst_stride, const pixel (*left)[2], const pixel *const top, const pixel *const bottom, const int w, const int h, const int pri_strength, const int sec_strength, const int dir, - const int damping, const enum CdefEdgeFlags edges, - const ptrdiff_t tmp_stride, uint16_t *tmp) + const int pri_shift, const int sec_shift, + const enum CdefEdgeFlags edges, uint16_t *tmp) { - const int8_t cdef_directions[8 /* dir */][2 /* pass */] = { - { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, - { 0 * tmp_stride + 1, -1 * tmp_stride + 2 }, - { 0 * tmp_stride + 1, 0 * tmp_stride + 2 }, - { 0 * tmp_stride + 1, 1 * tmp_stride + 2 }, - { 1 * tmp_stride + 1, 2 * tmp_stride + 2 }, - { 1 * tmp_stride + 0, 2 * tmp_stride + 1 }, - { 1 * tmp_stride + 0, 2 * tmp_stride + 0 }, - { 1 * tmp_stride + 0, 2 * tmp_stride - 1 } - }; const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; - const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1); - const int off1 = cdef_directions[dir][0]; - const int off1_1 = cdef_directions[dir][1]; + const int off1 = cdef_directions4[dir][0]; + const int off1_1 = cdef_directions4[dir][1]; - const int off2 = cdef_directions[(dir + 2) & 7][0]; - const int off3 = cdef_directions[(dir + 6) & 7][0]; + const int off2 = cdef_directions4[(dir + 2) & 7][0]; + const int off3 = cdef_directions4[(dir + 6) & 7][0]; - const int off2_1 = cdef_directions[(dir + 2) & 7][1]; - const int off3_1 = cdef_directions[(dir + 6) & 7][1]; + const int off2_1 = cdef_directions4[(dir + 2) & 7][1]; + const int off3_1 = cdef_directions4[(dir + 6) & 7][1]; - copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges); + MAKE_TAPS for (int y = 0; y < h / 2; y++) { LOAD_PIX4(tmp) + SETUP_MINMAX + // Primary pass LOAD_DIR4(p, tmp, off1, off1_1) - CONSTRAIN(p, pri_strength) + CONSTRAIN(p, pri_strength, pri_shift) MIN_MAX(p) - PRI_0(p) - PRI_1(p) - - UPDATE_SUM(p) + PRI_0_UPDATE_SUM(p) // Secondary pass 1 LOAD_DIR4(s, tmp, off2, off3) - CONSTRAIN(s, sec_strength) + CONSTRAIN(s, sec_strength, sec_shift) MIN_MAX(s) - SEC_0(s) - - UPDATE_SUM(s) + SEC_0_UPDATE_SUM(s) // Secondary pass 2 LOAD_DIR4(s2, tmp, off2_1, off3_1) - CONSTRAIN(s2, sec_strength) + CONSTRAIN(s2, sec_strength, sec_shift) MIN_MAX(s2) UPDATE_SUM(s2) // Store - i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); - bias = vec_sub(vec_splat_s16(8), bias); - i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); - i16x8 vdst = vec_max(vec_min(unclamped, max), min); - - dst[0] = vdst[0]; - dst[1] = vdst[1]; - dst[2] = vdst[2]; - dst[3] = vdst[3]; - - tmp += tmp_stride; - dst += PXSTRIDE(dst_stride); - dst[0] = vdst[4]; - dst[1] = vdst[5]; - dst[2] = vdst[6]; - dst[3] = vdst[7]; - - tmp += tmp_stride; - dst += PXSTRIDE(dst_stride); + STORE4_CLAMPED + } +} + +static inline void +filter_4xN_pri(pixel *dst, const ptrdiff_t dst_stride, + const pixel (*left)[2], const pixel *const top, + const pixel *const bottom, const int w, const int h, + const int pri_strength, const int dir, + const int pri_shift, const enum CdefEdgeFlags edges, + uint16_t *tmp) +{ + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const int off1 = cdef_directions4[dir][0]; + const int off1_1 = cdef_directions4[dir][1]; + + MAKE_TAPS + + for (int y = 0; y < h / 2; y++) { + 
LOAD_PIX4(tmp) + + // Primary pass + LOAD_DIR4(p, tmp, off1, off1_1) + + CONSTRAIN(p, pri_strength, pri_shift) + + PRI_0_UPDATE_SUM(p) + + STORE4_UNCLAMPED + } +} + +static inline void +filter_4xN_sec(pixel *dst, const ptrdiff_t dst_stride, + const pixel (*left)[2], const pixel *const top, + const pixel *const bottom, const int w, const int h, + const int sec_strength, const int dir, + const int sec_shift, const enum CdefEdgeFlags edges, + uint16_t *tmp) +{ + const int off2 = cdef_directions4[(dir + 2) & 7][0]; + const int off3 = cdef_directions4[(dir + 6) & 7][0]; + + const int off2_1 = cdef_directions4[(dir + 2) & 7][1]; + const int off3_1 = cdef_directions4[(dir + 6) & 7][1]; + + for (int y = 0; y < h / 2; y++) { + LOAD_PIX4(tmp) + // Secondary pass 1 + LOAD_DIR4(s, tmp, off2, off3) + + CONSTRAIN(s, sec_strength, sec_shift) + + SEC_0_UPDATE_SUM(s) + + // Secondary pass 2 + LOAD_DIR4(s2, tmp, off2_1, off3_1) + + CONSTRAIN(s2, sec_strength, sec_shift) + + UPDATE_SUM(s2) + + STORE4_UNCLAMPED } } @@ -379,88 +481,121 @@ filter_8xN(pixel *dst, const ptrdiff_t dst_stride, const pixel (*left)[2], const pixel *const top, const pixel *const bottom, const int w, const int h, const int pri_strength, const int sec_strength, const int dir, - const int damping, const enum CdefEdgeFlags edges, - const ptrdiff_t tmp_stride, uint16_t *tmp) + const int pri_shift, const int sec_shift, const enum CdefEdgeFlags edges, + uint16_t *tmp) { - const int8_t cdef_directions[8 /* dir */][2 /* pass */] = { - { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, - { 0 * tmp_stride + 1, -1 * tmp_stride + 2 }, - { 0 * tmp_stride + 1, 0 * tmp_stride + 2 }, - { 0 * tmp_stride + 1, 1 * tmp_stride + 2 }, - { 1 * tmp_stride + 1, 2 * tmp_stride + 2 }, - { 1 * tmp_stride + 0, 2 * tmp_stride + 1 }, - { 1 * tmp_stride + 0, 2 * tmp_stride + 0 }, - { 1 * tmp_stride + 0, 2 * tmp_stride - 1 } - }; const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const int off1 = cdef_directions8[dir][0]; + const int off1_1 = cdef_directions8[dir][1]; - const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1); - const int off1 = cdef_directions[dir][0]; - const int off1_1 = cdef_directions[dir][1]; + const int off2 = cdef_directions8[(dir + 2) & 7][0]; + const int off3 = cdef_directions8[(dir + 6) & 7][0]; - const int off2 = cdef_directions[(dir + 2) & 7][0]; - const int off3 = cdef_directions[(dir + 6) & 7][0]; + const int off2_1 = cdef_directions8[(dir + 2) & 7][1]; + const int off3_1 = cdef_directions8[(dir + 6) & 7][1]; - const int off2_1 = cdef_directions[(dir + 2) & 7][1]; - const int off3_1 = cdef_directions[(dir + 6) & 7][1]; - - copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges); + MAKE_TAPS for (int y = 0; y < h; y++) { LOAD_PIX(tmp) + SETUP_MINMAX + // Primary pass LOAD_DIR(p, tmp, off1, off1_1) - CONSTRAIN(p, pri_strength) + CONSTRAIN(p, pri_strength, pri_shift) MIN_MAX(p) - PRI_0(p) - PRI_1(p) - - UPDATE_SUM(p) + PRI_0_UPDATE_SUM(p) // Secondary pass 1 LOAD_DIR(s, tmp, off2, off3) - CONSTRAIN(s, sec_strength) + CONSTRAIN(s, sec_strength, sec_shift) MIN_MAX(s) - SEC_0(s) - - UPDATE_SUM(s) + SEC_0_UPDATE_SUM(s) // Secondary pass 2 LOAD_DIR(s2, tmp, off2_1, off3_1) - CONSTRAIN(s2, sec_strength) + CONSTRAIN(s2, sec_strength, sec_shift) MIN_MAX(s2) UPDATE_SUM(s2) // Store - i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); - bias = vec_sub(vec_splat_s16(8), bias); - i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); - i16x8 vdst = 
vec_max(vec_min(unclamped, max), min); - - dst[0] = vdst[0]; - dst[1] = vdst[1]; - dst[2] = vdst[2]; - dst[3] = vdst[3]; - dst[4] = vdst[4]; - dst[5] = vdst[5]; - dst[6] = vdst[6]; - dst[7] = vdst[7]; - - tmp += tmp_stride; - dst += PXSTRIDE(dst_stride); + STORE8_CLAMPED + } + +} + +static inline void +filter_8xN_pri(pixel *dst, const ptrdiff_t dst_stride, + const pixel (*left)[2], const pixel *const top, + const pixel *const bottom, const int w, const int h, + const int pri_strength, const int dir, + const int pri_shift, const enum CdefEdgeFlags edges, + uint16_t *tmp) +{ + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const int off1 = cdef_directions8[dir][0]; + const int off1_1 = cdef_directions8[dir][1]; + + MAKE_TAPS + + for (int y = 0; y < h; y++) { + LOAD_PIX(tmp) + + // Primary pass + LOAD_DIR(p, tmp, off1, off1_1) + + CONSTRAIN(p, pri_strength, pri_shift) + + PRI_0_UPDATE_SUM(p) + + STORE8_UNCLAMPED } +} + +static inline void +filter_8xN_sec(pixel *dst, const ptrdiff_t dst_stride, + const pixel (*left)[2], const pixel *const top, + const pixel *const bottom, const int w, const int h, + const int sec_strength, const int dir, + const int sec_shift, const enum CdefEdgeFlags edges, + uint16_t *tmp) +{ + const int off2 = cdef_directions8[(dir + 2) & 7][0]; + const int off3 = cdef_directions8[(dir + 6) & 7][0]; + + const int off2_1 = cdef_directions8[(dir + 2) & 7][1]; + const int off3_1 = cdef_directions8[(dir + 6) & 7][1]; + + for (int y = 0; y < h; y++) { + LOAD_PIX(tmp) + + // Secondary pass 1 + LOAD_DIR(s, tmp, off2, off3) + CONSTRAIN(s, sec_strength, sec_shift) + + SEC_0_UPDATE_SUM(s) + + // Secondary pass 2 + LOAD_DIR(s2, tmp, off2_1, off3_1) + + CONSTRAIN(s2, sec_strength, sec_shift) + + UPDATE_SUM(s2) + + STORE8_UNCLAMPED + } } #define cdef_fn(w, h, tmp_stride) \ @@ -477,8 +612,22 @@ void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \ { \ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \ - filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \ - sec_strength, dir, damping, edges, tmp_stride, tmp); \ + copy##w##xN(tmp - 2, dst, dst_stride, left, top, bottom, w, h, edges); \ + if (pri_strength) { \ + const int pri_shift = imax(0, damping - ulog2(pri_strength)); \ + if (sec_strength) { \ + const int sec_shift = damping - ulog2(sec_strength); \ + filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \ + sec_strength, dir, pri_shift, sec_shift, edges, tmp); \ + } else { \ + filter_##w##xN_pri(dst, dst_stride, left, top, bottom, w, h, pri_strength, \ + dir, pri_shift, edges, tmp); \ + } \ + } else { \ + const int sec_shift = damping - ulog2(sec_strength); \ + filter_##w##xN_sec(dst, dst_stride, left, top, bottom, w, h, sec_strength, \ + dir, sec_shift, edges, tmp); \ + } \ } cdef_fn(4, 4, 8); diff --git a/third_party/dav1d/src/riscv/64/itx.S b/third_party/dav1d/src/riscv/64/itx.S index 60d045150d..dfec548e40 100644 --- a/third_party/dav1d/src/riscv/64/itx.S +++ b/third_party/dav1d/src/riscv/64/itx.S @@ -163,48 +163,48 @@ endfunc vssub.vv \o3, v16, v20 .endm -.macro iadst_4 o0, o1, o2, o3 +.macro iadst_4 o0, o1, o2, o3, lm2, lm li t1, 1321 li t2, 3803 li t3, 2482 - vwmul.vx v4, v0, t1 - vwmul.vx v5, v0, t3 + vwmul.vx v16, v0, t1 + vwmul.vx v18, v0, t3 neg t1, t1 - vwmacc.vx v4, t2, v2 - vwmacc.vx v5, t1, v2 + vwmacc.vx v16, t2, v2 + vwmacc.vx v18, t1, v2 neg t2, t2 - vwmacc.vx v4, t3, v3 - vwmacc.vx v5, t2, v3 + vwmacc.vx v16, t3, v3 + vwmacc.vx v18, t2, v3 - vwsub.vv 
v6, v0, v2 - vwadd.wv v6, v6, v3 + vwsub.vv v20, v0, v2 + vwadd.wv v20, v20, v3 li t1, 3344 - vwmul.vx v7, v1, t1 + vwmul.vx v22, v1, t1 - vsetvli zero, zero, e32, m1, ta, ma + vsetvli zero, zero, e32, \lm2, ta, ma - vmul.vx v6, v6, t1 + vmul.vx v20, v20, t1 - vadd.vv v8, v4, v5 - vadd.vv v4, v4, v7 - vadd.vv v5, v5, v7 - vsub.vv v7, v8, v7 + vadd.vv v24, v16, v18 + vadd.vv v16, v16, v22 + vadd.vv v18, v18, v22 + vsub.vv v22, v24, v22 li t1, 2048 - vadd.vx v4, v4, t1 - vadd.vx v5, v5, t1 - vadd.vx v6, v6, t1 - vadd.vx v7, v7, t1 + vadd.vx v16, v16, t1 + vadd.vx v18, v18, t1 + vadd.vx v20, v20, t1 + vadd.vx v22, v22, t1 - vsetvli zero, zero, e16, mf2, ta, ma + vsetvli zero, zero, e16, \lm, ta, ma - vnsra.wi \o0, v4, 12 - vnsra.wi \o1, v5, 12 - vnsra.wi \o2, v6, 12 - vnsra.wi \o3, v7, 12 + vnsra.wi \o0, v16, 12 + vnsra.wi \o1, v18, 12 + vnsra.wi \o2, v20, 12 + vnsra.wi \o3, v22, 12 .endm function inv_dct_e16_x4_rvv, export=1, ext=v @@ -213,12 +213,22 @@ function inv_dct_e16_x4_rvv, export=1, ext=v endfunc function inv_adst_e16_x4_rvv, export=1, ext=v - iadst_4 v0, v1, v2, v3 + iadst_4 v0, v1, v2, v3, m1, mf2 jr t0 endfunc function inv_flipadst_e16_x4_rvv, export=1, ext=v - iadst_4 v3, v2, v1, v0 + iadst_4 v3, v2, v1, v0, m1, mf2 + jr t0 +endfunc + +function inv_adst_e16_x4w_rvv, export=1, ext=v + iadst_4 v0, v1, v2, v3, m2, m1 + jr t0 +endfunc + +function inv_flipadst_e16_x4w_rvv, export=1, ext=v + iadst_4 v3, v2, v1, v0, m2, m1 jr t0 endfunc @@ -328,6 +338,8 @@ function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v .ifc \variant, identity_ // The identity vsadd.vv and downshift vssra.vi 1 cancel out + + j L(itx_8x8_epilog) .else jalr t0, a4 @@ -339,8 +351,8 @@ function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v vssra.vi v5, v5, 1 vssra.vi v6, v6, 1 vssra.vi v7, v7, 1 -.endif +L(itx_8x8_epilog): vsseg8e16.v v0, (a2) vle16.v v0, (a2) addi t0, a2, 16 @@ -374,9 +386,7 @@ function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v vmv.v.x v8, zero vse16.v v8, (a2) -.ifc \variant, identity_ itx_8x8_end: -.endif vsetivli zero, 8, e8, mf2, ta, ma vle8.v v8, (a0) add t0, a0, a1 @@ -441,11 +451,12 @@ itx_8x8_end: vse8.v v15, (a0) ret +.endif endfunc .endm -def_fn_8x8_base def_fn_8x8_base identity_ +def_fn_8x8_base function inv_identity_e16_x8_rvv, export=1, ext=v vsadd.vv v0, v0, v0 @@ -530,23 +541,23 @@ endfunc li t5, 2598 li t6, 3166 - vwmul.vx v8, v7, t1 + vwmul.vx v16, v7, t1 neg t1, t1 - vwmul.vx v10, v7, t2 - vwmacc.vx v8, t2, v0 - vwmacc.vx v10, t1, v0 + vwmul.vx v18, v7, t2 + vwmacc.vx v16, t2, v0 + vwmacc.vx v18, t1, v0 - vwmul.vx v12, v5, t3 + vwmul.vx v20, v5, t3 neg t3, t3 - vwmul.vx v14, v5, t4 - vwmacc.vx v12, t4, v2 - vwmacc.vx v14, t3, v2 + vwmul.vx v22, v5, t4 + vwmacc.vx v20, t4, v2 + vwmacc.vx v22, t3, v2 - vwmul.vx v16, v3, t5 + vwmul.vx v24, v3, t5 neg t5, t5 - vwmul.vx v18, v3, t6 - vwmacc.vx v16, t6, v4 - vwmacc.vx v18, t5, v4 + vwmul.vx v26, v3, t6 + vwmacc.vx v24, t6, v4 + vwmacc.vx v26, t5, v4 li t1, 2048 li t2, 1189 @@ -555,95 +566,95 @@ endfunc li t5, 3784 li t6, 2896 - vwmul.vx v20, v1, t2 + vwmul.vx v28, v1, t2 neg t2, t2 - vwmul.vx v22, v1, t3 - vwmacc.vx v20, t3, v6 - vwmacc.vx v22, t2, v6 - - vwadd.wx v8, v8, t1 - vwadd.wx v10, v10, t1 - vwadd.wx v12, v12, t1 - vwadd.wx v14, v14, t1 + vwmul.vx v30, v1, t3 + vwmacc.vx v28, t3, v6 + vwmacc.vx v30, t2, v6 + vwadd.wx v16, v16, t1 vwadd.wx v18, v18, t1 vwadd.wx v20, v20, t1 vwadd.wx v22, v22, t1 + vwadd.wx v24, v24, t1 + vwadd.wx v26, v26, t1 + vwadd.wx v28, v28, t1 + vwadd.wx v30, v30, t1 - vnsra.wi v8, v8, 12 - 
vnsra.wi v10, v10, 12 - vnsra.wi v12, v12, 12 - vnsra.wi v14, v14, 12 vnsra.wi v16, v16, 12 vnsra.wi v18, v18, 12 vnsra.wi v20, v20, 12 vnsra.wi v22, v22, 12 + vnsra.wi v24, v24, 12 + vnsra.wi v26, v26, 12 + vnsra.wi v28, v28, 12 + vnsra.wi v30, v30, 12 - vssub.vv v4, v8, v16 - vsadd.vv v8, v8, v16 - vsadd.vv v1, v10, v18 - vsadd.vv v2, v12, v20 - vsadd.vv v3, v14, v22 - vssub.vv v5, v10, v18 - vssub.vv v6, v12, v20 - vssub.vv v22, v14, v22 - - vsadd.vv \o0, v8, v2 - vsadd.vv \o7, v1, v3 - vssub.vv v2, v8, v2 - vssub.vv v3, v1, v3 - - vwmul.vx v8, v4, t5 - vwmul.vx v10, v4, t4 - vwmul.vx v12, v22, t5 - vwmul.vx v14, v22, t4 - vwmacc.vx v8, t4, v5 + vssub.vv v4, v16, v24 + vsadd.vv v16, v16, v24 + vsadd.vv v1, v18, v26 + vsadd.vv v2, v20, v28 + vsadd.vv v3, v22, v30 + vssub.vv v5, v18, v26 + vssub.vv v6, v20, v28 + vssub.vv v30, v22, v30 + + vsadd.vv \o0, v16, v2 + vsadd.vv \o7, v1, v3 + vssub.vv v2, v16, v2 + vssub.vv v3, v1, v3 + + vwmul.vx v16, v4, t5 + vwmul.vx v18, v4, t4 + vwmul.vx v20, v30, t5 + vwmul.vx v22, v30, t4 + vwmacc.vx v16, t4, v5 neg t4, t4 - vwmacc.vx v14, t5, v6 + vwmacc.vx v22, t5, v6 neg t5, t5 - vwmacc.vx v12, t4, v6 - vwmacc.vx v10, t5, v5 - - vwadd.wx v8, v8, t1 - vwadd.wx v10, v10, t1 - vwadd.wx v12, v12, t1 - vwadd.wx v14, v14, t1 - - vnsra.wi v8, v8, 12 - vnsra.wi v10, v10, 12 - vnsra.wi v12, v12, 12 - vnsra.wi v14, v14, 12 - - vsadd.vv \o1, v8, v12 - vsadd.vv \o6, v10, v14 - vssub.vv v8, v8, v12 - vssub.vv v9, v10, v14 - - vwmul.vx v10, v2, t6 - vwmul.vx v12, v2, t6 - vwmul.vx v14, v8, t6 - vwmul.vx v16, v8, t6 - vwmacc.vx v10, t6, v3 - vwmacc.vx v14, t6, v9 - neg t6, t6 - vwmacc.vx v12, t6, v3 - vwmacc.vx v16, t6, v9 + vwmacc.vx v20, t4, v6 + vwmacc.vx v18, t5, v5 - vwadd.wx v10, v10, t1 - vwadd.wx v12, v12, t1 - vwadd.wx v14, v14, t1 vwadd.wx v16, v16, t1 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 + + vnsra.wi v16, v16, 12 + vnsra.wi v18, v18, 12 + vnsra.wi v20, v20, 12 + vnsra.wi v22, v22, 12 - vnsra.wi \o3, v10, 12 - vnsra.wi \o4, v12, 12 - vnsra.wi \o2, v14, 12 - vnsra.wi \o5, v16, 12 + vsadd.vv \o1, v16, v20 + vsadd.vv \o6, v18, v22 + vssub.vv v16, v16, v20 + vssub.vv v17, v18, v22 + + vwmul.vx v18, v2, t6 + vwmul.vx v20, v2, t6 + vwmul.vx v22, v16, t6 + vwmul.vx v24, v16, t6 + vwmacc.vx v18, t6, v3 + vwmacc.vx v22, t6, v17 + neg t6, t6 + vwmacc.vx v20, t6, v3 + vwmacc.vx v24, t6, v17 - vmv.v.x v8, zero - vssub.vv \o1, v8, \o1 - vssub.vv \o3, v8, \o3 - vssub.vv \o5, v8, \o5 - vssub.vv \o7, v8, \o7 + vwadd.wx v18, v18, t1 + vwadd.wx v20, v20, t1 + vwadd.wx v22, v22, t1 + vwadd.wx v24, v24, t1 + + vnsra.wi \o3, v18, 12 + vnsra.wi \o4, v20, 12 + vnsra.wi \o2, v22, 12 + vnsra.wi \o5, v24, 12 + + vmv.v.x v16, zero + vssub.vv \o1, v16, \o1 + vssub.vv \o3, v16, \o3 + vssub.vv \o5, v16, \o5 + vssub.vv \o7, v16, \o7 .endm function inv_dct_e16_x8_rvv, export=1, ext=v @@ -714,6 +725,206 @@ def_fn_8x8 flipadst, identity def_fn_8x8 identity, adst def_fn_8x8 identity, flipadst +function inv_txfm_add_4x8_rvv, export=1, ext=v + csrw vxrm, zero + + vsetivli zero, 8, e16, m1, ta, ma + vle16.v v0, (a2) + addi t0, a2, 16 + vle16.v v1, (t0) + addi t0, t0, 16 + vle16.v v2, (t0) + addi t0, t0, 16 + vle16.v v3, (t0) + + li t1, 2896*8 +.irp i, 0, 1, 2, 3 + vsmul.vx v\i, v\i, t1 +.endr + + jalr t0, a4 + + vsseg4e16.v v0, (a2) + + vsetivli zero, 4, e16, mf2, ta, ma + vmv.v.x v8, zero + vle16.v v0, (a2) + vse16.v v8, (a2) +.irp i, 1, 2, 3, 4, 5, 6, 7 + addi a2, a2, 8 + vle16.v v\i, (a2) + vse16.v v8, (a2) +.endr + + jalr t0, a5 + +.irp i, 0, 1, 2, 3, 4, 
5, 6, 7 + vssra.vi v\i, v\i, 4 +.endr + + vsetvli zero, zero, e8, mf4, ta, ma + vle8.v v8, (a0) + add t0, a0, a1 + vle8.v v9, (t0) +.irp i, 10, 11, 12, 13, 14, 15 + add t0, t0, a1 + vle8.v v\i, (t0) +.endr + + vwaddu.wv v0, v0, v8 + vwaddu.wv v1, v1, v9 + vwaddu.wv v2, v2, v10 + vwaddu.wv v3, v3, v11 + vwaddu.wv v4, v4, v12 + vwaddu.wv v5, v5, v13 + vwaddu.wv v6, v6, v14 + vwaddu.wv v7, v7, v15 + + vsetvli zero, zero, e16, mf2, ta, ma +.irp i, 0, 1, 2, 3, 4, 5, 6, 7 + vmax.vx v\i, v\i, zero +.endr + + vsetvli zero, zero, e8, mf4, ta, ma + + vnclipu.wi v8, v0, 0 + vnclipu.wi v9, v1, 0 + vnclipu.wi v10, v2, 0 + vnclipu.wi v11, v3, 0 + vnclipu.wi v12, v4, 0 + vnclipu.wi v13, v5, 0 + vnclipu.wi v14, v6, 0 + vnclipu.wi v15, v7, 0 + + vse8.v v8, (a0) +.irp i, 9, 10, 11, 12, 13, 14, 15 + add a0, a0, a1 + vse8.v v\i, (a0) +.endr + + ret +endfunc + +function inv_txfm_add_8x4_rvv, export=1, ext=v + csrw vxrm, zero + + vsetivli zero, 4, e16, mf2, ta, ma + vle16.v v0, (a2) + addi t0, a2, 8 + vle16.v v1, (t0) +.irp i, 2, 3, 4, 5, 6, 7 + addi t0, t0, 8 + vle16.v v\i, (t0) +.endr + + li t1, 2896*8 +.irp i, 0, 1, 2, 3, 4, 5, 6, 7 + vsmul.vx v\i, v\i, t1 +.endr + + jalr t0, a4 + + vsseg8e16.v v0, (a2) + + vsetivli zero, 8, e16, m1, ta, ma + vmv.v.x v4, zero + vle16.v v0, (a2) + vse16.v v4, (a2) +.irp i, 1, 2, 3 + addi a2, a2, 16 + vle16.v v\i, (a2) + vse16.v v4, (a2) +.endr + + jalr t0, a5 + + vssra.vi v0, v0, 4 + vssra.vi v1, v1, 4 + vssra.vi v2, v2, 4 + vssra.vi v3, v3, 4 + + vsetvli zero, zero, e8, mf2, ta, ma + vle8.v v4, (a0) + add t0, a0, a1 + vle8.v v5, (t0) + add t0, t0, a1 + vle8.v v6, (t0) + add t0, t0, a1 + vle8.v v7, (t0) + + vwaddu.wv v0, v0, v4 + vwaddu.wv v1, v1, v5 + vwaddu.wv v2, v2, v6 + vwaddu.wv v3, v3, v7 + + vsetvli zero, zero, e16, m1, ta, ma + vmax.vx v0, v0, zero + vmax.vx v1, v1, zero + vmax.vx v2, v2, zero + vmax.vx v3, v3, zero + + vsetvli zero, zero, e8, mf2, ta, ma + + vnclipu.wi v4, v0, 0 + vnclipu.wi v5, v1, 0 + vnclipu.wi v6, v2, 0 + vnclipu.wi v7, v3, 0 + + vse8.v v4, (a0) + add a0, a0, a1 + vse8.v v5, (a0) + add a0, a0, a1 + vse8.v v6, (a0) + add a0, a0, a1 + vse8.v v7, (a0) + + ret +endfunc + +/* Define symbols added in .if statement */ +.equ dct, 1 +.equ identity, 2 +.equ adst, 3 +.equ flipadst, 4 + +.macro def_fn_48 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1 +.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst) + la a4, inv_\txfm1\()_e16_x\w\()w_rvv +.else + la a4, inv_\txfm1\()_e16_x\w\()_rvv +.endif +.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst) + la a5, inv_\txfm2\()_e16_x\h\()w_rvv +.else + la a5, inv_\txfm2\()_e16_x\h\()_rvv +.endif + j inv_txfm_add_\w\()x\h\()_rvv +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct +def_fn_48 \w, \h, identity, identity +def_fn_48 \w, \h, dct, adst +def_fn_48 \w, \h, dct, flipadst +def_fn_48 \w, \h, dct, identity +def_fn_48 \w, \h, adst, dct +def_fn_48 \w, \h, adst, adst +def_fn_48 \w, \h, adst, flipadst +def_fn_48 \w, \h, flipadst, dct +def_fn_48 \w, \h, flipadst, adst +def_fn_48 \w, \h, flipadst, flipadst +def_fn_48 \w, \h, identity, dct +def_fn_48 \w, \h, adst, identity +def_fn_48 \w, \h, flipadst, identity +def_fn_48 \w, \h, identity, adst +def_fn_48 \w, \h, identity, flipadst +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + function inv_identity_e16_x16_rvv, export=1, ext=v li t1, 2*(5793-4096)*8 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 @@ -1196,10 +1407,12 @@ endfunc .macro def_horz_16 variant function 
inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v vmv.v.x v16, zero -.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - vle16.v v\i, (t4) + vle16.v v0, (t4) vse16.v v16, (t4) +.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add t4, t4, t6 + vle16.v v\i, (t4) + vse16.v v16, (t4) .endr .ifc \variant, _identity li t1, 2*(5793-4096)*8 @@ -1208,29 +1421,35 @@ function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v vsra.vi v16, v16, 1 vaadd.vv v\i, v\i, v16 .endr + j L(horz_16x8_epilog) .else jalr t0, a4 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vssra.vi v\i, v\i, 2 .endr -.endif -.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - vsse16.v v\i, (t5), t6 +L(horz_16x8_epilog): + vsse16.v v0, (t5), t6 +.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 addi t5, t5, 2 + vsse16.v v\i, (t5), t6 .endr jr a7 +.endif endfunc .endm -def_horz_16 def_horz_16 _identity +def_horz_16 function inv_txfm_add_vert_8x16_rvv, export=1, ext=v vsetivli zero, 8, e16, m1, ta, ma -.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - vle16.v v\i, (t4) + + vle16.v v0, (t4) +.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add t4, t4, t6 + vle16.v v\i, (t4) .endr + jalr t0, a5 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 @@ -1238,10 +1457,13 @@ function inv_txfm_add_vert_8x16_rvv, export=1, ext=v .endr vsetivli zero, 8, e8, mf2, ta, ma - mv t0, t5 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - vle8.v v\i, (t0) + + vle8.v v16, (t5) + add t0, t5, a1 + vle8.v v17, (t0) +.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 add t0, t0, a1 + vle8.v v\i, (t0) .endr vwaddu.wv v0, v0, v16 @@ -1284,9 +1506,10 @@ function inv_txfm_add_vert_8x16_rvv, export=1, ext=v vnclipu.wi v30, v14, 0 vnclipu.wi v31, v15, 0 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - vse8.v v\i, (t5) + vse8.v v16, (t5) +.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 add t5, t5, a1 + vse8.v v\i, (t5) .endr jr a7 @@ -1296,11 +1519,26 @@ function inv_txfm_add_16x16_rvv, export=1, ext=v csrw vxrm, zero vsetivli zero, 8, e16, m1, ta, ma addi sp, sp, -16*32 -.irp i, 0, 8 +.irp i, 8, 0 addi t4, a2, \i*2 addi t5, sp, \i*16*2 +.if \i == 8 + blt a3, a7, 1f +.endif li t6, 16*2 jalr a7, a6 +.if \i == 8 + j 2f +1: + li t1, 64 + vsetvli zero, t1, e16, m8, ta, ma + vmv.v.x v0, zero + vse16.v v0, (t5) + addi t5, t5, 128 + vse16.v v0, (t5) + vsetivli zero, 8, e16, m1, ta, ma +2: +.endif .endr .irp i, 0, 8 addi t4, sp, \i*2 @@ -1312,7 +1550,7 @@ function inv_txfm_add_16x16_rvv, export=1, ext=v ret endfunc -.macro def_fn_16x16 txfm1, txfm2 +.macro def_fn_16x16 txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v .ifc \txfm1, identity la a6, inv_txfm_horz_identity_16x8_rvv @@ -1321,19 +1559,558 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v la a4, inv_\txfm1\()_e16_x16_rvv .endif la a5, inv_\txfm2\()_e16_x16_rvv + li a7, \eob_half j inv_txfm_add_16x16_rvv endfunc .endm -def_fn_16x16 dct, dct -def_fn_16x16 identity, identity -def_fn_16x16 dct, adst -def_fn_16x16 dct, flipadst -def_fn_16x16 dct, identity -def_fn_16x16 adst, dct -def_fn_16x16 adst, adst -def_fn_16x16 adst, flipadst -def_fn_16x16 flipadst, dct -def_fn_16x16 flipadst, adst -def_fn_16x16 flipadst, flipadst -def_fn_16x16 identity, dct +def_fn_16x16 dct, dct, 36 +def_fn_16x16 identity, identity, 36 +def_fn_16x16 dct, adst, 36 +def_fn_16x16 dct, flipadst, 36 
+def_fn_16x16 dct, identity, 8 +def_fn_16x16 adst, dct, 36 +def_fn_16x16 adst, adst, 36 +def_fn_16x16 adst, flipadst, 36 +def_fn_16x16 flipadst, dct, 36 +def_fn_16x16 flipadst, adst, 36 +def_fn_16x16 flipadst, flipadst, 36 +def_fn_16x16 identity, dct, 8 + +.macro def_fn_416_base variant +function inv_txfm_\variant\()add_4x16_rvv, export=1, ext=v + csrw vxrm, zero + + vsetivli zero, 8, e16, m1, ta, ma + + blt a3, a6, 1f + + addi t0, a2, 16 + vle16.v v0, (t0) + addi t0, t0, 32 + vle16.v v1, (t0) + addi t0, t0, 32 + vle16.v v2, (t0) + addi t0, t0, 32 + vle16.v v3, (t0) + +.ifc \variant, identity_ + li t1, (5793-4096)*8 + vsmul.vx v8, v0, t1 + vaadd.vv v4, v0, v8 + vsmul.vx v8, v1, t1 + vaadd.vv v5, v1, v8 + vsmul.vx v8, v2, t1 + vaadd.vv v6, v2, v8 + vsmul.vx v8, v3, t1 + vaadd.vv v7, v3, v8 +.else + jalr t0, a4 + + vssra.vi v4, v0, 1 + vssra.vi v5, v1, 1 + vssra.vi v6, v2, 1 + vssra.vi v7, v3, 1 +.endif + + j 2f + +1: +.irp i, 4, 5, 6, 7 + vmv.v.x v\i, zero +.endr + +2: + vle16.v v0, (a2) + addi t0, a2, 32 + vle16.v v1, (t0) + addi t0, t0, 32 + vle16.v v2, (t0) + addi t0, t0, 32 + vle16.v v3, (t0) + +.ifc \variant, identity_ + li t1, (5793-4096)*8 +.irp i, 0, 1, 2, 3 + vsmul.vx v8, v\i, t1 + vaadd.vv v\i, v\i, v8 +.endr + + j L(itx_4x16_epilog) +.else + jalr t0, a4 + + vssra.vi v0, v0, 1 + vssra.vi v1, v1, 1 + vssra.vi v2, v2, 1 + vssra.vi v3, v3, 1 + +L(itx_4x16_epilog): + vsseg4e16.v v0, (a2) + addi t0, a2, 64 + vsseg4e16.v v4, (t0) + + vsetivli zero, 4, e16, mf2, ta, ma + + vmv.v.x v16, zero + vle16.v v0, (a2) + vse16.v v16, (a2) + addi t0, a2, 8 + vle16.v v1, (t0) + vse16.v v16, (t0) +.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + addi t0, t0, 8 + vle16.v v\i, (t0) + vse16.v v16, (t0) +.endr + + jalr t0, a5 + +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vssra.vi v\i, v\i, 4 +.endr + + vsetvli zero, zero, e8, mf4, ta, ma + + vle8.v v16, (a0) + add t0, a0, a1 + vle8.v v17, (t0) +.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + add t0, t0, a1 + vle8.v v\i, (t0) +.endr + + vwaddu.wv v0, v0, v16 + vwaddu.wv v1, v1, v17 + vwaddu.wv v2, v2, v18 + vwaddu.wv v3, v3, v19 + vwaddu.wv v4, v4, v20 + vwaddu.wv v5, v5, v21 + vwaddu.wv v6, v6, v22 + vwaddu.wv v7, v7, v23 + vwaddu.wv v8, v8, v24 + vwaddu.wv v9, v9, v25 + vwaddu.wv v10, v10, v26 + vwaddu.wv v11, v11, v27 + vwaddu.wv v12, v12, v28 + vwaddu.wv v13, v13, v29 + vwaddu.wv v14, v14, v30 + vwaddu.wv v15, v15, v31 + + vsetvli zero, zero, e16, mf2, ta, ma +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vmax.vx v\i, v\i, zero +.endr + + vsetvli zero, zero, e8, mf4, ta, ma + + vnclipu.wi v16, v0, 0 + vnclipu.wi v17, v1, 0 + vnclipu.wi v18, v2, 0 + vnclipu.wi v19, v3, 0 + vnclipu.wi v20, v4, 0 + vnclipu.wi v21, v5, 0 + vnclipu.wi v22, v6, 0 + vnclipu.wi v23, v7, 0 + vnclipu.wi v24, v8, 0 + vnclipu.wi v25, v9, 0 + vnclipu.wi v26, v10, 0 + vnclipu.wi v27, v11, 0 + vnclipu.wi v28, v12, 0 + vnclipu.wi v29, v13, 0 + vnclipu.wi v30, v14, 0 + vnclipu.wi v31, v15, 0 + + vse8.v v16, (a0) +.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + add a0, a0, a1 + vse8.v v\i, (a0) +.endr + + ret +.endif +endfunc + +function inv_txfm_\variant\()add_16x4_rvv, export=1, ext=v + csrw vxrm, zero + + vsetivli zero, 4, e16, mf2, ta, ma + vle16.v v0, (a2) + addi t0, a2, 8 + vle16.v v1, (t0) +.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + addi t0, t0, 8 + vle16.v v\i, (t0) +.endr + +.ifc \variant, identity_ + li t1, 2*(5793-4096)*8 +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
15 + vsmul.vx v16, v\i, t1 + vssra.vi v16, v16, 1 + vsadd.vv v\i, v\i, v16 +.endr + + j L(itx_16x4_epilog) +.else + jalr t0, a4 + +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vssra.vi v\i, v\i, 1 +.endr + +L(itx_16x4_epilog): + li t0, 32 + vssseg8e16.v v0, (a2), t0 + addi t1, a2, 16 + vssseg8e16.v v8, (t1), t0 + +.irp j, 0, 8 + vsetivli zero, 8, e16, m1, ta, ma + + vmv.v.x v4, zero + addi t0, a2, \j*2 + vle16.v v0, (t0) + vse16.v v4, (t0) +.irp i, 1, 2, 3 + addi t0, t0, 32 + vle16.v v\i, (t0) + vse16.v v4, (t0) +.endr + + jalr t0, a5 + + vssra.vi v0, v0, 4 + vssra.vi v1, v1, 4 + vssra.vi v2, v2, 4 + vssra.vi v3, v3, 4 + + vsetvli zero, zero, e8, mf2, ta, ma + addi t0, a0, \j + vle8.v v4, (t0) + add t0, t0, a1 + vle8.v v5, (t0) + add t0, t0, a1 + vle8.v v6, (t0) + add t0, t0, a1 + vle8.v v7, (t0) + + vwaddu.wv v0, v0, v4 + vwaddu.wv v1, v1, v5 + vwaddu.wv v2, v2, v6 + vwaddu.wv v3, v3, v7 + + vsetvli zero, zero, e16, m1, ta, ma + vmax.vx v0, v0, zero + vmax.vx v1, v1, zero + vmax.vx v2, v2, zero + vmax.vx v3, v3, zero + + vsetvli zero, zero, e8, mf2, ta, ma + + vnclipu.wi v4, v0, 0 + vnclipu.wi v5, v1, 0 + vnclipu.wi v6, v2, 0 + vnclipu.wi v7, v3, 0 + + addi t0, a0, \j + vse8.v v4, (t0) + add t0, t0, a1 + vse8.v v5, (t0) + add t0, t0, a1 + vse8.v v6, (t0) + add t0, t0, a1 + vse8.v v7, (t0) +.endr + + ret +.endif +endfunc +.endm + +def_fn_416_base identity_ +def_fn_416_base + +.macro def_fn_416 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1 +.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst) + la a4, inv_\txfm1\()_e16_x\w\()w_rvv +.elseif \txfm1 != identity + la a4, inv_\txfm1\()_e16_x\w\()_rvv +.endif +.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst) + la a5, inv_\txfm2\()_e16_x\h\()w_rvv +.else + la a5, inv_\txfm2\()_e16_x\h\()_rvv +.endif +.if \w == 4 + li a6, \eob_half +.endif +.ifc \txfm1, identity + j inv_txfm_identity_add_\w\()x\h\()_rvv +.else + j inv_txfm_add_\w\()x\h\()_rvv +.endif +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct, 29 +def_fn_416 \w, \h, identity, identity, 29 +def_fn_416 \w, \h, dct, adst, 29 +def_fn_416 \w, \h, dct, flipadst, 29 +def_fn_416 \w, \h, dct, identity, 8 +def_fn_416 \w, \h, adst, dct, 29 +def_fn_416 \w, \h, adst, adst, 29 +def_fn_416 \w, \h, adst, flipadst, 29 +def_fn_416 \w, \h, flipadst, dct, 29 +def_fn_416 \w, \h, flipadst, adst, 29 +def_fn_416 \w, \h, flipadst, flipadst, 29 +def_fn_416 \w, \h, identity, dct, 32 +def_fn_416 \w, \h, adst, identity, 8 +def_fn_416 \w, \h, flipadst, identity, 8 +def_fn_416 \w, \h, identity, adst, 32 +def_fn_416 \w, \h, identity, flipadst, 32 +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + +.macro def_fn_816_base variant +function inv_txfm_\variant\()add_8x16_rvv, export=1, ext=v + csrw vxrm, zero + + vsetivli zero, 8, e16, m1, ta, ma + + blt a3, a6, 1f + + vmv.v.x v16, zero + addi t0, a2, 16 + vle16.v v0, (t0) + vse16.v v16, (t0) +.irp i, 1, 2, 3, 4, 5, 6, 7 + addi t0, t0, 32 + vle16.v v\i, (t0) + vse16.v v16, (t0) +.endr + + li t1, 2896*8 +.ifc \variant, identity_ + vsmul.vx v8, v0, t1 + vsmul.vx v9, v1, t1 + vsmul.vx v10, v2, t1 + vsmul.vx v11, v3, t1 + vsmul.vx v12, v4, t1 + vsmul.vx v13, v5, t1 + vsmul.vx v14, v6, t1 + vsmul.vx v15, v7, t1 +.else +.irp i, 0, 1, 2, 3, 4, 5, 6, 7 + vsmul.vx v\i, v\i, t1 +.endr + + jalr t0, a4 + + vssra.vi v8, v0, 1 + vssra.vi v9, v1, 1 + vssra.vi v10, v2, 1 + vssra.vi v11, v3, 1 + vssra.vi v12, v4, 1 + vssra.vi v13, v5, 1 + vssra.vi v14, v6, 1 + vssra.vi v15, v7, 1 +.endif + + j 
2f + +1: +.irp i, 8, 9, 10, 11, 12, 13, 14, 15 + vmv.v.x v\i, zero +.endr + +2: + vmv.v.x v16, zero + vle16.v v0, (a2) + vse16.v v16, (a2) + addi t0, a2, 32 + vle16.v v1, (t0) + vse16.v v16, (t0) +.irp i, 2, 3, 4, 5, 6, 7 + addi t0, t0, 32 + vle16.v v\i, (t0) + vse16.v v16, (t0) +.endr + + li t1, 2896*8 +.irp i, 0, 1, 2, 3, 4, 5, 6, 7 + vsmul.vx v\i, v\i, t1 +.endr + +.ifc \variant, identity_ + j L(itx_8x16_epilog) +.else + jalr t0, a4 + +.irp i, 0, 1, 2, 3, 4, 5, 6, 7 + vssra.vi v\i, v\i, 1 +.endr + +L(itx_8x16_epilog): + addi t4, sp, -8*32 + vsseg8e16.v v0, (t4) + addi t0, t4, 8*16 + vsseg8e16.v v8, (t0) + + mv t5, a0 + li t6, 16 + jal a7, inv_txfm_add_vert_8x16_rvv + + ret +.endif +endfunc + +function inv_txfm_\variant\()add_16x8_rvv, export=1, ext=v + csrw vxrm, zero + + vsetivli zero, 8, e16, m1, ta, ma + vle16.v v0, (a2) + addi t0, a2, 16 + vle16.v v1, (t0) +.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + addi t0, t0, 16 + vle16.v v\i, (t0) +.endr + + li t1, 2896*8 +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vsmul.vx v\i, v\i, t1 +.endr + +.ifc \variant, identity_ + li t1, 2*(5793-4096)*8 +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vsmul.vx v16, v\i, t1 + vssra.vi v16, v16, 1 + vsadd.vv v\i, v\i, v16 +.endr + + j L(itx_16x8_epilog) +.else + jalr t0, a4 + +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + vssra.vi v\i, v\i, 1 +.endr + +L(itx_16x8_epilog): + li t0, 32 + vssseg8e16.v v0, (a2), t0 + addi t1, a2, 16 + vssseg8e16.v v8, (t1), t0 + +.irp j, 0, 8 + vsetivli zero, 8, e16, m1, ta, ma + + vmv.v.x v8, zero + addi t0, a2, \j*2 + vle16.v v0, (t0) + vse16.v v8, (t0) +.irp i, 1, 2, 3, 4, 5, 6, 7 + addi t0, t0, 32 + vle16.v v\i, (t0) + vse16.v v8, (t0) +.endr + + jalr t0, a5 + +.irp i, 0, 1, 2, 3, 4, 5, 6, 7 + vssra.vi v\i, v\i, 4 +.endr + + vsetvli zero, zero, e8, mf2, ta, ma + addi t0, a0, \j + vle8.v v8, (t0) +.irp i, 9, 10, 11, 12, 13, 14, 15 + add t0, t0, a1 + vle8.v v\i, (t0) +.endr + + vwaddu.wv v0, v0, v8 + vwaddu.wv v1, v1, v9 + vwaddu.wv v2, v2, v10 + vwaddu.wv v3, v3, v11 + vwaddu.wv v4, v4, v12 + vwaddu.wv v5, v5, v13 + vwaddu.wv v6, v6, v14 + vwaddu.wv v7, v7, v15 + + vsetvli zero, zero, e16, m1, ta, ma +.irp i, 0, 1, 2, 3, 4, 5, 6, 7 + vmax.vx v\i, v\i, zero +.endr + + vsetvli zero, zero, e8, mf2, ta, ma + + vnclipu.wi v8, v0, 0 + vnclipu.wi v9, v1, 0 + vnclipu.wi v10, v2, 0 + vnclipu.wi v11, v3, 0 + vnclipu.wi v12, v4, 0 + vnclipu.wi v13, v5, 0 + vnclipu.wi v14, v6, 0 + vnclipu.wi v15, v7, 0 + + addi t0, a0, \j + vse8.v v8, (t0) +.irp i, 9, 10, 11, 12, 13, 14, 15 + add t0, t0, a1 + vse8.v v\i, (t0) +.endr +.endr + + ret +.endif +endfunc +.endm + +def_fn_816_base identity_ +def_fn_816_base + +.macro def_fn_816 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1 +.ifnc \txfm1, identity + la a4, inv_\txfm1\()_e16_x\w\()_rvv +.endif + la a5, inv_\txfm2\()_e16_x\h\()_rvv +.if \w == 8 + li a6, \eob_half +.endif +.ifc \txfm1, identity + j inv_txfm_identity_add_\w\()x\h\()_rvv +.else + j inv_txfm_add_\w\()x\h\()_rvv +.endif +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct, 43 +def_fn_816 \w, \h, identity, identity, 43 +def_fn_816 \w, \h, dct, adst, 43 +def_fn_816 \w, \h, dct, flipadst, 43 +def_fn_816 \w, \h, dct, identity, 8 +def_fn_816 \w, \h, adst, dct, 43 +def_fn_816 \w, \h, adst, adst, 43 +def_fn_816 \w, \h, adst, flipadst, 43 +def_fn_816 \w, \h, flipadst, dct, 43 +def_fn_816 \w, \h, flipadst, adst, 43 +def_fn_816 \w, \h, flipadst, 
flipadst, 43 +def_fn_816 \w, \h, identity, dct, 64 +def_fn_816 \w, \h, adst, identity, 8 +def_fn_816 \w, \h, flipadst, identity, 8 +def_fn_816 \w, \h, identity, adst, 64 +def_fn_816 \w, \h, identity, flipadst, 64 +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 diff --git a/third_party/dav1d/src/riscv/asm.S b/third_party/dav1d/src/riscv/asm.S index 2435170acb..eed4d67bf5 100644 --- a/third_party/dav1d/src/riscv/asm.S +++ b/third_party/dav1d/src/riscv/asm.S @@ -123,4 +123,6 @@ EXTERN\name: end_thread_local .endm +#define L(x) .L ## x + #endif /* DAV1D_SRC_RISCV_ASM_S */ diff --git a/third_party/dav1d/src/riscv/itx.h b/third_party/dav1d/src/riscv/itx.h index 28c5e54d42..d3f9a03a03 100644 --- a/third_party/dav1d/src/riscv/itx.h +++ b/third_party/dav1d/src/riscv/itx.h @@ -58,7 +58,13 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) #define decl_itx_fns(ext) \ decl_itx17_fns( 4, 4, ext); \ +decl_itx16_fns( 4, 8, ext); \ +decl_itx16_fns( 4, 16, ext); \ +decl_itx16_fns( 8, 4, ext); \ decl_itx16_fns( 8, 8, ext); \ +decl_itx16_fns( 8, 16, ext); \ +decl_itx16_fns(16, 4, ext); \ +decl_itx16_fns(16, 8, ext); \ decl_itx16_fns(16, 16, ext) decl_itx_fns(rvv); @@ -105,7 +111,13 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in #if BITDEPTH == 8 assign_itx17_fn( , 4, 4, rvv); + assign_itx16_fn(R, 4, 8, rvv); + assign_itx16_fn(R, 4, 16, rvv); + assign_itx16_fn(R, 8, 4, rvv); assign_itx16_fn( , 8, 8, rvv); + assign_itx16_fn(R, 8, 16, rvv); + assign_itx16_fn(R, 16, 4, rvv); + assign_itx16_fn(R, 16, 8, rvv); assign_itx12_fn( , 16, 16, rvv); #endif } diff --git a/third_party/dav1d/src/x86/cdef_avx2.asm b/third_party/dav1d/src/x86/cdef_avx2.asm index 1f30f8a3b7..95d35fc1c8 100644 --- a/third_party/dav1d/src/x86/cdef_avx2.asm +++ b/third_party/dav1d/src/x86/cdef_avx2.asm @@ -398,7 +398,6 @@ SECTION .text INIT_YMM avx2 cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge -%assign stack_offset_entry stack_offset mov edged, edgem cmp edged, 0xf jne .border_block @@ -1195,9 +1194,9 @@ cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \ .border_block: DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge -%define rstk rsp -%assign stack_offset stack_offset_entry -%assign regs_used 11 + RESET_STACK_STATE + %assign stack_offset stack_offset - (regs_used - 11) * gprsize + %assign regs_used 11 ALLOC_STACK 2*16+(%2+4)*32, 16 %define px rsp+2*16+2*32 diff --git a/third_party/dav1d/src/x86/filmgrain16_avx2.asm b/third_party/dav1d/src/x86/filmgrain16_avx2.asm index a1d4c41f27..eda6035923 100644 --- a/third_party/dav1d/src/x86/filmgrain16_avx2.asm +++ b/third_party/dav1d/src/x86/filmgrain16_avx2.asm @@ -646,18 +646,9 @@ INIT_XMM avx2 INIT_YMM avx2 .ar2: %if WIN64 - ; xmm6 and xmm7 already saved - %assign xmm_regs_used 13 + %2 %assign stack_size_padded 136 SUB rsp, stack_size_padded - movaps [rsp+16*2], xmm8 - movaps [rsp+16*3], xmm9 - movaps [rsp+16*4], xmm10 - movaps [rsp+16*5], xmm11 - movaps [rsp+16*6], xmm12 -%if %2 - movaps [rsp+16*7], xmm13 -%endif + WIN64_PUSH_XMM 13 + %2, 8 %endif DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] @@ -747,20 +738,10 @@ INIT_YMM avx2 .ar3: %if WIN64 - ; xmm6 and xmm7 already saved %assign stack_offset 32 - %assign xmm_regs_used 14 + %2 %assign stack_size_padded 152 SUB rsp, stack_size_padded - movaps [rsp+16*2], xmm8 - movaps [rsp+16*3], xmm9 - movaps [rsp+16*4], xmm10 - movaps [rsp+16*5], xmm11 - movaps 
[rsp+16*6], xmm12 - movaps [rsp+16*7], xmm13 -%if %2 - movaps [rsp+16*8], xmm14 -%endif + WIN64_PUSH_XMM 14 + %2, 8 %endif DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] diff --git a/third_party/dav1d/src/x86/filmgrain16_sse.asm b/third_party/dav1d/src/x86/filmgrain16_sse.asm index 6b0daaac0b..25d01caa19 100644 --- a/third_party/dav1d/src/x86/filmgrain16_sse.asm +++ b/third_party/dav1d/src/x86/filmgrain16_sse.asm @@ -275,7 +275,6 @@ cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax .ar2: %if ARCH_X86_32 -%assign stack_offset_old stack_offset ALLOC_STACK -16*8 %endif DEFINE_ARGS buf, fg_data, bdmax, shift @@ -428,7 +427,6 @@ cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax %elif ARCH_X86_64 %define tmp rsp+stack_offset-72 %else -%assign stack_offset stack_offset_old ALLOC_STACK -16*12 %define tmp rsp mov bdmaxd, bdmaxm @@ -715,7 +713,6 @@ cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift %else DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift -%assign stack_offset_old stack_offset ALLOC_STACK -16*2 mov bufyq, r1m mov uvd, r3m @@ -831,9 +828,7 @@ cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x %else -%assign stack_offset stack_offset_old -%xdefine rstk rsp -%assign stack_size_padded 0 + RESET_STACK_STATE DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 mov bufyq, r1m mov uvd, r3m @@ -1159,7 +1154,6 @@ cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %endif %else DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift -%assign stack_offset stack_offset_old ALLOC_STACK -16*14 mov bufyq, r1m mov uvd, r3m diff --git a/third_party/dav1d/src/x86/filmgrain_avx2.asm b/third_party/dav1d/src/x86/filmgrain_avx2.asm index 55445cf593..91d8ca5c14 100644 --- a/third_party/dav1d/src/x86/filmgrain_avx2.asm +++ b/third_party/dav1d/src/x86/filmgrain_avx2.asm @@ -204,18 +204,9 @@ cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data .ar2: %if WIN64 - ; xmm6 and xmm7 already saved - %assign xmm_regs_used 16 %assign stack_size_padded 168 SUB rsp, stack_size_padded - movaps [rsp+16*2], xmm8 - movaps [rsp+16*3], xmm9 - movaps [rsp+16*4], xmm10 - movaps [rsp+16*5], xmm11 - movaps [rsp+16*6], xmm12 - movaps [rsp+16*7], xmm13 - movaps [rsp+16*8], xmm14 - movaps [rsp+16*9], xmm15 + WIN64_PUSH_XMM 16, 8 %endif DEFINE_ARGS buf, fg_data, h, x mov r6d, [fg_dataq+FGData.ar_coeff_shift] @@ -287,15 +278,9 @@ cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data INIT_YMM avx2 .ar3: %if WIN64 - ; xmm6 and xmm7 already saved - %assign stack_offset 16 ALLOC_STACK 16*14 %assign stack_size stack_size - 16*4 - %assign xmm_regs_used 12 - movaps [rsp+16*12], xmm8 - movaps [rsp+16*13], xmm9 - movaps [rsp+16*14], xmm10 - movaps [rsp+16*15], xmm11 + WIN64_PUSH_XMM 12, 8 %else ALLOC_STACK 16*12 %endif diff --git a/third_party/dav1d/src/x86/filmgrain_sse.asm b/third_party/dav1d/src/x86/filmgrain_sse.asm index 0172f98760..d06e349a8c 100644 --- a/third_party/dav1d/src/x86/filmgrain_sse.asm +++ b/third_party/dav1d/src/x86/filmgrain_sse.asm @@ -232,7 +232,6 @@ cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data .ar2: %if ARCH_X86_32 -%assign stack_offset_old stack_offset ALLOC_STACK -16*8 %endif DEFINE_ARGS buf, fg_data, shift @@ -333,7 +332,6 @@ cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data .ar3: DEFINE_ARGS buf, fg_data, shift %if 
ARCH_X86_32 -%assign stack_offset stack_offset_old ALLOC_STACK -16*14 %elif WIN64 SUB rsp, 16*6 @@ -601,7 +599,6 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift movifnidn bufyq, bufymp %if ARCH_X86_32 -%assign stack_offset_old stack_offset ALLOC_STACK -2*16 %endif imul uvd, 28 @@ -738,9 +735,7 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat .ar1: %if ARCH_X86_32 -%assign stack_offset stack_offset_old -%assign stack_size_padded 0 -%xdefine rstk rsp + RESET_STACK_STATE %endif DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x imul uvd, 28 @@ -881,9 +876,6 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat .ar2: %if ARCH_X86_32 -%assign stack_offset stack_offset_old -%assign stack_size_padded 0 -%xdefine rstk rsp ALLOC_STACK -8*16 %endif DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift @@ -1014,9 +1006,7 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat .ar3: %if ARCH_X86_32 -%assign stack_offset stack_offset_old -%assign stack_size_padded 0 -%xdefine rstk rsp + RESET_STACK_STATE %endif DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift movifnidn bufyq, bufymp diff --git a/third_party/dav1d/src/x86/ipred16_avx2.asm b/third_party/dav1d/src/x86/ipred16_avx2.asm index f4931e977b..7b52abaa10 100644 --- a/third_party/dav1d/src/x86/ipred16_avx2.asm +++ b/third_party/dav1d/src/x86/ipred16_avx2.asm @@ -946,7 +946,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights jg .w4_loop RET .w8: -%assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 vpbroadcastw m0, [tlq] ; bottom vbroadcasti128 m7, [tlq+hq*2+2] @@ -974,7 +973,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights jg .w8_loop RET .w16: -%assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 11 vpbroadcastw m0, [tlq] ; bottom movu m7, [tlq+hq*2+2] @@ -1005,7 +1003,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights jg .w16_loop RET .w32: -%assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastw m0, [tlq] ; bottom movu m7, [tlq+hq*2+ 2] @@ -1047,7 +1044,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights jg .w32_loop RET .w64: -%assign stack_offset stack_offset - stack_size_padded PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base mov dst_baseq, dstq mov tl_baseq, tlq @@ -1104,7 +1100,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights RET cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase - %assign org_stack_offset stack_offset lea r6, [ipred_z1_16bpc_avx2_table] tzcnt wd, wm movifnidn angled, anglem @@ -1312,7 +1307,6 @@ ALIGN function_align .w4_end: RET .w8: - %assign stack_offset org_stack_offset ALLOC_STACK -64, 7 lea r3d, [angleq+216] mov r3b, hb @@ -1476,7 +1470,6 @@ ALIGN function_align or maxbased, 16 ; imin(h+15, 31) jmp .w16_main .w16: - %assign stack_offset org_stack_offset ALLOC_STACK -96, 7 lea maxbased, [hq+15] test angled, 0x400 @@ -1622,7 +1615,6 @@ ALIGN function_align .w16_end: RET .w32: - %assign stack_offset org_stack_offset ALLOC_STACK -160, 8 lea maxbased, [hq+31] mov r3d, 63 @@ -1737,7 +1729,6 @@ ALIGN function_align .w32_end: RET .w64: - %assign stack_offset org_stack_offset ALLOC_STACK -256, 10 lea maxbased, [hq+63] test angled, 0x400 @@ -2691,7 +2682,6 @@ ALIGN function_align jmp 
.w32_filter_above cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase - %assign org_stack_offset stack_offset lea r6, [ipred_z3_16bpc_avx2_table] tzcnt hd, hm movifnidn angled, anglem @@ -2907,7 +2897,6 @@ ALIGN function_align RET .h8: lea r4d, [angleq+216] - %assign stack_offset org_stack_offset ALLOC_STACK -64, 8 mov r4b, wb lea r7, [strideq*3] @@ -3155,7 +3144,6 @@ ALIGN function_align jmp .h16_main ALIGN function_align .h16: - %assign stack_offset org_stack_offset ALLOC_STACK -96, 10 lea maxbased, [wq+15] lea r7, [strideq*3] @@ -3372,7 +3360,6 @@ ALIGN function_align .h16_end: RET .h32: - %assign stack_offset org_stack_offset ALLOC_STACK -160, 9 lea maxbased, [wq+31] and maxbased, 31 @@ -3557,7 +3544,6 @@ ALIGN function_align .h32_end: RET .h64: - %assign stack_offset org_stack_offset ALLOC_STACK -256, 10 lea maxbased, [wq+63] test angled, 0x400 @@ -3804,7 +3790,6 @@ ALIGN function_align ; 5 8 8 i cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter -%assign org_stack_offset stack_offset %define base r6-ipred_filter_16bpc_avx2_table lea r6, [filter_intra_taps] tzcnt wd, wm @@ -3846,7 +3831,6 @@ cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter RET ALIGN function_align .w8: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vbroadcasti128 m14, [base+filter_shuf3] vpbroadcastw m15, r8m ; bitdepth_max @@ -3883,7 +3867,6 @@ ALIGN function_align RET ALIGN function_align .w16: - %assign stack_offset stack_offset - stack_size_padded ALLOC_STACK 32, 16 vpbroadcastw m15, r8m ; bitdepth_max sub hd, 2 @@ -3977,7 +3960,6 @@ ALIGN function_align ret ALIGN function_align .w32: - %assign stack_offset org_stack_offset ALLOC_STACK 64, 16 vpbroadcastw m15, r8m ; bitdepth_max sub hd, 2 diff --git a/third_party/dav1d/src/x86/ipred_avx2.asm b/third_party/dav1d/src/x86/ipred_avx2.asm index 58e40935ac..35738e7c0b 100644 --- a/third_party/dav1d/src/x86/ipred_avx2.asm +++ b/third_party/dav1d/src/x86/ipred_avx2.asm @@ -772,7 +772,6 @@ ALIGN function_align RET ALIGN function_align .w32: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 6 movu m3, [tlq+1] punpcklbw m2, m3, m5 @@ -823,29 +822,17 @@ ALIGN function_align jl .w64_loop RET -%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used - %assign stack_offset 0 - %assign stack_size_padded 0 - %assign regs_used %2 - %xdefine rstk rsp - SETUP_STACK_POINTER %1 - %if regs_used != %2 && WIN64 - PUSH r%2 - %endif - ALLOC_STACK %1, %3 -%endmacro - cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h -%define base r6-ipred_smooth_h_avx2_table - lea r6, [ipred_smooth_h_avx2_table] +%define base r5-ipred_smooth_h_avx2_table + lea r5, [ipred_smooth_h_avx2_table] mov wd, wm vpbroadcastb m3, [tlq+wq] ; right tzcnt wd, wd mov hd, hm - movsxd wq, [r6+wq*4] + movsxd wq, [r5+wq*4] vpbroadcastd m4, [base+pb_127_m127] vpbroadcastd m5, [base+pw_128] - add wq, r6 + add wq, r5 jmp wq .w4: WIN64_SPILL_XMM 8 @@ -891,7 +878,6 @@ cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h RET ALIGN function_align .w8: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 8 vbroadcasti128 m6, [base+smooth_weights+8*2] mova m7, [base+ipred_h_shuf] @@ -927,7 +913,7 @@ ALIGN function_align RET ALIGN function_align .w16: - SETUP_STACK_FRAME 32*4, 7, 8 + ALLOC_STACK 32*4, 8 lea r3, [rsp+64*2-4] call .prep ; only worthwhile for for w16 and above sub tlq, 2 @@ -951,7 +937,7 @@ ALIGN function_align RET ALIGN function_align .w32: - SETUP_STACK_FRAME 32*4, 
7, 6 + ALLOC_STACK 32*4 lea r3, [rsp+64*2-2] call .prep dec tlq @@ -971,19 +957,19 @@ ALIGN function_align RET ALIGN function_align .w64: - SETUP_STACK_FRAME 32*4, 7, 9 + ALLOC_STACK 32*4, 9 lea r3, [rsp+64*2-2] call .prep - add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table + add r5, smooth_weights+16*15-ipred_smooth_h_avx2_table dec tlq - mova xm5, [r6-16*7] - vinserti128 m5, [r6-16*5], 1 - mova xm6, [r6-16*6] - vinserti128 m6, [r6-16*4], 1 - mova xm7, [r6-16*3] - vinserti128 m7, [r6-16*1], 1 - mova xm8, [r6-16*2] - vinserti128 m8, [r6-16*0], 1 + mova xm5, [r5-16*7] + vinserti128 m5, [r5-16*5], 1 + mova xm6, [r5-16*6] + vinserti128 m6, [r5-16*4], 1 + mova xm7, [r5-16*3] + vinserti128 m7, [r5-16*1], 1 + mova xm8, [r5-16*2] + vinserti128 m8, [r5-16*0], 1 .w64_loop: vpbroadcastb m2, [tlq+hq] punpcklbw m2, m3 @@ -1113,7 +1099,6 @@ cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights RET ALIGN function_align .w8: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 mova m10, [base+ipred_h_shuf] vbroadcasti128 m11, [base+smooth_weights+8*2] @@ -1157,7 +1142,9 @@ ALIGN function_align RET ALIGN function_align .w16: - SETUP_STACK_FRAME 32*4, 7, 14 + %assign regs_used 4 + ALLOC_STACK -32*4, 14 + %assign regs_used 7 vbroadcasti128 m11, [tlq+1] lea r3, [rsp+64*2-4] punpcklbw m10, m11, m0 ; top, bottom @@ -1197,7 +1184,9 @@ ALIGN function_align RET ALIGN function_align .w32: - SETUP_STACK_FRAME 32*4, 7, 11 + %assign regs_used 4 + ALLOC_STACK -32*4, 11 + %assign regs_used 7 movu m8, [tlq+1] lea r3, [rsp+64*2-2] punpcklbw m7, m8, m0 @@ -1232,7 +1221,9 @@ ALIGN function_align RET ALIGN function_align .w64: - SETUP_STACK_FRAME 32*8, 7, 16 + %assign regs_used 4 + ALLOC_STACK -32*8, 16 + %assign regs_used 7 movu m13, [tlq+1 ] movu m15, [tlq+33] add r6, smooth_weights+16*15-ipred_smooth_avx2_table @@ -1316,7 +1307,6 @@ ALIGN function_align ret cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase - %assign org_stack_offset stack_offset lea r6, [ipred_z1_avx2_table] tzcnt wd, wm movifnidn angled, anglem @@ -1415,7 +1405,6 @@ ALIGN function_align pmovmskb r5d, m1 ret .w4_no_upsample: - %assign stack_offset org_stack_offset ALLOC_STACK -16, 11 mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter @@ -1522,7 +1511,6 @@ ALIGN function_align mov r3b, hb cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 - %assign stack_offset org_stack_offset ALLOC_STACK -32, 8 movu xm2, [z_filter_s+6] mova xm0, [tlq-1] @@ -1592,7 +1580,6 @@ ALIGN function_align or maxbased, 8 ; imin(h+7, 15) jmp .w8_main .w8_no_upsample: - %assign stack_offset org_stack_offset ALLOC_STACK -32, 10 lea maxbased, [hq+7] test angled, 0x400 @@ -1696,7 +1683,6 @@ ALIGN function_align jmp .w16_main ALIGN function_align .w16: - %assign stack_offset org_stack_offset ALLOC_STACK -64, 12 lea maxbased, [hq+15] test angled, 0x400 @@ -1816,7 +1802,6 @@ ALIGN function_align RET ALIGN function_align .w32: - %assign stack_offset org_stack_offset ALLOC_STACK -96, 15 lea r3d, [hq+31] mov maxbased, 63 @@ -1960,7 +1945,6 @@ ALIGN function_align RET ALIGN function_align .w64: - %assign stack_offset org_stack_offset ALLOC_STACK -128, 16 lea maxbased, [hq+63] test angled, 0x400 ; !enable_intra_edge_filter @@ -3001,7 +2985,6 @@ ALIGN function_align jmp .w32_filter_above cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase - %assign org_stack_offset stack_offset lea r6, [ipred_z3_avx2_table] tzcnt hd, hm movifnidn angled, anglem @@ 
-3102,7 +3085,6 @@ ALIGN function_align pmovmskb r5d, m1 ret .h4_no_upsample: - %assign stack_offset org_stack_offset ALLOC_STACK -16, 12 mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter @@ -3215,7 +3197,6 @@ ALIGN function_align mov r4b, wb cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 - %assign stack_offset org_stack_offset ALLOC_STACK -32, 8 and r4d, 4 mova xm0, [tlq-15] @@ -3297,7 +3278,6 @@ ALIGN function_align or maxbased, 8 ; imin(w+7, 15) jmp .h8_main .h8_no_upsample: - %assign stack_offset org_stack_offset ALLOC_STACK -32, 10 lea maxbased, [wq+7] test angled, 0x400 @@ -3455,7 +3435,6 @@ ALIGN function_align jmp .h16_main ALIGN function_align .h16: - %assign stack_offset org_stack_offset ALLOC_STACK -64, 12 lea maxbased, [wq+15] test angled, 0x400 @@ -3661,7 +3640,6 @@ ALIGN function_align RET ALIGN function_align .h32: - %assign stack_offset org_stack_offset ALLOC_STACK -96, 15 lea maxbased, [wq+31] and maxbased, 31 @@ -3890,7 +3868,6 @@ ALIGN function_align RET ALIGN function_align .h64: - %assign stack_offset org_stack_offset ALLOC_STACK -128, 16 lea maxbased, [wq+63] test angled, 0x400 ; !enable_intra_edge_filter @@ -4221,6 +4198,7 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter movzx filterd, byte filterm %endif shl filterd, 6 + WIN64_SPILL_XMM 9, 15 add filterq, r6 lea r6, [ipred_filter_avx2_table] movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4 @@ -4234,7 +4212,6 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter mov hd, hm jmp wq .w4: - WIN64_SPILL_XMM 9 mova xm8, [base+filter_shuf2] sub tlq, 3 sub tlq, hq @@ -4251,8 +4228,7 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter RET ALIGN function_align .w8: - %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 10 + WIN64_PUSH_XMM 10 mova m8, [base+filter_shuf1] FILTER_XMM 7, 0, 6, [base+filter_shuf2] vpbroadcastd m0, [tlq+4] @@ -4278,26 +4254,18 @@ ALIGN function_align RET ALIGN function_align .w16: -%if WIN64 - %assign stack_offset stack_offset - stack_size_padded - %assign xmm_regs_used 15 - %assign stack_size_padded 0x98 - SUB rsp, stack_size_padded -%endif sub hd, 2 - TAIL_CALL .w16_main, 0 -.w16_main: + call .w16_main %if WIN64 - movaps [rsp+0xa8], xmm6 - movaps [rsp+0xb8], xmm7 - movaps [rsp+0x28], xmm8 - movaps [rsp+0x38], xmm9 - movaps [rsp+0x48], xmm10 - movaps [rsp+0x58], xmm11 - movaps [rsp+0x68], xmm12 - movaps [rsp+0x78], xmm13 - movaps [rsp+0x88], xmm14 + jmp .end +%else + RET %endif +.w16_main: + ; The spills are into the callers stack frame + %assign stack_size stack_size + gprsize + WIN64_PUSH_XMM 15, 9 + %assign stack_size stack_size - gprsize FILTER_XMM 12, 0, 7, [base+filter_shuf2] vpbroadcastd m0, [tlq+5] vpblendd m0, [tlq-12], 0x14 @@ -4350,7 +4318,6 @@ ALIGN function_align ret ALIGN function_align .w32: - sub rsp, stack_size_padded sub hd, 2 lea r3, [dstq+16] lea r5d, [hq-2] @@ -4415,6 +4382,7 @@ ALIGN function_align shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm6 +.end: RET ALIGN function_align .main: diff --git a/third_party/dav1d/src/x86/ipred_sse.asm b/third_party/dav1d/src/x86/ipred_sse.asm index 976f33a24b..f6b0cad001 100644 --- a/third_party/dav1d/src/x86/ipred_sse.asm +++ b/third_party/dav1d/src/x86/ipred_sse.asm @@ -670,10 +670,7 @@ ALIGN function_align RET ALIGN function_align .w32: -%if WIN64 - movaps [rsp+24], xmm7 - %define xmm_regs_used 8 -%endif + WIN64_PUSH_XMM 8, 7 mova m7, m5 .w32_loop_init: mov r3d, 2 @@ -705,10 +702,7 
@@ ALIGN function_align RET ALIGN function_align .w64: -%if WIN64 - movaps [rsp+24], xmm7 - %define xmm_regs_used 8 -%endif + WIN64_PUSH_XMM 8, 7 mova m7, m5 .w64_loop_init: mov r3d, 4 diff --git a/third_party/dav1d/src/x86/looprestoration_sse.asm b/third_party/dav1d/src/x86/looprestoration_sse.asm index 01eb6fa348..b5c73a51d4 100644 --- a/third_party/dav1d/src/x86/looprestoration_sse.asm +++ b/third_party/dav1d/src/x86/looprestoration_sse.asm @@ -42,7 +42,6 @@ pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_right_ext_mask: times 24 db 0xff times 8 db 0 pb_1: times 16 db 1 -pb_3: times 16 db 3 pw_256: times 8 dw 256 pw_2056: times 8 dw 2056 pw_m16380: times 8 dw -16380 @@ -290,7 +289,7 @@ cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstrid call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v jmp .v1 .extend_right: - movd m2, [lpfq-4] + movd m2, [lpfq-1] %if ARCH_X86_64 push r0 lea r0, [pb_right_ext_mask+21] @@ -302,10 +301,11 @@ cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstrid movu m1, [r6+xq+8] %endif %if cpuflag(ssse3) - pshufb m2, [base+pb_3] + pxor m3, m3 + pshufb m2, m3 %else punpcklbw m2, m2 - pshuflw m2, m2, q3333 + pshuflw m2, m2, q0000 punpcklqdq m2, m2 %endif pand m4, m0 diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm index 61eeaa1007..42e2a5525e 100644 --- a/third_party/dav1d/src/x86/mc16_avx2.asm +++ b/third_party/dav1d/src/x86/mc16_avx2.asm @@ -1337,7 +1337,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my cmp wd, 4 je .h_w4 jl .h_w2 - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 13 shr mxd, 16 sub srcq, 6 @@ -1415,7 +1414,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my cmp hd, 4 cmovle myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastd m6, [pd_32] vpbroadcastw m7, r8m @@ -1590,7 +1588,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my jg .v_w8_loop0 RET .hv: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vpbroadcastw m15, r8m cmp wd, 4 @@ -2046,7 +2043,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my shr mxd, 16 sub srcq, 6 vpbroadcastq m0, [base+subpel_filters+mxq*8] - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 vbroadcasti128 m6, [subpel_h_shufA] vbroadcasti128 m7, [subpel_h_shufB] @@ -2125,7 +2121,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my cmp hd, 4 cmovle myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastd m7, [prep_8tap_1d_rnd] lea r6, [strideq*3] @@ -2264,7 +2259,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %endif RET .hv: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vpbroadcastd m15, [prep_8tap_2d_rnd] cmp wd, 4 diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm index 585ba53e08..e5de7ecd96 100644 --- a/third_party/dav1d/src/x86/mc16_avx512.asm +++ b/third_party/dav1d/src/x86/mc16_avx512.asm @@ -2377,7 +2377,6 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my jg .hv_w16_loop RET .hv_w32: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 32 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] @@ -3175,7 +3174,6 @@ 
cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 jg .hv_w8_loop RET .hv_w16: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 27 vbroadcasti32x8 m5, [srcq+strideq*0+ 8] vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 @@ -3313,7 +3311,6 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 RET .hv_w32: %if WIN64 - %assign stack_offset stack_offset - stack_size_padded PUSH r8 %assign regs_used regs_used + 1 WIN64_SPILL_XMM 32 diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm index fde8e372a3..b0c42597f7 100644 --- a/third_party/dav1d/src/x86/mc16_sse.asm +++ b/third_party/dav1d/src/x86/mc16_sse.asm @@ -1302,10 +1302,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my jg .h_w4_loop RET .h_w8: -%if WIN64 - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 -%endif shr mxd, 16 movq m3, [base+subpel_filters+mxq*8] movifnidn dstq, dstmp @@ -1383,14 +1380,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my cmp hd, 6 cmovb myd, mxd movq m3, [base+subpel_filters+myq*8] -%if STACK_ALIGNMENT < 16 - %xdefine rstk rsp -%else - %assign stack_offset stack_offset - stack_size_padded -%endif -%if WIN64 WIN64_SPILL_XMM 15 -%endif movd m7, r8m movifnidn dstq, dstmp movifnidn dsq, dsmp @@ -1604,11 +1594,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my jg .v_w4_loop0 RET .hv: -%if STACK_ALIGNMENT < 16 - %xdefine rstk rsp -%else - %assign stack_offset stack_offset - stack_size_padded -%endif + RESET_STACK_STATE %if ARCH_X86_32 movd m4, r8m mova m6, [base+pd_512] @@ -1750,11 +1736,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my cmovb myd, mxd movq m3, [base+subpel_filters+myq*8] %if ARCH_X86_32 -%if STACK_ALIGNMENT < 16 - %xdefine rstk rsp -%else - %assign stack_offset stack_offset - stack_size_padded -%endif + RESET_STACK_STATE mov dstq, dstmp mov dsq, dsmp mova m0, [base+spel_h_shufA] @@ -2182,11 +2164,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my cmp hd, 4 cmove myd, mxd movq m3, [base+subpel_filters+myq*8] -%if STACK_ALIGNMENT < 16 - %xdefine rstk rsp -%else - %assign stack_offset stack_offset - stack_size_padded -%endif WIN64_SPILL_XMM 15 movddup m7, [base+prep_8tap_1d_rnd] movifnidn ssq, r2mp @@ -2339,11 +2316,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my jg .v_loop0 RET .hv: -%if STACK_ALIGNMENT < 16 - %xdefine rstk rsp -%else - %assign stack_offset stack_offset - stack_size_padded -%endif + RESET_STACK_STATE movzx t3d, mxb shr mxd, 16 cmp wd, 4 diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm index 3b208033bd..58e3cb5af1 100644 --- a/third_party/dav1d/src/x86/mc_avx2.asm +++ b/third_party/dav1d/src/x86/mc_avx2.asm @@ -1259,7 +1259,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 7 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 @@ -1620,7 +1619,6 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .h_loop RET .v: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 movzx mxd, myb shr myd, 16 @@ -1834,7 +1832,6 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .v_w16_loop0 RET .hv: - %assign stack_offset 
stack_offset - stack_size_padded WIN64_SPILL_XMM 16 cmp wd, 4 jg .hv_w8 @@ -2247,7 +2244,6 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .h_loop RET .v: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. shr myd, 16 ; Note that the code is 8-tap only, having @@ -2430,8 +2426,6 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .v_w16_loop0 RET .hv: - %assign stack_offset stack_offset - stack_size_padded - %assign stack_size_padded 0 WIN64_SPILL_XMM 16 cmp wd, 4 je .hv_w4 @@ -4108,10 +4102,9 @@ cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \ beta, filter, tmp1, delta, my, gamma %if WIN64 - sub rsp, 0xa0 %assign xmm_regs_used 16 %assign stack_size_padded 0xa0 - %assign stack_offset stack_offset+stack_size_padded + SUB rsp, stack_size_padded %endif call .main jmp .start @@ -4134,21 +4127,13 @@ cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, RET ALIGN function_align .main: - ; Stack args offset by one (r4m -> r5m etc.) due to call -%if WIN64 - mov abcdq, r5m - mov mxd, r6m - movaps [rsp+stack_offset+0x10], xmm6 - movaps [rsp+stack_offset+0x20], xmm7 - movaps [rsp+0x28], xmm8 - movaps [rsp+0x38], xmm9 - movaps [rsp+0x48], xmm10 - movaps [rsp+0x58], xmm11 - movaps [rsp+0x68], xmm12 - movaps [rsp+0x78], xmm13 - movaps [rsp+0x88], xmm14 - movaps [rsp+0x98], xmm15 -%endif + ; Stack is offset due to call + %assign stack_offset stack_offset + gprsize + %assign stack_size stack_size + gprsize + %assign stack_size_padded stack_size_padded + gprsize + movifnidn abcdq, abcdmp + movifnidn mxd, mxm + WIN64_PUSH_XMM movsx alphad, word [abcdq+2*0] movsx betad, word [abcdq+2*1] mova m12, [warp_8x8_shufA] @@ -4162,7 +4147,7 @@ ALIGN function_align lea tmp2d, [alphaq*3] sub srcq, tmp1q ; src -= src_stride*3 + 3 sub betad, tmp2d ; beta -= alpha*3 - mov myd, r7m + mov myd, r6m call .h psrld m1, m0, 16 call .h diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm index 7897f1decc..f9043f1ad3 100644 --- a/third_party/dav1d/src/x86/mc_avx512.asm +++ b/third_party/dav1d/src/x86/mc_avx512.asm @@ -1276,7 +1276,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 7 movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 @@ -2853,8 +2852,6 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .v_loop0 RET .hv: - %assign stack_offset stack_offset - stack_size_padded - %assign stack_size_padded 0 WIN64_SPILL_XMM 16 cmp wd, 4 je .hv_w4 diff --git a/third_party/dav1d/src/x86/mc_sse.asm b/third_party/dav1d/src/x86/mc_sse.asm index 54939c647a..a447a80161 100644 --- a/third_party/dav1d/src/x86/mc_sse.asm +++ b/third_party/dav1d/src/x86/mc_sse.asm @@ -1199,7 +1199,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 RET .v: %if notcpuflag(ssse3) - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 8 %endif movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] @@ -1375,7 +1374,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my 
* (src[x + src_stride] - src[x])) + 8) >> 4) movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] -%assign stack_offset stack_offset - stack_size_padded %if cpuflag(ssse3) imul mxyd, 0x08000800 WIN64_SPILL_XMM 8 @@ -1592,7 +1590,6 @@ FN put_8tap, regular, REGULAR, REGULAR %endif cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 -%assign org_stack_offset stack_offset imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h %if ARCH_X86_64 @@ -1618,7 +1615,6 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 movzx wd, word [base_reg+wq*2+table_offset(put,)] add wq, base_reg ; put_bilin mangling jump -%assign stack_offset org_stack_offset movifnidn dsq, dsmp movifnidn ssq, ssmp %if WIN64 @@ -1792,7 +1788,6 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 cmovs ssd, mxd movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] %else - %assign stack_offset org_stack_offset WIN64_SPILL_XMM 16 movzx mxd, myb shr myd, 16 @@ -2048,7 +2043,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %undef subpel2 %undef subpel3 .hv: - %assign stack_offset org_stack_offset + RESET_STACK_STATE cmp wd, 4 jg .hv_w8 %if ARCH_X86_32 @@ -2369,7 +2364,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %undef subpelv2 %undef subpelv3 .hv_w8: - %assign stack_offset org_stack_offset + RESET_STACK_STATE %define hv8_line_1 0 %define hv8_line_2 1 %define hv8_line_3 2 @@ -2843,7 +2838,6 @@ FN prep_8tap, regular, REGULAR, REGULAR %define base 0 %endif cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 -%assign org_stack_offset stack_offset imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 @@ -2862,7 +2856,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 add wq, base_reg movifnidn strided, stridem lea r6, [strideq*3] - %assign stack_offset org_stack_offset %if WIN64 pop r8 pop r7 @@ -3095,7 +3088,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mov mxd, myd and mxd, 0x7f %else - %assign stack_offset org_stack_offset WIN64_SPILL_XMM 16 movzx mxd, myb %endif @@ -3359,7 +3351,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %undef subpel2 %undef subpel3 .hv: - %assign stack_offset org_stack_offset + RESET_STACK_STATE cmp wd, 4 jg .hv_w8 and mxd, 0x7f @@ -3659,7 +3651,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %undef subpelv2 %undef subpelv3 .hv_w8: - %assign stack_offset org_stack_offset + RESET_STACK_STATE %define hv8_line_1 0 %define hv8_line_2 1 %define hv8_line_3 2 diff --git a/third_party/dav1d/src/x86/msac.asm b/third_party/dav1d/src/x86/msac.asm index 9f05c921a6..4156efe914 100644 --- a/third_party/dav1d/src/x86/msac.asm +++ b/third_party/dav1d/src/x86/msac.asm @@ -143,10 +143,9 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6 mov esp, [esp] %endif %endif - not t4 sub t2d, t1d ; rng shl t1, gprsize*8-16 - add t4, t1 ; ~dif + sub t4, t1 ; dif - v .renorm3: mov t1d, [t0+msac.cnt] movifnidn t7, t0 @@ -157,33 +156,31 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6 shl t2d, cl shl t4, cl mov [t7+msac.rng], t2d - not t4 sub t1d, ecx jae .end ; no refill required ; refill: - mov t2, [t7+msac.buf] - mov rcx, [t7+msac.end] %if ARCH_X86_64 == 0 push t5 %endif - lea t5, [t2+gprsize] - cmp t5, rcx + mov t2, [t7+msac.buf] + mov t5, [t7+msac.end] + lea rcx, [t2+gprsize] + sub rcx, t5 ja .refill_eob - mov t2, [t2] - lea ecx, [t1+23] - add t1d, 16 - shr ecx, 3 ; 
shift_bytes - bswap t2 - sub t5, rcx - shl ecx, 3 ; shift_bits - shr t2, cl - sub ecx, t1d ; shift_bits - 16 - cnt - mov t1d, gprsize*8-16 - shl t2, cl - mov [t7+msac.buf], t5 - sub t1d, ecx ; cnt + gprsize*8 - shift_bits - xor t4, t2 + mov t5, [t2] + lea ecx, [t1+16-gprsize*8] + not t5 + bswap t5 + shr t5, cl + neg ecx + shr ecx, 3 ; num_bytes_read + or t4, t5 +.refill_end: + add t2, rcx + lea t1d, [t1+rcx*8] ; cnt += num_bits_read + mov [t7+msac.buf], t2 +.refill_end2: %if ARCH_X86_64 == 0 pop t5 %endif @@ -191,29 +188,35 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6 mov [t7+msac.cnt], t1d mov [t7+msac.dif], t4 RET +.pad_with_ones: + lea ecx, [t1-16] +%if ARCH_X86_64 + ror rcx, cl +%else + shr ecx, cl +%endif + or t4, rcx + jmp .refill_end2 .refill_eob: ; avoid overreading the input buffer - mov t5, rcx - mov ecx, gprsize*8-24 - sub ecx, t1d ; c -.refill_eob_loop: cmp t2, t5 - jae .refill_eob_end ; eob reached - movzx t1d, byte [t2] - inc t2 - shl t1, cl - xor t4, t1 - sub ecx, 8 - jge .refill_eob_loop -.refill_eob_end: - mov t1d, gprsize*8-24 -%if ARCH_X86_64 == 0 - pop t5 -%endif - sub t1d, ecx - mov [t7+msac.buf], t2 - mov [t7+msac.dif], t4 - mov [t7+msac.cnt], t1d - RET + jae .pad_with_ones ; eob reached + ; We can safely do a register-sized load of the last bytes of the buffer + ; as this code is only reached if the msac buffer size is >= gprsize. + mov t5, [t5-gprsize] + shl ecx, 3 + shr t5, cl + lea ecx, [t1+16-gprsize*8] + not t5 + bswap t5 + shr t5, cl + neg ecx + or t4, t5 + mov t5d, [t7+msac.end] + shr ecx, 3 + sub t5d, t2d ; num_bytes_left + cmp ecx, t5d + cmovae ecx, t5d ; num_bytes_read + jmp .refill_end cglobal msac_decode_symbol_adapt8, 0, 6, 6 DECODE_SYMBOL_ADAPT_INIT @@ -366,7 +369,6 @@ cglobal msac_decode_bool_adapt, 0, 6, 0 %if ARCH_X86_64 == 0 movzx eax, al %endif - not t4 test t3d, t3d jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3 %if UNIX64 == 0 @@ -420,7 +422,6 @@ cglobal msac_decode_bool_equi, 0, 6, 0 mov ecx, 0xbfff setb al ; the upper 32 bits contains garbage but that's OK sub ecx, t2d - not t4 ; In this case of this function, (d =) 16 - clz(v) = 2 - (v >> 14) ; i.e. 
(0 <= d <= 2) and v < (3 << 14) shr ecx, 14 ; d @@ -447,7 +448,6 @@ cglobal msac_decode_bool, 0, 6, 0 cmovb t2d, t1d cmovb t4, t3 setb al - not t4 %if ARCH_X86_64 == 0 movzx eax, al %endif @@ -497,48 +497,45 @@ cglobal msac_decode_bool, 0, 6, 0 tzcnt eax, eax movzx ecx, word [buf+rax+16] movzx t2d, word [buf+rax+14] - not t4 %if ARCH_X86_64 add t6d, 5 %endif sub eax, 5 ; setup for merging the tok_br and tok branches sub t2d, ecx shl rcx, gprsize*8-16 - add t4, rcx + sub t4, rcx bsr ecx, t2d xor ecx, 15 shl t2d, cl shl t4, cl movd m2, t2d mov [t7+msac.rng], t2d - not t4 sub t5d, ecx jae %%end - mov t2, [t7+msac.buf] - mov rcx, [t7+msac.end] %if UNIX64 == 0 push t8 %endif - lea t8, [t2+gprsize] - cmp t8, rcx + mov t2, [t7+msac.buf] + mov t8, [t7+msac.end] + lea rcx, [t2+gprsize] + sub rcx, t8 ja %%refill_eob - mov t2, [t2] - lea ecx, [t5+23] - add t5d, 16 + mov t8, [t2] + lea ecx, [t5+16-gprsize*8] + not t8 + bswap t8 + shr t8, cl + neg ecx shr ecx, 3 - bswap t2 - sub t8, rcx - shl ecx, 3 - shr t2, cl - sub ecx, t5d - mov t5d, gprsize*8-16 - shl t2, cl - mov [t7+msac.buf], t8 + or t4, t8 +%%refill_end: + add t2, rcx + lea t5d, [t5+rcx*8] + mov [t7+msac.buf], t2 +%%refill_end2: %if UNIX64 == 0 pop t8 %endif - sub t5d, ecx - xor t4, t2 %%end: movp m3, t4 %if ARCH_X86_64 @@ -559,27 +556,34 @@ cglobal msac_decode_bool, 0, 6, 0 shr eax, 1 mov [t7+msac.cnt], t5d RET +%%pad_with_ones: + ; ensure that dif is padded with at least 15 bits of ones at the end + lea ecx, [t5-16] +%if ARCH_X86_64 + ror rcx, cl +%else + shr ecx, cl +%endif + or t4, rcx + jmp %%refill_end2 %%refill_eob: - mov t8, rcx - mov ecx, gprsize*8-24 - sub ecx, t5d -%%refill_eob_loop: cmp t2, t8 - jae %%refill_eob_end - movzx t5d, byte [t2] - inc t2 - shl t5, cl - xor t4, t5 - sub ecx, 8 - jge %%refill_eob_loop -%%refill_eob_end: -%if UNIX64 == 0 - pop t8 -%endif - mov t5d, gprsize*8-24 - mov [t7+msac.buf], t2 - sub t5d, ecx - jmp %%end + jae %%pad_with_ones + mov t8, [t8-gprsize] + shl ecx, 3 + shr t8, cl + lea ecx, [t5+16-gprsize*8] + not t8 + bswap t8 + shr t8, cl + neg ecx + or t4, t8 + mov t8d, [t7+msac.end] + shr ecx, 3 + sub t8d, t2d + cmp ecx, t8d + cmovae ecx, t8d + jmp %%refill_end %endmacro cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6 |
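In the msac hunks above, the repeated "not t4" instructions disappear because dif is now kept in its natural orientation: the scaled CDF bound is subtracted from it directly (sub t4, t1 / sub t4, rcx) rather than added to an inverted copy, so no inversion fixups are needed around renormalization. The C sketch below models only that bookkeeping; it is not the code dav1d ships, the type and field names are illustrative, and the clz-based renormalization shift is inferred from the surrounding decoder rather than shown in these hunks.

```c
#include <stddef.h>

typedef size_t ec_win;                      /* one general-purpose register wide */
#define EC_WIN_SIZE ((int)(sizeof(ec_win) * 8))

typedef struct {
    ec_win   dif;   /* value window, stored un-inverted; top 16 bits are active */
    unsigned rng;   /* range, renormalized back into [0x8000, 0xffff] */
    int      cnt;   /* valid bits buffered below the top 16 bits of dif */
} SymbolSketch;

/* u and v are the scaled CDF bounds selected for the decoded symbol (u > v).
 * With dif kept un-inverted, the update is a plain subtraction, so the
 * inversion steps around renormalization are no longer needed. */
static void update_and_renorm(SymbolSketch *const s, const unsigned u, const unsigned v)
{
    s->dif -= (ec_win)v << (EC_WIN_SIZE - 16);  /* dif -= v, aligned to the top */
    const unsigned rng = u - v;                 /* new range, 1..0xffff */
    const int d = __builtin_clz(rng) - 16;      /* GCC/Clang builtin; shift of 0..15 */
    s->dif <<= d;                               /* renormalize dif and rng together */
    s->rng  = rng << d;
    s->cnt -= d;        /* when this goes negative, the refill path sketched
                         * below pulls in more input bits */
}
```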
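The refill path itself is also rewritten. In the common case the new code performs one register-sized load, complements and byte-swaps it, shifts it underneath the bits of dif that are still valid and ORs it in, advancing msac.buf by whole bytes only. Near the end of the input it instead loads the last register-sized chunk of the buffer, which is safe because, as the added comment notes, this code is only reached when the buffer is at least gprsize bytes, and it clamps num_bytes_read to the bytes actually left; once the buffer is exhausted, dif is padded with one-bits. Below is a hedged C model of that strategy, with illustrative names, a portable byte-swapping load standing in for bswap, and the pad_with_ones rotate trick simplified to a plain mask.

```c
#include <stddef.h>
#include <stdint.h>

typedef size_t ec_win;                      /* one general-purpose register wide */
#define EC_WIN_SIZE ((int)(sizeof(ec_win) * 8))

typedef struct {
    const uint8_t *pos, *end;               /* msac.buf / msac.end */
    ec_win dif;
    int    cnt;
} RefillSketch;

static ec_win load_be(const uint8_t *const p)
{
    ec_win v = 0;                            /* portable stand-in for bswap(load) */
    for (size_t i = 0; i < sizeof(v); i++)
        v = (v << 8) | p[i];
    return v;
}

static void refill_sketch(RefillSketch *const s)
{
    const int used = s->cnt + 16;            /* valid bits of dif; 1..31 here */
    const size_t left = (size_t)(s->end - s->pos);

    if (left >= sizeof(ec_win)) {
        /* Fast path: one full-width load; new bytes enter dif complemented. */
        s->dif |= ~load_be(s->pos) >> used;
        const int nbytes = (EC_WIN_SIZE - used) >> 3;   /* whole bytes only */
        s->pos += nbytes;
        s->cnt += nbytes * 8;
    } else if (left > 0) {
        /* Near the end: load the last register-sized chunk of the buffer so
         * we never read past end (the buffer is assumed to be >= gprsize
         * bytes, as the assembly comment states); the missing bytes become
         * one-bits after the complement. */
        const int pad = (int)(sizeof(ec_win) - left) * 8;
        s->dif |= ~(load_be(s->end - sizeof(ec_win)) << pad) >> used;
        int nbytes = (EC_WIN_SIZE - used) >> 3;
        if (nbytes > (int)left) nbytes = (int)left;     /* clamp num_bytes_read */
        s->pos += nbytes;
        s->cnt += nbytes * 8;
    } else {
        /* Buffer exhausted: pad dif with one-bits below the valid bits so the
         * decoder can keep renormalizing; cnt and pos are left untouched. */
        s->dif |= ~(ec_win)0 >> used;
    }
}
```

Advancing by whole bytes means up to seven freshly OR-ed bits are not yet counted in cnt; the next refill loads the same byte again and ORs identical bits into the same positions, so the overlap is harmless.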