Diffstat (limited to 'third_party/dav1d')
-rw-r--r--  third_party/dav1d/NEWS | 18
-rw-r--r--  third_party/dav1d/THANKS.md | 33
-rw-r--r--  third_party/dav1d/gcovr.cfg | 2
-rw-r--r--  third_party/dav1d/meson.build | 77
-rw-r--r--  third_party/dav1d/src/arm/32/itx.S | 79
-rw-r--r--  third_party/dav1d/src/arm/32/itx16.S | 19
-rw-r--r--  third_party/dav1d/src/arm/32/msac.S | 167
-rw-r--r--  third_party/dav1d/src/arm/64/itx.S | 99
-rw-r--r--  third_party/dav1d/src/arm/64/itx16.S | 21
-rw-r--r--  third_party/dav1d/src/arm/64/mc.S | 411
-rw-r--r--  third_party/dav1d/src/arm/64/mc16.S | 373
-rw-r--r--  third_party/dav1d/src/arm/64/msac.S | 167
-rw-r--r--  third_party/dav1d/src/arm/64/util.S | 49
-rw-r--r--  third_party/dav1d/src/arm/asm.S | 44
-rw-r--r--  third_party/dav1d/src/arm/cpu.c | 137
-rw-r--r--  third_party/dav1d/src/arm/cpu.h | 4
-rw-r--r--  third_party/dav1d/src/arm/itx.h | 4
-rw-r--r--  third_party/dav1d/src/arm/msac.h | 2
-rw-r--r--  third_party/dav1d/src/cpu.h | 14
-rw-r--r--  third_party/dav1d/src/ext/x86/x86inc.asm | 198
-rw-r--r--  third_party/dav1d/src/itx_1d.c | 5
-rw-r--r--  third_party/dav1d/src/itx_tmpl.c | 10
-rw-r--r--  third_party/dav1d/src/loongarch/msac.S | 216
-rw-r--r--  third_party/dav1d/src/msac.c | 58
-rw-r--r--  third_party/dav1d/src/ppc/cdef_tmpl.c | 399
-rw-r--r--  third_party/dav1d/src/riscv/64/itx.S | 1061
-rw-r--r--  third_party/dav1d/src/riscv/asm.S | 2
-rw-r--r--  third_party/dav1d/src/riscv/itx.h | 12
-rw-r--r--  third_party/dav1d/src/x86/cdef_avx2.asm | 7
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_avx2.asm | 23
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_sse.asm | 8
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_avx2.asm | 19
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_sse.asm | 14
-rw-r--r--  third_party/dav1d/src/x86/ipred16_avx2.asm | 18
-rw-r--r--  third_party/dav1d/src/x86/ipred_avx2.asm | 106
-rw-r--r--  third_party/dav1d/src/x86/ipred_sse.asm | 10
-rw-r--r--  third_party/dav1d/src/x86/looprestoration_sse.asm | 8
-rw-r--r--  third_party/dav1d/src/x86/mc16_avx2.asm | 6
-rw-r--r--  third_party/dav1d/src/x86/mc16_avx512.asm | 3
-rw-r--r--  third_party/dav1d/src/x86/mc16_sse.asm | 33
-rw-r--r--  third_party/dav1d/src/x86/mc_avx2.asm | 33
-rw-r--r--  third_party/dav1d/src/x86/mc_avx512.asm | 3
-rw-r--r--  third_party/dav1d/src/x86/mc_sse.asm | 16
-rw-r--r--  third_party/dav1d/src/x86/msac.asm | 172
44 files changed, 2867 insertions, 1293 deletions
diff --git a/third_party/dav1d/NEWS b/third_party/dav1d/NEWS
index 3645474a04..88b1eea00e 100644
--- a/third_party/dav1d/NEWS
+++ b/third_party/dav1d/NEWS
@@ -1,3 +1,15 @@
+Changes for 1.4.1 'Road Runner':
+--------------------------------
+
+1.4.1 is a small release of dav1d, notably improving ARM and RISC-V speed
+
+- Optimizations for 6tap filters for NEON (ARM)
+- More RISC-V optimizations for itx (4x8, 8x4, 4x16, 16x4, 8x16, 16x8)
+- Reduction of binary size on ARM64, ARM32 and RISC-V
+- Fix out-of-bounds read in 8bpc SSE2/SSSE3 wiener_filter
+- Msac optimizations
+
+
Changes for 1.4.0 'Road Runner':
--------------------------------
@@ -26,7 +38,7 @@ Changes for 1.3.0 'Tundra Peregrine Falcon (Calidus)':
Changes for 1.2.1 'Arctic Peregrine Falcon':
--------------------------------------------
+--------------------------------------------
1.2.1 is a small release of dav1d, adding more SIMD and fixes
@@ -42,7 +54,7 @@ Changes for 1.2.1 'Arctic Peregrine Falcon':
Changes for 1.2.0 'Arctic Peregrine Falcon':
--------------------------------------------
+--------------------------------------------
1.2.0 is a small release of dav1d, adding more SIMD and fixes
@@ -55,7 +67,7 @@ Changes for 1.2.0 'Arctic Peregrine Falcon':
Changes for 1.1.0 'Arctic Peregrine Falcon':
--------------------------------------------
+--------------------------------------------
1.1.0 is an important release of dav1d, fixing numerous bugs, and adding SIMD
diff --git a/third_party/dav1d/THANKS.md b/third_party/dav1d/THANKS.md
index 4fc8d27f14..b7aa200d0e 100644
--- a/third_party/dav1d/THANKS.md
+++ b/third_party/dav1d/THANKS.md
@@ -16,19 +16,20 @@ The Alliance for Open Media (AOM) for partially funding this project.
And all the dav1d Authors (git shortlog -sn), including:
-Martin Storsjö, Henrik Gramner, Ronald S. Bultje, Janne Grunau, James Almer,
-Victorien Le Couviour--Tuffet, Matthias Dressel, Marvin Scholz,
-Jean-Baptiste Kempf, Luc Trudeau, Hugo Beauzée-Luyssen, Konstantin Pavlov,
-Niklas Haas, David Michael Barr, Steve Lhomme, Nathan E. Egge, Wan-Teh Chang,
-Kyle Siefring, B Krishnan Iyer, Francois Cartegnie, Liwei Wang, Luca Barbato,
-David Conrad, Derek Buitenhuis, Jan Beich, Michael Bradshaw, Raphaël Zumer,
-Xuefeng Jiang, Christophe Gisquet, Justin Bull, Boyuan Xiao, Dale Curtis,
-Emmanuel Gil Peyrot, Raphael Zumer, Rupert Swarbrick, Thierry Foucu,
-Thomas Daede, Colin Lee, Jonathan Wright, Lynne, Michail Alvanos, Nico Weber,
-Salome Thirot, SmilingWolf, Tristan Laurent, Vittorio Giovara, Yannis Guyon,
-André Kempe, Anisse Astier, Anton Mitrofanov, Charlie Hayden, Dmitriy Sychov,
-Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, Joe Drago, Mark Shuttleworth,
-Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli, Pablo Stebler, Rostislav
-Pehlivanov, Sebastian Dröge, Shiz, Steinar Midtskogen, Sylvain BERTRAND,
-Sylvestre Ledru, Timo Gurr, Tristan Matthews, Vibhoothi, Xavier Claessens,
-Xu Guangxin, kossh1 and skal.
+Henrik Gramner, Martin Storsjö, Ronald S. Bultje, Janne Grunau, James Almer,
+Victorien Le Couviour--Tuffet, Matthias Dressel, Nathan E. Egge,
+Jean-Baptiste Kempf, Marvin Scholz, Luc Trudeau, Niklas Haas,
+Hugo Beauzée-Luyssen, Konstantin Pavlov, David Michael Barr, Steve Lhomme,
+yuanhecai, Luca Barbato, Wan-Teh Chang, Kyle Siefring, B Krishnan Iyer,
+Francois Cartegnie, Liwei Wang, David Conrad, Derek Buitenhuis, Jan Beich,
+Michael Bradshaw, Raphaël Zumer, Xuefeng Jiang, Arpad Panyik, Christophe Gisquet,
+Justin Bull, Boyuan Xiao, Dale Curtis, Emmanuel Gil Peyrot, Raphael Zumer,
+Rupert Swarbrick, Thierry Foucu, Thomas Daede, jinbo, André Kempe, Colin Lee,
+Jonathan Wright, Lynne, Michail Alvanos, Nico Weber, Salome Thirot, SmilingWolf,
+Tristan Laurent, Tristan Matthews, Vittorio Giovara, Yannis Guyon,
+Andrey Semashev, Anisse Astier, Anton Mitrofanov, Charlie Hayden, Dmitriy Sychov,
+Ewout ter Hoeven, Fred Barbier, Hao Chen, Jean-Yves Avenard, Joe Drago,
+Mark Shuttleworth, Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli,
+Pablo Stebler, Rostislav Pehlivanov, Sebastian Dröge, Shiz, Steinar Midtskogen,
+Sylvain BERTRAND, Sylvestre Ledru, Timo Gurr, Vibhoothi,
+Vignesh Venkatasubramanian, Xavier Claessens, Xu Guangxin, kossh1 and skal.
diff --git a/third_party/dav1d/gcovr.cfg b/third_party/dav1d/gcovr.cfg
index d09a0ecab5..e02ae33c33 100644
--- a/third_party/dav1d/gcovr.cfg
+++ b/third_party/dav1d/gcovr.cfg
@@ -1,4 +1,4 @@
exclude = .*/tests/.*
exclude = .*/tools/.*
exclude = .*/include/common/dump.h
-gcov-ignore-parse-errors = yes
+gcov-ignore-parse-errors = negative_hits.warn
diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build
index 6e49852103..e371415d53 100644
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -23,7 +23,7 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
- version: '1.4.0',
+ version: '1.4.1',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
@@ -309,6 +309,10 @@ if (host_machine.system() in ['darwin', 'ios', 'tvos'] and cc.get_id() == 'clang
optional_arguments += '-fno-stack-check'
endif
+if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm'))
+ optional_arguments += '-fno-align-functions'
+endif
+
add_project_arguments(cc.get_supported_arguments(optional_arguments), language : 'c')
add_project_link_arguments(cc.get_supported_link_arguments(optional_link_arguments), language : 'c')
@@ -365,6 +369,66 @@ if (is_asm_enabled and
if cc.compiles(check_pic_code)
cdata.set('PIC', '3')
endif
+
+ if host_machine.cpu_family() == 'aarch64'
+ have_as_arch = cc.compiles('''__asm__ (".arch armv8-a");''')
+ cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch)
+ as_arch_str = ''
+ if have_as_arch
+ as_arch_level = 'armv8-a'
+ # Check what .arch levels are supported. In principle, we only
+ # want to detect up to armv8.2-a here (binutils requires that
+ # in order to enable i8mm). However, older Clang versions
+ # (before Clang 17, and Xcode versions up to and including 15.0)
+ # didn't support controlling dotprod/i8mm extensions via
+ # .arch_extension, therefore try to enable a high enough .arch
+ # level as well, to implicitly make them available via that.
+ foreach arch : ['armv8.2-a', 'armv8.4-a', 'armv8.6-a']
+ if cc.compiles('__asm__ (".arch ' + arch + '\\n");')
+ as_arch_level = arch
+ endif
+ endforeach
+ # Clang versions before 17 also had a bug
+ # (https://github.com/llvm/llvm-project/issues/32220)
+ # causing a plain ".arch <level>" to not have any effect unless it
+ # had an extra "+<feature>" included - but it was activated on the
+ # next ".arch_extension" directive instead. Check if we can include
+ # "+crc" as dummy feature to make the .arch directive behave as
+ # expected and take effect right away.
+ if cc.compiles('__asm__ (".arch ' + as_arch_level + '+crc\\n");')
+ as_arch_level = as_arch_level + '+crc'
+ endif
+ cdata.set('AS_ARCH_LEVEL', as_arch_level)
+ as_arch_str = '".arch ' + as_arch_level + '\\n"'
+ endif
+ extensions = {
+ 'dotprod': 'udot v0.4s, v0.16b, v0.16b',
+ 'i8mm': 'usdot v0.4s, v0.16b, v0.16b',
+ 'sve': 'whilelt p0.s, x0, x1',
+ 'sve2': 'sqrdmulh z0.s, z0.s, z0.s',
+ }
+ foreach name, instr : extensions
+ # Test for support for the various extensions. First test if
+ # the assembler supports the .arch_extension directive for
+ # enabling/disabling the extension, then separately check whether
+ # the instructions themselves are supported. Even if .arch_extension
+ # isn't supported, we may be able to assemble the instructions
+ # if the .arch level includes support for them.
+ code = '__asm__ (' + as_arch_str
+ code += '".arch_extension ' + name + '\\n"'
+ code += ');'
+ supports_archext = cc.compiles(code)
+ cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', supports_archext)
+ code = '__asm__ (' + as_arch_str
+ if supports_archext
+ code += '".arch_extension ' + name + '\\n"'
+ endif
+ code += '"' + instr + '\\n"'
+ code += ');'
+ supports_instr = cc.compiles(code, name: name.to_upper())
+ cdata.set10('HAVE_' + name.to_upper(), supports_instr)
+ endforeach
+ endif
endif
cdata.set10('ARCH_X86', host_machine.cpu_family().startswith('x86'))
@@ -477,6 +541,17 @@ if (is_asm_enabled and
])
endif
+if is_asm_enabled and host_machine.cpu_family().startswith('riscv')
+ as_option_code = '''__asm__ (
+".option arch, +v\n"
+"vsetivli zero, 0, e8, m1, ta, ma"
+);
+'''
+ if not cc.compiles(as_option_code, name : 'RISC-V Vector')
+ error('Compiler doesn\'t support \'.option arch\' asm directive. Update to binutils>=2.38 or clang>=17 or use \'-Denable_asm=false\'.')
+ endif
+endif
+
# Generate config.h
config_h_target = configure_file(output: 'config.h', configuration: cdata)
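
Each cc.compiles() probe in the aarch64 block above amounts to a tiny C translation unit that only has to assemble (cc.compiles() compiles without linking, so no main() is required). A hedged sketch of what the dotprod probe ends up looking like, assuming the detected .arch level came out as armv8.4-a+crc and the .arch_extension directive probe passed:

    /* Illustrative reconstruction of the generated probe; the real string is
     * built from as_arch_str and the 'extensions' dictionary above. */
    __asm__ (
        ".arch armv8.4-a+crc\n"          /* AS_ARCH_LEVEL, as detected above */
        ".arch_extension dotprod\n"      /* emitted only if the directive probe passed */
        "udot v0.4s, v0.16b, v0.16b\n"   /* the instruction itself -> HAVE_DOTPROD */
    );

If the probe assembles, HAVE_DOTPROD is set in config.h; runtime CPU detection (src/arm/cpu.c, also touched in this commit) still decides whether those code paths are actually used.
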
diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S
index ceea025e45..9ba1df7a68 100644
--- a/third_party/dav1d/src/arm/32/itx.S
+++ b/third_party/dav1d/src/arm/32/itx.S
@@ -965,6 +965,8 @@ function inv_txfm_\variant\()add_8x8_neon
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
+
+ b L(itx_8x8_epilog)
.else
blx r4
@@ -976,8 +978,8 @@ function inv_txfm_\variant\()add_8x8_neon
vrshr.s16 q13, q13, #1
vrshr.s16 q14, q14, #1
vrshr.s16 q15, q15, #1
-.endif
+L(itx_8x8_epilog):
transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
blx r5
@@ -985,11 +987,12 @@ function inv_txfm_\variant\()add_8x8_neon
load_add_store_8x8 r0, r7
vpop {q4-q7}
pop {r4-r5,r7,pc}
+.endif
endfunc
.endm
-def_fn_8x8_base
def_fn_8x8_base identity_
+def_fn_8x8_base
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
@@ -1444,14 +1447,16 @@ function inv_txfm_horz\suffix\()_16x4_neon
.else
identity_4x16_shift1 d0[0]
.endif
+ b L(horz_16x4_epilog)
.else
blx r4
-.endif
-.if \shift > 0
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
vrshr.s16 \i, \i, #\shift
.endr
-.endif
+.if \shift == 1
+ b L(horz_16x4_epilog)
+.else
+L(horz_16x4_epilog):
transpose_4x4h q8, q9, d16, d17, d18, d19
transpose_4x4h q10, q11, d20, d21, d22, d23
transpose_4x4h q12, q13, d24, d25, d26, d27
@@ -1462,13 +1467,15 @@ function inv_txfm_horz\suffix\()_16x4_neon
.endr
pop {pc}
+.endif
+.endif
endfunc
.endm
-def_horz_16 scale=0, identity=0, shift=2
-def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
-def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity
+def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=0, shift=2
function inv_txfm_add_vert_4x16_neon
push {lr}
@@ -1597,6 +1604,8 @@ function inv_txfm_\variant\()add_16x4_neon
.endr
identity_4x16_shift1 d0[0]
+
+ b L(itx_16x4_epilog)
.else
vmov.i16 q2, #0
vmov.i16 q3, #0
@@ -1615,30 +1624,25 @@ function inv_txfm_\variant\()add_16x4_neon
vswp d19, d22
vswp d18, d20
vswp d19, d21
-.irp i, q8, q9, q10, q11
+ vswp d25, d28
+ vswp d27, d30
+ vswp d26, d28
+ vswp d27, d29
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
vrshr.s16 \i, \i, #1
.endr
-.endif
+
+L(itx_16x4_epilog):
transpose_4x8h q8, q9, q10, q11
blx r5
mov r6, r0
load_add_store_8x4 r6, r7
-.ifc \variant, identity_
vmov q8, q12
vmov q9, q13
vmov q10, q14
vmov q11, q15
-.else
- vswp d25, d28
- vswp d27, d30
- vswp d26, d28
- vswp d27, d29
- vrshr.s16 q8, q12, #1
- vrshr.s16 q9, q13, #1
- vrshr.s16 q10, q14, #1
- vrshr.s16 q11, q15, #1
-.endif
+
transpose_4x8h q8, q9, q10, q11
blx r5
add r6, r0, #8
@@ -1646,6 +1650,7 @@ function inv_txfm_\variant\()add_16x4_neon
vpop {q4-q7}
pop {r4-r11,pc}
+.endif
endfunc
function inv_txfm_\variant\()add_4x16_neon
@@ -1696,12 +1701,14 @@ function inv_txfm_\variant\()add_4x16_neon
movw r12, #(5793-4096)*8
vdup.16 d0, r12
identity_8x4_shift1 q8, q9, q10, q11, d0[0]
+
+ b L(itx_4x16_epilog)
.else
blx r4
.irp i, q8, q9, q10, q11
vrshr.s16 \i, \i, #1
.endr
-.endif
+L(itx_4x16_epilog):
transpose_4x8h q8, q9, q10, q11
vswp d19, d21
vswp d18, d20
@@ -1714,11 +1721,12 @@ function inv_txfm_\variant\()add_4x16_neon
vpop {q4-q7}
pop {r4-r11,pc}
+.endif
endfunc
.endm
-def_fn_416_base
def_fn_416_base identity_
+def_fn_416_base
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1728,11 +1736,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
.if \w == 4
+.ifnc \txfm1, identity
movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon
+.endif
movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon
mov r10, #\eob_half
.else
+.ifnc \txfm1, identity
movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon
+.endif
movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon
.endif
.ifc \txfm1, identity
@@ -1765,8 +1777,7 @@ def_fn_416 \w, \h, identity, flipadst, 32
def_fns_416 4, 16
def_fns_416 16, 4
-.macro def_fn_816_base variant
-function inv_txfm_\variant\()add_16x8_neon
+function inv_txfm_add_16x8_neon
sub_sp_align 256
.irp i, 0, 4
@@ -1805,6 +1816,7 @@ function inv_txfm_\variant\()add_16x8_neon
pop {r4-r11,pc}
endfunc
+.macro def_fn_816_base variant
function inv_txfm_\variant\()add_8x16_neon
sub_sp_align 256
@@ -1849,6 +1861,10 @@ function inv_txfm_\variant\()add_8x16_neon
.endr
2:
+.ifc \variant, identity_
+ b L(itx_8x16_epilog)
+.else
+L(itx_8x16_epilog):
.irp i, 0, 4
add r6, r0, #(\i)
add r7, sp, #(\i*2)
@@ -1859,11 +1875,18 @@ function inv_txfm_\variant\()add_8x16_neon
add_sp_align 256
vpop {q4-q7}
pop {r4-r11,pc}
+.endif
endfunc
.endm
-def_fn_816_base
def_fn_816_base identity_
+def_fn_816_base
+
+/* Define symbols used in .if statement */
+.equ dct, 1
+.equ identity, 2
+.equ adst, 3
+.equ flipadst, 4
.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1873,7 +1896,9 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
.if \w == 8
+.ifnc \txfm1, identity
movrel_local r4, inv_\txfm1\()_8h_x8_neon
+.endif
movrel_local r5, inv_\txfm2\()_4h_x16_neon
.else
.ifc \txfm1, identity
@@ -1889,7 +1914,7 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.else
mov r10, #\eob_4x4
.endif
-.ifc \txfm1, identity
+.if \w == 8 && \txfm1 == identity
b inv_txfm_identity_add_\w\()x\h\()_neon
.else
b inv_txfm_add_\w\()x\h\()_neon
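
The pattern applied throughout this file (and mirrored in src/arm/64/itx.S below) drives the binary-size reduction listed in the NEWS: the identity_ variant no longer carries its own copy of the transpose/second-pass/store tail, it branches to a shared label such as L(itx_8x8_epilog) that lives in the non-identity variant, which is also why the def_fn_*_base invocations are reordered so the variant containing the label is assembled after the one that jumps to it. A rough C analogy of the same factoring, with hypothetical names:

    #include <stdint.h>

    /* Hedged C analogy (hypothetical names, not dav1d's API): the epilog
     * exists once and both first-pass variants jump into it. */
    typedef void (*itx_1d_fn)(int16_t *coef);

    static void itx_8x8_epilog(int16_t c[64], itx_1d_fn second_pass) {
        for (int i = 0; i < 8; i++)              /* transpose_8x8h */
            for (int j = i + 1; j < 8; j++) {
                const int16_t t = c[i * 8 + j];
                c[i * 8 + j] = c[j * 8 + i];
                c[j * 8 + i] = t;
            }
        second_pass(c);                          /* blx r5, then load_add_store_8x8 */
    }

    static void itx_8x8_identity(int16_t c[64], itx_1d_fn second_pass) {
        /* the identity shl #1 and the srshr #1 rounding cancel out: no first pass */
        itx_8x8_epilog(c, second_pass);          /* b L(itx_8x8_epilog) */
    }

    static void itx_8x8_generic(int16_t c[64], itx_1d_fn first_pass,
                                itx_1d_fn second_pass) {
        first_pass(c);                           /* blx r4 */
        for (int i = 0; i < 64; i++)
            c[i] = (int16_t)((c[i] + 1) >> 1);   /* vrshr.s16 #1 */
        itx_8x8_epilog(c, second_pass);
    }
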
diff --git a/third_party/dav1d/src/arm/32/itx16.S b/third_party/dav1d/src/arm/32/itx16.S
index aa6c272e71..7691272517 100644
--- a/third_party/dav1d/src/arm/32/itx16.S
+++ b/third_party/dav1d/src/arm/32/itx16.S
@@ -547,11 +547,11 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
vmov.i16 q15, #0
vld1.32 {q8, q9}, [r2, :128]
vst1.32 {q14, q15}, [r2, :128]!
- vshr.s16 q8, q8, #2
+ vshr.s32 q8, q8, #2
vld1.32 {q10, q11}, [r2, :128]
- vshr.s16 q9, q9, #2
- vshr.s16 q10, q10, #2
- vshr.s16 q11, q11, #2
+ vshr.s32 q9, q9, #2
+ vshr.s32 q10, q10, #2
+ vshr.s32 q11, q11, #2
iwht4
@@ -598,7 +598,9 @@ function inv_txfm_add_4x4_neon
vld1.16 {d3}, [r0, :64], r1
L(itx_4x4_end):
- vmvn.i16 q15, #0xfc00 // 0x3ff
+ // read bitdepth_max from the caller's stack
+ ldr r4, [sp, #44]
+ vdup.i16 q15, r4
sub r0, r0, r1, lsl #2
vqadd.s16 q8, q8, q0
vqadd.s16 q9, q9, q1
@@ -1487,6 +1489,10 @@ function inv_txfm_horz\suffix\()_16x2_neon
vqrshrn.s32 d21, q13, #\shift
vqrshrn.s32 d22, q14, #\shift
vqrshrn.s32 d23, q15, #\shift
+.if \scale
+ b L(horz_16x2_epilog)
+.else
+L(horz_16x2_epilog):
vuzp.16 q8, q9
vuzp.16 q10, q11
@@ -1495,11 +1501,12 @@ function inv_txfm_horz\suffix\()_16x2_neon
.endr
pop {pc}
+.endif
endfunc
.endm
-def_horz_16 scale=0, shift=2
def_horz_16 scale=1, shift=1, suffix=_scale
+def_horz_16 scale=0, shift=2
function inv_txfm_add_vert_4x16_neon
push {lr}
diff --git a/third_party/dav1d/src/arm/32/msac.S b/third_party/dav1d/src/arm/32/msac.S
index b06e109dda..b16957fb7e 100644
--- a/third_party/dav1d/src/arm/32/msac.S
+++ b/third_party/dav1d/src/arm/32/msac.S
@@ -279,60 +279,67 @@ L(renorm):
sub r4, r4, r3 // rng = u - v
clz r5, r4 // clz(rng)
eor r5, r5, #16 // d = clz(rng) ^ 16
- mvn r7, r7 // ~dif
- add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+ sub r7, r7, r3, lsl #16 // dif - (v << 16)
L(renorm2):
lsl r4, r4, r5 // rng << d
subs r6, r6, r5 // cnt -= d
- lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ lsl r7, r7, r5 // (dif - (v << 16)) << d
str r4, [r0, #RNG]
- mvn r7, r7 // ~dif
- bhs 9f
+ bhs 4f
// refill
ldr r3, [r0, #BUF_POS] // BUF_POS
ldr r4, [r0, #BUF_END] // BUF_END
add r5, r3, #4
- cmp r5, r4
- bgt 2f
-
- ldr r3, [r3] // next_bits
- add r8, r6, #23 // shift_bits = cnt + 23
- add r6, r6, #16 // cnt += 16
- rev r3, r3 // next_bits = bswap(next_bits)
- sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
- and r8, r8, #24 // shift_bits &= 24
- lsr r3, r3, r8 // next_bits >>= shift_bits
- sub r8, r8, r6 // shift_bits -= 16 + cnt
- str r5, [r0, #BUF_POS]
- lsl r3, r3, r8 // next_bits <<= shift_bits
- rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
- eor r7, r7, r3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- rsb r5, r6, #8 // c = 8 - cnt
-3:
- cmp r3, r4
- bge 4f
- ldrb r8, [r3], #1
- lsl r8, r8, r5
- eor r7, r7, r8
- subs r5, r5, #8
- bge 3b
-
-4: // refill_eob_end
+ subs r5, r5, r4
+ bhi 6f
+
+ ldr r8, [r3] // next_bits
+ rsb r5, r6, #16
+ add r4, r6, #16 // shift_bits = cnt + 16
+ mvn r8, r8
+ lsr r5, r5, #3 // num_bytes_read
+ rev r8, r8 // next_bits = bswap(next_bits)
+ lsr r8, r8, r4 // next_bits >>= shift_bits
+
+2: // refill_end
+ add r3, r3, r5
+ add r6, r6, r5, lsl #3 // cnt += num_bits_read
str r3, [r0, #BUF_POS]
- rsb r6, r5, #8 // cnt = 8 - c
-9:
+3: // refill_end2
+ orr r7, r7, r8 // dif |= next_bits
+
+4: // end
str r6, [r0, #CNT]
str r7, [r0, #DIF]
-
mov r0, lr
add sp, sp, #48
-
pop {r4-r10,pc}
+
+5: // pad_with_ones
+ add r8, r6, #-240
+ lsr r8, r8, r8
+ b 3b
+
+6: // refill_eob
+ cmp r3, r4
+ bhs 5b
+
+ ldr r8, [r4, #-4]
+ lsl r5, r5, #3
+ lsr r8, r8, r5
+ add r5, r6, #16
+ mvn r8, r8
+ sub r4, r4, r3 // num_bytes_left
+ rev r8, r8
+ lsr r8, r8, r5
+ rsb r5, r6, #16
+ lsr r5, r5, #3
+ cmp r5, r4
+ it hs
+ movhs r5, r4
+ b 2b
endfunc
function msac_decode_symbol_adapt8_neon, export=1
@@ -414,53 +421,38 @@ function msac_decode_hi_tok_neon, export=1
sub r4, r4, r3 // rng = u - v
clz r5, r4 // clz(rng)
eor r5, r5, #16 // d = clz(rng) ^ 16
- mvn r7, r7 // ~dif
- add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+ sub r7, r7, r3, lsl #16 // dif - (v << 16)
lsl r4, r4, r5 // rng << d
subs r6, r6, r5 // cnt -= d
- lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ lsl r7, r7, r5 // (dif - (v << 16)) << d
str r4, [r0, #RNG]
vdup.16 d1, r4
- mvn r7, r7 // ~dif
- bhs 9f
+ bhs 5f
// refill
ldr r3, [r0, #BUF_POS] // BUF_POS
ldr r4, [r0, #BUF_END] // BUF_END
add r5, r3, #4
- cmp r5, r4
- bgt 2f
-
- ldr r3, [r3] // next_bits
- add r8, r6, #23 // shift_bits = cnt + 23
- add r6, r6, #16 // cnt += 16
- rev r3, r3 // next_bits = bswap(next_bits)
- sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
- and r8, r8, #24 // shift_bits &= 24
- lsr r3, r3, r8 // next_bits >>= shift_bits
- sub r8, r8, r6 // shift_bits -= 16 + cnt
- str r5, [r0, #BUF_POS]
- lsl r3, r3, r8 // next_bits <<= shift_bits
- rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
- eor r7, r7, r3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- rsb r5, r6, #8 // c = 40 - cnt
-3:
- cmp r3, r4
- bge 4f
- ldrb r8, [r3], #1
- lsl r8, r8, r5
- eor r7, r7, r8
- subs r5, r5, #8
- bge 3b
-
-4: // refill_eob_end
+ subs r5, r5, r4
+ bhi 7f
+
+ ldr r8, [r3] // next_bits
+ rsb r5, r6, #16
+ add r4, r6, #16 // shift_bits = cnt + 16
+ mvn r8, r8
+ lsr r5, r5, #3 // num_bytes_read
+ rev r8, r8 // next_bits = bswap(next_bits)
+ lsr r8, r8, r4 // next_bits >>= shift_bits
+
+3: // refill_end
+ add r3, r3, r5
+ add r6, r6, r5, lsl #3 // cnt += num_bits_read
str r3, [r0, #BUF_POS]
- rsb r6, r5, #8 // cnt = 40 - c
-9:
+4: // refill_end2
+ orr r7, r7, r8 // dif |= next_bits
+
+5: // end
lsl lr, lr, #1
sub lr, lr, #5
lsr r12, r7, #16
@@ -473,6 +465,30 @@ function msac_decode_hi_tok_neon, export=1
str r7, [r0, #DIF]
lsr r0, r2, #1
pop {r4-r10,pc}
+
+6: // pad_with_ones
+ add r8, r6, #-240
+ lsr r8, r8, r8
+ b 4b
+
+7: // refill_eob
+ cmp r3, r4
+ bhs 6b
+
+ ldr r8, [r4, #-4]
+ lsl r5, r5, #3
+ lsr r8, r8, r5
+ add r5, r6, #16
+ mvn r8, r8
+ sub r4, r4, r3 // num_bytes_left
+ rev r8, r8
+ lsr r8, r8, r5
+ rsb r5, r6, #16
+ lsr r5, r5, #3
+ cmp r5, r4
+ it hs
+ movhs r5, r4
+ b 3b
endfunc
function msac_decode_bool_equi_neon, export=1
@@ -493,7 +509,6 @@ function msac_decode_bool_equi_neon, export=1
movhs r7, r8 // if (ret) dif = dif - vw;
clz r5, r4 // clz(rng)
- mvn r7, r7 // ~dif
eor r5, r5, #16 // d = clz(rng) ^ 16
mov lr, r2
b L(renorm2)
@@ -519,7 +534,6 @@ function msac_decode_bool_neon, export=1
movhs r7, r8 // if (ret) dif = dif - vw;
clz r5, r4 // clz(rng)
- mvn r7, r7 // ~dif
eor r5, r5, #16 // d = clz(rng) ^ 16
mov lr, r2
b L(renorm2)
@@ -549,7 +563,6 @@ function msac_decode_bool_adapt_neon, export=1
cmp r10, #0
clz r5, r4 // clz(rng)
- mvn r7, r7 // ~dif
eor r5, r5, #16 // d = clz(rng) ^ 16
mov lr, r2
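
The rewritten refill above (and its copy in msac_decode_hi_tok_neon) replaces the old byte-by-byte end-of-buffer loop with a single 32-bit load plus shift bookkeeping: shift_bits = cnt + 16 drops the byte-swapped word directly below the bits still valid in dif, and num_bytes_read = (16 - cnt) >> 3 whole bytes are consumed. A hedged C sketch of just that bookkeeping follows; it is a simplified model, not dav1d's msac.c, and it omits the complement trick and the refill_eob clamping and ones-padding seen above.

    #include <stdint.h>

    typedef struct {
        const uint8_t *buf_pos, *buf_end;
        uint32_t dif;  /* MSB-aligned bit reservoir; unused low bits assumed 0 here */
        int      cnt;  /* valid bits beyond the 16-bit decode window; refill when < 0 */
    } Reservoir;

    static void refill(Reservoir *s) {
        if (s->buf_pos + 4 <= s->buf_end) {
            const uint32_t next_bits =                      /* ldr + rev */
                (uint32_t)s->buf_pos[0] << 24 | (uint32_t)s->buf_pos[1] << 16 |
                (uint32_t)s->buf_pos[2] <<  8 | (uint32_t)s->buf_pos[3];
            const int shift_bits     = s->cnt + 16;         /* 0..15 for cnt in -16..-1 */
            const int num_bytes_read = (16 - s->cnt) >> 3;  /* 2..4 whole bytes */
            s->dif     |= next_bits >> shift_bits;          /* orr r7, r7, r8 */
            s->buf_pos += num_bytes_read;
            s->cnt     += num_bytes_read * 8;
        }
        /* refill_eob path: with fewer than 4 readable bytes left, the assembly
         * instead loads the last aligned word at buf_end - 4, shifts off the
         * bytes already consumed, clamps num_bytes_read to num_bytes_left, and
         * pads with ones once the buffer is exhausted. */
    }
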
diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S
index 53490cd677..7063cbde1d 100644
--- a/third_party/dav1d/src/arm/64/itx.S
+++ b/third_party/dav1d/src/arm/64/itx.S
@@ -879,6 +879,8 @@ function inv_txfm_\variant\()add_8x8_neon
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
+
+ b L(itx_8x8_epilog)
.else
blr x4
@@ -890,19 +892,20 @@ function inv_txfm_\variant\()add_8x8_neon
srshr v21.8h, v21.8h, #1
srshr v22.8h, v22.8h, #1
srshr v23.8h, v23.8h, #1
-.endif
+L(itx_8x8_epilog):
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
blr x5
load_add_store_8x8 x0, x7
ret x15
+.endif
endfunc
.endm
-def_fn_8x8_base
def_fn_8x8_base identity_
+def_fn_8x8_base
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
@@ -1390,14 +1393,16 @@ function inv_txfm_horz\suffix\()_16x8_neon
.endif
.if \identity
identity_8x16_shift2 v0.h[0]
+ b L(horz_16x8_epilog)
.else
blr x4
-.endif
-.if \shift > 0
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
srshr \i, \i, #\shift
.endr
-.endif
+.if \shift == 1
+ b L(horz_16x8_epilog)
+.else
+L(horz_16x8_epilog):
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
@@ -1406,12 +1411,14 @@ function inv_txfm_horz\suffix\()_16x8_neon
.endr
ret x14
+.endif
+.endif
endfunc
.endm
-def_horz_16 scale=0, identity=0, shift=2
def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
+def_horz_16 scale=0, identity=0, shift=2
function inv_txfm_add_vert_8x16_neon
mov x14, x30
@@ -1512,6 +1519,8 @@ function inv_txfm_\variant\()add_16x4_neon
.endr
identity_8x16_shift1 v0.h[0]
+
+ b L(itx_16x4_epilog)
.else
.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
ld1 {\i}, [x2]
@@ -1527,33 +1536,29 @@ function inv_txfm_\variant\()add_16x4_neon
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
srshr \i, \i, #1
.endr
-.endif
- transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
- blr x5
- mov x6, x0
- load_add_store_8x4 x6, x7
-.ifc \variant, identity_
- mov v16.16b, v20.16b
- mov v17.16b, v21.16b
- mov v18.16b, v22.16b
- mov v19.16b, v23.16b
-.else
ins v24.d[1], v28.d[0]
ins v25.d[1], v29.d[0]
ins v26.d[1], v30.d[0]
ins v27.d[1], v31.d[0]
- srshr v16.8h, v24.8h, #1
- srshr v17.8h, v25.8h, #1
- srshr v18.8h, v26.8h, #1
- srshr v19.8h, v27.8h, #1
-.endif
+ srshr v20.8h, v24.8h, #1
+ srshr v21.8h, v25.8h, #1
+ srshr v22.8h, v26.8h, #1
+ srshr v23.8h, v27.8h, #1
+
+L(itx_16x4_epilog):
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
blr x5
+ mov x6, x0
+ load_add_store_8x4 x6, x7
+
+ transpose_4x8h_mov v20, v21, v22, v23, v2, v3, v4, v5, v16, v17, v18, v19
+ blr x5
add x6, x0, #8
load_add_store_8x4 x6, x7
ret x15
+.endif
endfunc
function inv_txfm_\variant\()add_4x16_neon
@@ -1605,12 +1610,14 @@ function inv_txfm_\variant\()add_4x16_neon
mov w16, #(5793-4096)*8
dup v0.4h, w16
identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
+
+ b L(itx_4x16_epilog)
.else
blr x4
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
srshr \i, \i, #1
.endr
-.endif
+L(itx_4x16_epilog):
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
ins v20.d[0], v16.d[1]
ins v21.d[0], v17.d[1]
@@ -1622,11 +1629,12 @@ function inv_txfm_\variant\()add_4x16_neon
load_add_store_4x16 x0, x6
ret x15
+.endif
endfunc
.endm
-def_fn_416_base
def_fn_416_base identity_
+def_fn_416_base
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1634,11 +1642,15 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
idct_dc \w, \h, 1
.endif
.if \w == 4
+.ifnc \txfm1, identity
adr x4, inv_\txfm1\()_8h_x\w\()_neon
+.endif
adr x5, inv_\txfm2\()_4h_x\h\()_neon
mov w13, #\eob_half
.else
+.ifnc \txfm1, identity
adr x4, inv_\txfm1\()_4h_x\w\()_neon
+.endif
adr x5, inv_\txfm2\()_8h_x\h\()_neon
.endif
.ifc \txfm1, identity
@@ -1690,13 +1702,16 @@ function inv_txfm_\variant\()add_16x8_neon
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
identity_8x16_shift1 v0.h[0]
+
+ b L(itx_16x8_epilog)
.else
blr x4
-.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
srshr \i, \i, #1
.endr
-.endif
+
+L(itx_16x8_epilog):
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
blr x5
@@ -1704,27 +1719,7 @@ function inv_txfm_\variant\()add_16x8_neon
mov x6, x0
load_add_store_8x8 x6, x7
-.ifc \variant, identity_
- mov v16.16b, v24.16b
- mov v17.16b, v25.16b
- mov v18.16b, v26.16b
- mov v19.16b, v27.16b
- mov v20.16b, v28.16b
- mov v21.16b, v29.16b
- mov v22.16b, v30.16b
- mov v23.16b, v31.16b
-.else
- srshr v16.8h, v24.8h, #1
- srshr v17.8h, v25.8h, #1
- srshr v18.8h, v26.8h, #1
- srshr v19.8h, v27.8h, #1
- srshr v20.8h, v28.8h, #1
- srshr v21.8h, v29.8h, #1
- srshr v22.8h, v30.8h, #1
- srshr v23.8h, v31.8h, #1
-.endif
-
- transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+ transpose_8x8h_mov v24, v25, v26, v27, v28, v29, v30, v31, v2, v3, v16, v17, v18, v19, v20, v21, v22, v23
blr x5
@@ -1732,6 +1727,7 @@ function inv_txfm_\variant\()add_16x8_neon
load_add_store_8x8 x0, x7
ret x15
+.endif
endfunc
function inv_txfm_\variant\()add_8x16_neon
@@ -1790,14 +1786,16 @@ function inv_txfm_\variant\()add_8x16_neon
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
+
+ b L(itx_8x16_epilog)
.else
blr x4
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
srshr \i, \i, #1
.endr
-.endif
+L(itx_8x16_epilog):
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
blr x5
@@ -1805,18 +1803,21 @@ function inv_txfm_\variant\()add_8x16_neon
load_add_store_8x16 x0, x6
ret x15
+.endif
endfunc
.endm
-def_fn_816_base
def_fn_816_base identity_
+def_fn_816_base
.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc \w, \h, 1
.endif
+.ifnc \txfm1, identity
adr x4, inv_\txfm1\()_8h_x\w\()_neon
+.endif
adr x5, inv_\txfm2\()_8h_x\h\()_neon
.if \w == 8
mov x13, #\eob_half
diff --git a/third_party/dav1d/src/arm/64/itx16.S b/third_party/dav1d/src/arm/64/itx16.S
index eee3a9636d..31ee9be1b4 100644
--- a/third_party/dav1d/src/arm/64/itx16.S
+++ b/third_party/dav1d/src/arm/64/itx16.S
@@ -514,13 +514,17 @@ function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
b L(itx_4x4_end)
endfunc
+// HBD inv_txfm_add_4x4_neon deviates from the common pattern with registers
+// x0-x4 external parameters
+// x5 function pointer to first transform
+// x6 function pointer to second transform
function inv_txfm_add_4x4_neon
movi v30.4s, #0
movi v31.4s, #0
ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
st1 {v30.4s, v31.4s}, [x2], #32
- blr x4
+ blr x5
st1 {v30.4s, v31.4s}, [x2], #32
sqxtn v16.4h, v16.4s
@@ -529,7 +533,7 @@ function inv_txfm_add_4x4_neon
sqxtn v19.4h, v19.4s
transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
- blr x5
+ blr x6
ld1 {v0.d}[0], [x0], x1
ld1 {v0.d}[1], [x0], x1
@@ -541,7 +545,7 @@ function inv_txfm_add_4x4_neon
srshr v18.8h, v18.8h, #4
L(itx_4x4_end):
- mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+ dup v31.8h, w4
sub x0, x0, x1, lsl #2
usqadd v0.8h, v16.8h
usqadd v1.8h, v18.8h
@@ -579,8 +583,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
b L(itx_4x4_end)
1:
.endif
- adr x4, inv_\txfm1\()_4s_x4_neon
- movrel x5, X(inv_\txfm2\()_4h_x4_neon)
+ adr x5, inv_\txfm1\()_4s_x4_neon
+ movrel x6, X(inv_\txfm2\()_4h_x4_neon)
b inv_txfm_add_4x4_neon
endfunc
.endm
@@ -1381,6 +1385,10 @@ function inv_txfm_horz\suffix\()_16x4_neon
sqrshrn2 v21.8h, v29.4s, #\shift
sqrshrn2 v22.8h, v30.4s, #\shift
sqrshrn2 v23.8h, v31.4s, #\shift
+.if \scale
+ b L(horz_16x4_epilog)
+.else
+L(horz_16x4_epilog):
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7
@@ -1389,11 +1397,12 @@ function inv_txfm_horz\suffix\()_16x4_neon
.endr
ret x14
+.endif
endfunc
.endm
-def_horz_16 scale=0, shift=2
def_horz_16 scale=1, shift=1, suffix=_scale
+def_horz_16 scale=0, shift=2
function inv_txfm_add_vert_8x16_neon
mov x14, x30
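
Both the arm/32 and arm/64 16 bpc 4x4 paths above now take the clamp ceiling from the bitdepth_max argument (w4 on arm64, read back from the caller's stack on arm32) instead of the hard-coded 10-bit constant 0x3ff; as the added comment notes, the transform function pointers move to x5/x6 so the argument register stays untouched. A minimal C sketch of the reconstruction clamp being parameterized, assuming dav1d's usual convention that bitdepth_max is (1 << bitdepth) - 1:

    #include <stdint.h>

    /* Sketch of the final add+clip step (not the NEON code itself): the
     * residual is added to the predicted pixel and the sum is clipped to
     * [0, bitdepth_max], i.e. 0x3ff for 10-bit or 0xfff for 12-bit input. */
    static inline uint16_t add_clip_px(uint16_t pred, int residual, int bitdepth_max) {
        int v = pred + residual;
        if (v < 0)            v = 0;
        if (v > bitdepth_max) v = bitdepth_max;
        return (uint16_t)v;
    }
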
diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S
index 9f7b4e7a89..3df0393c3a 100644
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -1154,7 +1154,7 @@ endfunc
uxtl \r6\().8h, \r6\().8b
.endif
.endm
-.macro mul_mla_4 d, s0, s1, s2, s3, wd
+.macro mul_mla_4tap d, s0, s1, s2, s3, wd
mul \d\wd, \s0\wd, v0.h[0]
mla \d\wd, \s1\wd, v0.h[1]
mla \d\wd, \s2\wd, v0.h[2]
@@ -1163,7 +1163,51 @@ endfunc
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
-.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().4h, \s1\().4h, v0.h[1]
+ mla \d0\().4h, \s2\().4h, v0.h[2]
+ mla \d0\().4h, \s3\().4h, v0.h[3]
+ mla \d0\().4h, \s4\().4h, v0.h[4]
+ mla \d0\().4h, \s5\().4h, v0.h[5]
+ mla \d0\().4h, \s6\().4h, v0.h[6]
+.endm
+.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+.endm
+.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mul \d1\().8h, \s2\().8h, v0.h[1]
+ mla \d1\().8h, \s3\().8h, v0.h[2]
+ mla \d1\().8h, \s4\().8h, v0.h[3]
+ mla \d1\().8h, \s5\().8h, v0.h[4]
+ mla \d1\().8h, \s6\().8h, v0.h[5]
+ mla \d1\().8h, \s7\().8h, v0.h[6]
+.endm
+.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+ mul \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mul \d1\().8h, \s3\().8h, v0.h[1]
+ mla \d1\().8h, \s4\().8h, v0.h[2]
+ mla \d1\().8h, \s5\().8h, v0.h[3]
+ mla \d1\().8h, \s6\().8h, v0.h[4]
+ mla \d1\().8h, \s7\().8h, v0.h[5]
+ mla \d1\().8h, \s8\().8h, v0.h[6]
+.endm
+.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
mul \d0\().4h, \s0\().4h, v0.h[0]
mla \d0\().4h, \s1\().4h, v0.h[1]
mla \d0\().4h, \s2\().4h, v0.h[2]
@@ -1173,7 +1217,7 @@ endfunc
mla \d0\().4h, \s6\().4h, v0.h[6]
mla \d0\().4h, \s7\().4h, v0.h[7]
.endm
-.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1183,7 +1227,7 @@ endfunc
mla \d0\().8h, \s6\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7]
.endm
-.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1201,7 +1245,7 @@ endfunc
mla \d1\().8h, \s7\().8h, v0.h[6]
mla \d1\().8h, \s8\().8h, v0.h[7]
.endm
-.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
mul \d0\().8h, \s0\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
@@ -1315,11 +1359,11 @@ endfunc
.endif
.endm
-.macro make_8tap_fn op, type, type_h, type_v
+.macro make_8tap_fn op, type, type_h, type_v, taps
function \op\()_8tap_\type\()_8bpc_neon, export=1
mov x8, \type_h
mov x9, \type_v
- b \op\()_8tap_neon
+ b \op\()_\taps\()_neon
endfunc
.endm
@@ -1328,18 +1372,8 @@ endfunc
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)
-.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
-make_8tap_fn \type, regular, REGULAR, REGULAR
-make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
-make_8tap_fn \type, regular_sharp, REGULAR, SHARP
-make_8tap_fn \type, smooth, SMOOTH, SMOOTH
-make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
-make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
-make_8tap_fn \type, sharp, SHARP, SHARP
-make_8tap_fn \type, sharp_regular, SHARP, REGULAR
-make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
-
-function \type\()_8tap_neon
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps
+function \type\()_\taps\()_neon
mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
mul \mx, \mx, w10
mul \my, \my, w10
@@ -1354,12 +1388,12 @@ function \type\()_8tap_neon
tst \mx, #(0x7f << 14)
sub w8, w8, #24
movrel x10, X(mc_subpel_filters), -8
- b.ne L(\type\()_8tap_h)
+ b.ne L(\type\()_\taps\()_h)
tst \my, #(0x7f << 14)
- b.ne L(\type\()_8tap_v)
+ b.ne L(\type\()_\taps\()_v)
b \type\()_neon
-L(\type\()_8tap_h):
+L(\type\()_\taps\()_h):
cmp \w, #4
ubfx w9, \mx, #7, #7
and \mx, \mx, #0x7f
@@ -1368,9 +1402,9 @@ L(\type\()_8tap_h):
4:
tst \my, #(0x7f << 14)
add \xmx, x10, \mx, uxtw #3
- b.ne L(\type\()_8tap_hv)
+ b.ne L(\type\()_\taps\()_hv)
- adr x9, L(\type\()_8tap_h_tbl)
+ adr x9, L(\type\()_\taps\()_h_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
br x9
@@ -1471,6 +1505,18 @@ L(\type\()_8tap_h):
uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
+.ifc \taps, 6tap
+ ext v19.16b, v16.16b, v17.16b, #2
+ ext v23.16b, v20.16b, v21.16b, #2
+ mul v18.8h, v19.8h, v0.h[1]
+ mul v22.8h, v23.8h, v0.h[1]
+.irpc i, 23456
+ ext v19.16b, v16.16b, v17.16b, #(2*\i)
+ ext v23.16b, v20.16b, v21.16b, #(2*\i)
+ mla v18.8h, v19.8h, v0.h[\i]
+ mla v22.8h, v23.8h, v0.h[\i]
+.endr
+.else // 8tap
mul v18.8h, v16.8h, v0.h[0]
mul v22.8h, v20.8h, v0.h[0]
.irpc i, 1234567
@@ -1479,6 +1525,7 @@ L(\type\()_8tap_h):
mla v18.8h, v19.8h, v0.h[\i]
mla v22.8h, v23.8h, v0.h[\i]
.endr
+.endif
subs \h, \h, #2
srshr v18.8h, v18.8h, #2
srshr v22.8h, v22.8h, #2
@@ -1523,6 +1570,26 @@ L(\type\()_8tap_h):
uxtl v22.8h, v22.8b
16:
+.ifc \taps, 6tap
+ ext v28.16b, v16.16b, v17.16b, #2
+ ext v29.16b, v17.16b, v18.16b, #2
+ ext v30.16b, v20.16b, v21.16b, #2
+ ext v31.16b, v21.16b, v22.16b, #2
+ mul v24.8h, v28.8h, v0.h[1]
+ mul v25.8h, v29.8h, v0.h[1]
+ mul v26.8h, v30.8h, v0.h[1]
+ mul v27.8h, v31.8h, v0.h[1]
+.irpc i, 23456
+ ext v28.16b, v16.16b, v17.16b, #(2*\i)
+ ext v29.16b, v17.16b, v18.16b, #(2*\i)
+ ext v30.16b, v20.16b, v21.16b, #(2*\i)
+ ext v31.16b, v21.16b, v22.16b, #(2*\i)
+ mla v24.8h, v28.8h, v0.h[\i]
+ mla v25.8h, v29.8h, v0.h[\i]
+ mla v26.8h, v30.8h, v0.h[\i]
+ mla v27.8h, v31.8h, v0.h[\i]
+.endr
+.else // 8tap
mul v24.8h, v16.8h, v0.h[0]
mul v25.8h, v17.8h, v0.h[0]
mul v26.8h, v20.8h, v0.h[0]
@@ -1537,6 +1604,7 @@ L(\type\()_8tap_h):
mla v26.8h, v30.8h, v0.h[\i]
mla v27.8h, v31.8h, v0.h[\i]
.endr
+.endif
srshr v24.8h, v24.8h, #2
srshr v25.8h, v25.8h, #2
srshr v26.8h, v26.8h, #2
@@ -1575,18 +1643,18 @@ L(\type\()_8tap_h):
b.gt 161b
ret
-L(\type\()_8tap_h_tbl):
- .hword L(\type\()_8tap_h_tbl) - 1280b
- .hword L(\type\()_8tap_h_tbl) - 640b
- .hword L(\type\()_8tap_h_tbl) - 320b
- .hword L(\type\()_8tap_h_tbl) - 160b
- .hword L(\type\()_8tap_h_tbl) - 80b
- .hword L(\type\()_8tap_h_tbl) - 40b
- .hword L(\type\()_8tap_h_tbl) - 20b
+L(\type\()_\taps\()_h_tbl):
+ .hword L(\type\()_\taps\()_h_tbl) - 1280b
+ .hword L(\type\()_\taps\()_h_tbl) - 640b
+ .hword L(\type\()_\taps\()_h_tbl) - 320b
+ .hword L(\type\()_\taps\()_h_tbl) - 160b
+ .hword L(\type\()_\taps\()_h_tbl) - 80b
+ .hword L(\type\()_\taps\()_h_tbl) - 40b
+ .hword L(\type\()_\taps\()_h_tbl) - 20b
.hword 0
-L(\type\()_8tap_v):
+L(\type\()_\taps\()_v):
cmp \h, #4
ubfx w9, \my, #7, #7
and \my, \my, #0x7f
@@ -1595,7 +1663,7 @@ L(\type\()_8tap_v):
4:
add \xmy, x10, \my, uxtw #3
- adr x9, L(\type\()_8tap_v_tbl)
+ adr x9, L(\type\()_\taps\()_v_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
br x9
@@ -1620,7 +1688,7 @@ L(\type\()_8tap_v):
interleave_1_h v1, v2, v3, v4, v5
b.gt 24f
uxtl_b v1, v2, v3, v4
- mul_mla_4 v6, v1, v2, v3, v4, .4h
+ mul_mla_4tap v6, v1, v2, v3, v4, .4h
sqrshrun_b 6, v6
st_h \d_strd, v6, 2
ret
@@ -1630,7 +1698,7 @@ L(\type\()_8tap_v):
interleave_1_h v5, v6, v7
interleave_2_s v1, v2, v3, v4, v5, v6
uxtl_b v1, v2, v3, v4
- mul_mla_4 v6, v1, v2, v3, v4, .8h
+ mul_mla_4tap v6, v1, v2, v3, v4, .8h
sqrshrun_b 6, v6
st_h \d_strd, v6, 4
ret
@@ -1655,7 +1723,7 @@ L(\type\()_8tap_v):
interleave_1_h v7, v16, v17, v18, v19
interleave_2_s v5, v6, v7, v16, v17, v18
uxtl_b v5, v6, v7, v16
- mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
+ mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_b 6, v30
st_h \d_strd, v30, 4
b.le 0f
@@ -1673,7 +1741,7 @@ L(\type\()_8tap_v):
load_h \sr2, \src, \s_strd, v16, v17
interleave_1_h v7, v16, v17
uxtl_b v5, v6, v7, v16
- mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
+ mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_b 6, v30
st_h \d_strd, v30, 2
0:
@@ -1698,13 +1766,13 @@ L(\type\()_8tap_v):
load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
interleave_1_s v1, v2, v3, v4, v5
uxtl_b v1, v2, v3, v4
- mul_mla_4 v6, v1, v2, v3, v4, .8h
+ mul_mla_4tap v6, v1, v2, v3, v4, .8h
shift_store_4 \type, \d_strd, v6
b.le 0f
load_s \sr2, \src, \s_strd, v6, v7
interleave_1_s v5, v6, v7
uxtl_b v5, v6
- mul_mla_4 v7, v3, v4, v5, v6, .8h
+ mul_mla_4tap v7, v3, v4, v5, v6, .8h
shift_store_4 \type, \d_strd, v7
0:
ret
@@ -1729,28 +1797,28 @@ L(\type\()_8tap_v):
load_s \sr2, \src, \s_strd, v23, v24, v25, v26
interleave_1_s v22, v23, v24, v25, v26
uxtl_b v22, v23, v24, v25
- mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
shift_store_4 \type, \d_strd, v1, v2
b.le 0f
load_s \sr2, \src, \s_strd, v27, v16
subs \h, \h, #2
interleave_1_s v26, v27, v16
uxtl_b v26, v27
- mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
+ mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
shift_store_4 \type, \d_strd, v1
b.le 0f
load_s \sr2, \src, \s_strd, v17, v18
subs \h, \h, #2
interleave_1_s v16, v17, v18
uxtl_b v16, v17
- mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
+ mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
shift_store_4 \type, \d_strd, v2
b.le 0f
subs \h, \h, #4
load_s \sr2, \src, \s_strd, v19, v20, v21, v22
interleave_1_s v18, v19, v20, v21, v22
uxtl_b v18, v19, v20, v21
- mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+ mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
shift_store_4 \type, \d_strd, v1, v2
b.gt 48b
0:
@@ -1773,14 +1841,14 @@ L(\type\()_8tap_v):
load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5
uxtl_b v1, v2, v3, v4, v5
- mul_mla_4 v6, v1, v2, v3, v4, .8h
- mul_mla_4 v7, v2, v3, v4, v5, .8h
+ mul_mla_4tap v6, v1, v2, v3, v4, .8h
+ mul_mla_4tap v7, v2, v3, v4, v5, .8h
shift_store_8 \type, \d_strd, v6, v7
b.le 0f
load_8b \sr2, \src, \s_strd, v6, v7
uxtl_b v6, v7
- mul_mla_4 v1, v3, v4, v5, v6, .8h
- mul_mla_4 v2, v4, v5, v6, v7, .8h
+ mul_mla_4tap v1, v3, v4, v5, v6, .8h
+ mul_mla_4tap v2, v4, v5, v6, v7, .8h
shift_store_8 \type, \d_strd, v1, v2
0:
ret
@@ -1809,32 +1877,32 @@ L(\type\()_8tap_v):
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v23, v24
uxtl_b v23, v24
- mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
+ mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_8 \type, \d_strd, v1, v2
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v25, v26
uxtl_b v25, v26
- mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
+ mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v27, v16
uxtl_b v27, v16
- mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
+ mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
shift_store_8 \type, \d_strd, v1, v2
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v17, v18
uxtl_b v17, v18
- mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
+ mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #4
load_8b \sr2, \src, \s_strd, v19, v20, v21, v22
uxtl_b v19, v20, v21, v22
- mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
- mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
+ mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
+ mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.gt 88b
9:
@@ -1882,10 +1950,10 @@ L(\type\()_8tap_v):
uxtl2 v25.8h, v3.16b
uxtl2 v26.8h, v4.16b
uxtl2 v27.8h, v5.16b
- mul_mla_4 v1, v16, v17, v18, v19, .8h
- mul_mla_4 v16, v17, v18, v19, v20, .8h
- mul_mla_4 v2, v23, v24, v25, v26, .8h
- mul_mla_4 v17, v24, v25, v26, v27, .8h
+ mul_mla_4tap v1, v16, v17, v18, v19, .8h
+ mul_mla_4tap v16, v17, v18, v19, v20, .8h
+ mul_mla_4tap v2, v23, v24, v25, v26, .8h
+ mul_mla_4tap v17, v24, v25, v26, v27, .8h
shift_store_16 \type, \d_strd, v1, v2, v16, v17
b.le 0f
load_16b \sr2, \src, \s_strd, v6, v7
@@ -1893,25 +1961,25 @@ L(\type\()_8tap_v):
uxtl v22.8h, v7.8b
uxtl2 v28.8h, v6.16b
uxtl2 v29.8h, v7.16b
- mul_mla_4 v1, v18, v19, v20, v21, .8h
- mul_mla_4 v3, v19, v20, v21, v22, .8h
- mul_mla_4 v2, v25, v26, v27, v28, .8h
- mul_mla_4 v4, v26, v27, v28, v29, .8h
+ mul_mla_4tap v1, v18, v19, v20, v21, .8h
+ mul_mla_4tap v3, v19, v20, v21, v22, .8h
+ mul_mla_4tap v2, v25, v26, v27, v28, .8h
+ mul_mla_4tap v4, v26, v27, v28, v29, .8h
shift_store_16 \type, \d_strd, v1, v2, v3, v4
0:
ret
-L(\type\()_8tap_v_tbl):
- .hword L(\type\()_8tap_v_tbl) - 1280b
- .hword L(\type\()_8tap_v_tbl) - 640b
- .hword L(\type\()_8tap_v_tbl) - 320b
- .hword L(\type\()_8tap_v_tbl) - 160b
- .hword L(\type\()_8tap_v_tbl) - 80b
- .hword L(\type\()_8tap_v_tbl) - 40b
- .hword L(\type\()_8tap_v_tbl) - 20b
+L(\type\()_\taps\()_v_tbl):
+ .hword L(\type\()_\taps\()_v_tbl) - 1280b
+ .hword L(\type\()_\taps\()_v_tbl) - 640b
+ .hword L(\type\()_\taps\()_v_tbl) - 320b
+ .hword L(\type\()_\taps\()_v_tbl) - 160b
+ .hword L(\type\()_\taps\()_v_tbl) - 80b
+ .hword L(\type\()_\taps\()_v_tbl) - 40b
+ .hword L(\type\()_\taps\()_v_tbl) - 20b
.hword 0
-L(\type\()_8tap_hv):
+L(\type\()_\taps\()_hv):
cmp \h, #4
ubfx w9, \my, #7, #7
and \my, \my, #0x7f
@@ -1920,7 +1988,7 @@ L(\type\()_8tap_hv):
4:
add \xmy, x10, \my, uxtw #3
- adr x9, L(\type\()_8tap_hv_tbl)
+ adr x9, L(\type\()_\taps\()_hv_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
br x9
@@ -1952,13 +2020,13 @@ L(\type\()_8tap_hv):
addp v28.4h, v28.4h, v29.4h
addp v16.4h, v28.4h, v28.4h
srshr v16.4h, v16.4h, #2
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
trn1 v16.2s, v16.2s, v28.2s
mov v17.8b, v28.8b
2:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v28.8b, #4
smull v2.4s, v16.4h, v1.h[0]
@@ -1997,19 +2065,27 @@ L(\type\()_8tap_hv):
addp v16.4h, v28.4h, v28.4h
srshr v16.4h, v16.4h, #2
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
trn1 v16.2s, v16.2s, v28.2s
mov v17.8b, v28.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v28.8b, #4
mov v19.8b, v28.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v20.8b, v19.8b, v28.8b, #4
mov v21.8b, v28.8b
28:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v22.8b, v21.8b, v28.8b, #4
+.ifc \taps, 6tap
+ smull v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -2018,6 +2094,7 @@ L(\type\()_8tap_hv):
smlal v2.4s, v21.4h, v1.h[5]
smlal v2.4s, v22.4h, v1.h[6]
smlal v2.4s, v28.4h, v1.h[7]
+.endif
sqrshrn v2.4h, v2.4s, #\shift_hv
sqxtun v2.8b, v2.8h
@@ -2036,7 +2113,7 @@ L(\type\()_8tap_hv):
0:
ret x15
-L(\type\()_8tap_filter_2):
+L(\type\()_\taps\()_filter_2):
ld1 {v28.8b}, [\sr2], \s_strd
ld1 {v30.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
@@ -2083,12 +2160,12 @@ L(\type\()_8tap_filter_2):
mla v31.4h, v30.4h, v0.h[3]
srshr v16.4h, v31.4h, #2
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v28.8b
mov v18.8b, v29.8b
4:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
@@ -2121,8 +2198,13 @@ L(\type\()_8tap_filter_2):
480: // 4x8, 4x16, 4x32 hv
ld1 {v1.8b}, [\xmy]
sub \src, \src, #1
+.ifc \taps, 6tap
+ sub \sr2, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+.else
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
+.endif
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
@@ -2139,20 +2221,38 @@ L(\type\()_8tap_filter_2):
mla v31.4h, v28.4h, v0.h[1]
mla v31.4h, v29.4h, v0.h[2]
mla v31.4h, v30.4h, v0.h[3]
+.ifc \taps, 6tap
+ srshr v18.4h, v31.4h, #2
+.else
srshr v16.4h, v31.4h, #2
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v28.8b
mov v18.8b, v29.8b
- bl L(\type\()_8tap_filter_4)
+.endif
+ bl L(\type\()_\taps\()_filter_4)
mov v19.8b, v28.8b
mov v20.8b, v29.8b
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v21.8b, v28.8b
mov v22.8b, v29.8b
48:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
+.ifc \taps, 6tap
+ smull v2.4s, v18.4h, v1.h[1]
+ smlal v2.4s, v19.4h, v1.h[2]
+ smlal v2.4s, v20.4h, v1.h[3]
+ smlal v2.4s, v21.4h, v1.h[4]
+ smlal v2.4s, v22.4h, v1.h[5]
+ smlal v2.4s, v28.4h, v1.h[6]
+ smull v3.4s, v19.4h, v1.h[1]
+ smlal v3.4s, v20.4h, v1.h[2]
+ smlal v3.4s, v21.4h, v1.h[3]
+ smlal v3.4s, v22.4h, v1.h[4]
+ smlal v3.4s, v28.4h, v1.h[5]
+ smlal v3.4s, v29.4h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -2169,6 +2269,7 @@ L(\type\()_8tap_filter_2):
smlal v3.4s, v22.4h, v1.h[5]
smlal v3.4s, v28.4h, v1.h[6]
smlal v3.4s, v29.4h, v1.h[7]
+.endif
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn v3.4h, v3.4s, #\shift_hv
subs \h, \h, #2
@@ -2182,8 +2283,10 @@ L(\type\()_8tap_filter_2):
st1 {v3.4h}, [\ds2], \d_strd
.endif
b.le 0f
+.ifc \taps, 8tap
mov v16.8b, v18.8b
mov v17.8b, v19.8b
+.endif
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
@@ -2193,7 +2296,7 @@ L(\type\()_8tap_filter_2):
0:
ret x15
-L(\type\()_8tap_filter_4):
+L(\type\()_\taps\()_filter_4):
ld1 {v26.8b}, [\sr2], \s_strd
ld1 {v27.8b}, [\src], \s_strd
uxtl v26.8h, v26.8b
@@ -2237,15 +2340,15 @@ L(\type\()_8tap_filter_4):
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
- bl L(\type\()_8tap_filter_8_first)
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8_first)
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v24.16b
mov v18.16b, v25.16b
8:
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2303,7 +2406,9 @@ L(\type\()_8tap_filter_4):
ld1 {v0.8b}, [\xmx]
ld1 {v1.8b}, [\xmy]
sub \src, \src, #3
+.ifc \taps, 8tap
sub \src, \src, \s_strd
+.endif
sub \src, \src, \s_strd, lsl #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
@@ -2316,21 +2421,52 @@ L(\type\()_8tap_filter_4):
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
- bl L(\type\()_8tap_filter_8_first)
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8_first)
+.ifc \taps, 6tap
+ mov v18.16b, v16.16b
+.else
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v24.16b
mov v18.16b, v25.16b
- bl L(\type\()_8tap_filter_8)
+.endif
+ bl L(\type\()_\taps\()_filter_8)
mov v19.16b, v24.16b
mov v20.16b, v25.16b
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v21.16b, v24.16b
mov v22.16b, v25.16b
88:
+.ifc \taps, 6tap
+ smull v2.4s, v18.4h, v1.h[1]
+ smull2 v3.4s, v18.8h, v1.h[1]
+ bl L(\type\()_\taps\()_filter_8)
+ smull v4.4s, v19.4h, v1.h[1]
+ smull2 v5.4s, v19.8h, v1.h[1]
+ smlal v2.4s, v19.4h, v1.h[2]
+ smlal2 v3.4s, v19.8h, v1.h[2]
+ smlal v4.4s, v20.4h, v1.h[2]
+ smlal2 v5.4s, v20.8h, v1.h[2]
+ smlal v2.4s, v20.4h, v1.h[3]
+ smlal2 v3.4s, v20.8h, v1.h[3]
+ smlal v4.4s, v21.4h, v1.h[3]
+ smlal2 v5.4s, v21.8h, v1.h[3]
+ smlal v2.4s, v21.4h, v1.h[4]
+ smlal2 v3.4s, v21.8h, v1.h[4]
+ smlal v4.4s, v22.4h, v1.h[4]
+ smlal2 v5.4s, v22.8h, v1.h[4]
+ smlal v2.4s, v22.4h, v1.h[5]
+ smlal2 v3.4s, v22.8h, v1.h[5]
+ smlal v4.4s, v24.4h, v1.h[5]
+ smlal2 v5.4s, v24.8h, v1.h[5]
+ smlal v2.4s, v24.4h, v1.h[6]
+ smlal2 v3.4s, v24.8h, v1.h[6]
+ smlal v4.4s, v25.4h, v1.h[6]
+ smlal2 v5.4s, v25.8h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2361,6 +2497,7 @@ L(\type\()_8tap_filter_4):
smlal2 v3.4s, v24.8h, v1.h[7]
smlal v4.4s, v25.4h, v1.h[7]
smlal2 v5.4s, v25.8h, v1.h[7]
+.endif
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn2 v2.8h, v3.4s, #\shift_hv
sqrshrn v4.4h, v4.4s, #\shift_hv
@@ -2376,8 +2513,10 @@ L(\type\()_8tap_filter_4):
st1 {v4.8h}, [\ds2], \d_strd
.endif
b.le 9f
+.ifc \taps, 8tap
mov v16.16b, v18.16b
mov v17.16b, v19.16b
+.endif
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v20.16b, v22.16b
@@ -2399,14 +2538,32 @@ L(\type\()_8tap_filter_4):
.else
add \dst, \dst, #16
.endif
+.ifc \taps, 6tap
+ add \src, \src, \s_strd, lsl #1
+.endif
b 168b
0:
ret x15
-L(\type\()_8tap_filter_8_first):
+L(\type\()_\taps\()_filter_8_first):
ld1 {v28.8b, v29.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
+.ifc \taps, 6tap
+ ext v24.16b, v28.16b, v29.16b, #(2*1)
+ ext v25.16b, v28.16b, v29.16b, #(2*2)
+ ext v26.16b, v28.16b, v29.16b, #(2*3)
+ ext v27.16b, v28.16b, v29.16b, #(2*4)
+ mul v16.8h, v24.8h, v0.h[1]
+ mla v16.8h, v25.8h, v0.h[2]
+ mla v16.8h, v26.8h, v0.h[3]
+ mla v16.8h, v27.8h, v0.h[4]
+ ext v24.16b, v28.16b, v29.16b, #(2*5)
+ ext v25.16b, v28.16b, v29.16b, #(2*6)
+ ext v26.16b, v28.16b, v29.16b, #(2*7)
+ mla v16.8h, v24.8h, v0.h[5]
+ mla v16.8h, v25.8h, v0.h[6]
+.else // 8tap
mul v16.8h, v28.8h, v0.h[0]
ext v24.16b, v28.16b, v29.16b, #(2*1)
ext v25.16b, v28.16b, v29.16b, #(2*2)
@@ -2422,16 +2579,29 @@ L(\type\()_8tap_filter_8_first):
mla v16.8h, v24.8h, v0.h[5]
mla v16.8h, v25.8h, v0.h[6]
mla v16.8h, v26.8h, v0.h[7]
+.endif
srshr v16.8h, v16.8h, #2
ret
-L(\type\()_8tap_filter_8):
+L(\type\()_\taps\()_filter_8):
ld1 {v28.8b, v29.8b}, [\sr2], \s_strd
ld1 {v30.8b, v31.8b}, [\src], \s_strd
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
uxtl v30.8h, v30.8b
uxtl v31.8h, v31.8b
+.ifc \taps, 6tap
+ ext v26.16b, v28.16b, v29.16b, #2
+ ext v27.16b, v30.16b, v31.16b, #2
+ mul v24.8h, v26.8h, v0.h[1]
+ mul v25.8h, v27.8h, v0.h[1]
+.irpc i, 23456
+ ext v26.16b, v28.16b, v29.16b, #(2*\i)
+ ext v27.16b, v30.16b, v31.16b, #(2*\i)
+ mla v24.8h, v26.8h, v0.h[\i]
+ mla v25.8h, v27.8h, v0.h[\i]
+.endr
+.else // 8tap
mul v24.8h, v28.8h, v0.h[0]
mul v25.8h, v30.8h, v0.h[0]
.irpc i, 1234567
@@ -2440,22 +2610,25 @@ L(\type\()_8tap_filter_8):
mla v24.8h, v26.8h, v0.h[\i]
mla v25.8h, v27.8h, v0.h[\i]
.endr
+.endif
srshr v24.8h, v24.8h, #2
srshr v25.8h, v25.8h, #2
ret
-L(\type\()_8tap_hv_tbl):
- .hword L(\type\()_8tap_hv_tbl) - 1280b
- .hword L(\type\()_8tap_hv_tbl) - 640b
- .hword L(\type\()_8tap_hv_tbl) - 320b
- .hword L(\type\()_8tap_hv_tbl) - 160b
- .hword L(\type\()_8tap_hv_tbl) - 80b
- .hword L(\type\()_8tap_hv_tbl) - 40b
- .hword L(\type\()_8tap_hv_tbl) - 20b
+L(\type\()_\taps\()_hv_tbl):
+ .hword L(\type\()_\taps\()_hv_tbl) - 1280b
+ .hword L(\type\()_\taps\()_hv_tbl) - 640b
+ .hword L(\type\()_\taps\()_hv_tbl) - 320b
+ .hword L(\type\()_\taps\()_hv_tbl) - 160b
+ .hword L(\type\()_\taps\()_hv_tbl) - 80b
+ .hword L(\type\()_\taps\()_hv_tbl) - 40b
+ .hword L(\type\()_\taps\()_hv_tbl) - 20b
.hword 0
endfunc
+.endm
+.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
function \type\()_bilin_8bpc_neon, export=1
dup v1.16b, \mx
dup v3.16b, \my
@@ -2987,8 +3160,34 @@ L(\type\()_bilin_hv_tbl):
endfunc
.endm
-filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
-filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn put, sharp, SHARP, SHARP, 8tap
+make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap
+
+make_8tap_fn put, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap
+filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
+
+make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn prep, sharp, SHARP, SHARP, 8tap
+make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 8tap
+
+make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6, 6tap
+filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+
.macro load_filter_row dst, src, inc
asr w13, \src, #10
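The instantiation list above shows the split for the 8 bpc case: the filter combinations involving SHARP keep the full 8tap code paths, while the regular/smooth combinations are built as 6tap variants whose `.ifc \taps, 6tap` branches drop two of the eight multiply-accumulates, since the outer taps of those filters are zero. A scalar sketch of what the two horizontal branches compute; the function and parameter names are illustrative, not dav1d's:

```c
#include <stdint.h>

/* Sketch only: the .ifc \taps, 6tap horizontal branch vs the 8tap one.
 * For regular/smooth filters fh[0] and fh[7] are zero, so the 6tap form
 * starts at tap 1 and stops at tap 6. */
static void h_filter_row_sketch(int16_t *mid, const uint8_t *src, int w,
                                const int16_t fh[8], int taps)
{
    const int first = (taps == 6) ? 1 : 0;
    const int last  = (taps == 6) ? 6 : 7;
    for (int x = 0; x < w; x++) {
        int sum = 0;
        for (int k = first; k <= last; k++)
            sum += fh[k] * src[x + k];      /* mul/mla over the taps */
        mid[x] = (int16_t)((sum + 2) >> 2); /* srshr #2, as in filter_8 */
    }
}
```

Only the entry points that can select a SHARP filter are routed to the 8tap instantiation; everything else branches into the cheaper 6tap copy via make_8tap_fn.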
diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S
index 1bfb12ebb3..576fab158a 100644
--- a/third_party/dav1d/src/arm/64/mc16.S
+++ b/third_party/dav1d/src/arm/64/mc16.S
@@ -1374,19 +1374,35 @@ endfunc
sub \r3\wd, \r3\wd, \c\wd
.endif
.endm
-.macro smull_smlal_4 d, s0, s1, s2, s3
+.macro smull_smlal_4tap d, s0, s1, s2, s3
smull \d\().4s, \s0\().4h, v0.h[0]
smlal \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
smlal \d\().4s, \s3\().4h, v0.h[3]
.endm
-.macro smull2_smlal2_4 d, s0, s1, s2, s3
+.macro smull2_smlal2_4tap d, s0, s1, s2, s3
smull2 \d\().4s, \s0\().8h, v0.h[0]
smlal2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
smlal2 \d\().4s, \s3\().8h, v0.h[3]
.endm
-.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull \d\().4s, \s1\().4h, v0.h[1]
+ smlal \d\().4s, \s2\().4h, v0.h[2]
+ smlal \d\().4s, \s3\().4h, v0.h[3]
+ smlal \d\().4s, \s4\().4h, v0.h[4]
+ smlal \d\().4s, \s5\().4h, v0.h[5]
+ smlal \d\().4s, \s6\().4h, v0.h[6]
+.endm
+.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull2 \d\().4s, \s1\().8h, v0.h[1]
+ smlal2 \d\().4s, \s2\().8h, v0.h[2]
+ smlal2 \d\().4s, \s3\().8h, v0.h[3]
+ smlal2 \d\().4s, \s4\().8h, v0.h[4]
+ smlal2 \d\().4s, \s5\().8h, v0.h[5]
+ smlal2 \d\().4s, \s6\().8h, v0.h[6]
+.endm
+.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull \d\().4s, \s0\().4h, v0.h[0]
smlal \d\().4s, \s1\().4h, v0.h[1]
smlal \d\().4s, \s2\().4h, v0.h[2]
@@ -1396,7 +1412,7 @@ endfunc
smlal \d\().4s, \s6\().4h, v0.h[6]
smlal \d\().4s, \s7\().4h, v0.h[7]
.endm
-.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
smull2 \d\().4s, \s0\().8h, v0.h[0]
smlal2 \d\().4s, \s1\().8h, v0.h[1]
smlal2 \d\().4s, \s2\().8h, v0.h[2]
@@ -1499,11 +1515,11 @@ endfunc
st1 {\r0\().8h, \r1\().8h}, [\dst], \strd
.endm
-.macro make_8tap_fn op, type, type_h, type_v
+.macro make_8tap_fn op, type, type_h, type_v, taps
function \op\()_8tap_\type\()_16bpc_neon, export=1
mov w9, \type_h
mov w10, \type_v
- b \op\()_8tap_neon
+ b \op\()_\taps\()_neon
endfunc
.endm
@@ -1512,18 +1528,8 @@ endfunc
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)
-.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
-make_8tap_fn \type, regular, REGULAR, REGULAR
-make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
-make_8tap_fn \type, regular_sharp, REGULAR, SHARP
-make_8tap_fn \type, smooth, SMOOTH, SMOOTH
-make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
-make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
-make_8tap_fn \type, sharp, SHARP, SHARP
-make_8tap_fn \type, sharp_regular, SHARP, REGULAR
-make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
-
-function \type\()_8tap_neon
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
+function \type\()_\taps\()_neon
.ifc \bdmax, w8
ldr w8, [sp]
.endif
@@ -1547,12 +1553,12 @@ function \type\()_8tap_neon
add w13, w12, \bdmax // 6 + intermediate_bits
sub w12, w12, \bdmax // 6 - intermediate_bits
movrel x11, X(mc_subpel_filters), -8
- b.ne L(\type\()_8tap_h)
+ b.ne L(\type\()_\taps\()_h)
tst \my, #(0x7f << 14)
- b.ne L(\type\()_8tap_v)
+ b.ne L(\type\()_\taps\()_v)
b \type\()_neon
-L(\type\()_8tap_h):
+L(\type\()_\taps\()_h):
cmp \w, #4
ubfx w10, \mx, #7, #7
and \mx, \mx, #0x7f
@@ -1561,9 +1567,9 @@ L(\type\()_8tap_h):
4:
tst \my, #(0x7f << 14)
add \xmx, x11, \mx, uxtw #3
- b.ne L(\type\()_8tap_hv)
+ b.ne L(\type\()_\taps\()_hv)
- adr x10, L(\type\()_8tap_h_tbl)
+ adr x10, L(\type\()_\taps\()_h_tbl)
dup v30.4s, w12 // 6 - intermediate_bits
ldrh w9, [x10, x9, lsl #1]
neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -1682,6 +1688,22 @@ L(\type\()_8tap_h):
mov \mx, \w
8:
+.ifc \taps, 6tap
+ ext v24.16b, v16.16b, v17.16b, #2
+ ext v25.16b, v20.16b, v21.16b, #2
+ smull v18.4s, v24.4h, v0.h[1]
+ smull2 v19.4s, v24.8h, v0.h[1]
+ smull v22.4s, v25.4h, v0.h[1]
+ smull2 v23.4s, v25.8h, v0.h[1]
+.irpc i, 23456
+ ext v24.16b, v16.16b, v17.16b, #(2*\i)
+ ext v25.16b, v20.16b, v21.16b, #(2*\i)
+ smlal v18.4s, v24.4h, v0.h[\i]
+ smlal2 v19.4s, v24.8h, v0.h[\i]
+ smlal v22.4s, v25.4h, v0.h[\i]
+ smlal2 v23.4s, v25.8h, v0.h[\i]
+.endr
+.else // 8tap
smull v18.4s, v16.4h, v0.h[0]
smull2 v19.4s, v16.8h, v0.h[0]
smull v22.4s, v20.4h, v0.h[0]
@@ -1694,6 +1716,7 @@ L(\type\()_8tap_h):
smlal v22.4s, v25.4h, v0.h[\i]
smlal2 v23.4s, v25.8h, v0.h[\i]
.endr
+.endif
subs \mx, \mx, #8
srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
@@ -1734,18 +1757,18 @@ L(\type\()_8tap_h):
b.gt 81b
ret
-L(\type\()_8tap_h_tbl):
- .hword L(\type\()_8tap_h_tbl) - 1280b
- .hword L(\type\()_8tap_h_tbl) - 640b
- .hword L(\type\()_8tap_h_tbl) - 320b
- .hword L(\type\()_8tap_h_tbl) - 160b
- .hword L(\type\()_8tap_h_tbl) - 80b
- .hword L(\type\()_8tap_h_tbl) - 40b
- .hword L(\type\()_8tap_h_tbl) - 20b
+L(\type\()_\taps\()_h_tbl):
+ .hword L(\type\()_\taps\()_h_tbl) - 1280b
+ .hword L(\type\()_\taps\()_h_tbl) - 640b
+ .hword L(\type\()_\taps\()_h_tbl) - 320b
+ .hword L(\type\()_\taps\()_h_tbl) - 160b
+ .hword L(\type\()_\taps\()_h_tbl) - 80b
+ .hword L(\type\()_\taps\()_h_tbl) - 40b
+ .hword L(\type\()_\taps\()_h_tbl) - 20b
.hword 0
-L(\type\()_8tap_v):
+L(\type\()_\taps\()_v):
cmp \h, #4
ubfx w10, \my, #7, #7
and \my, \my, #0x7f
@@ -1758,7 +1781,7 @@ L(\type\()_8tap_v):
dup v30.4s, w12 // 6 - intermediate_bits
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
- adr x10, L(\type\()_8tap_v_tbl)
+ adr x10, L(\type\()_\taps\()_v_tbl)
ldrh w9, [x10, x9, lsl #1]
.ifc \type, prep
neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -1785,7 +1808,7 @@ L(\type\()_8tap_v):
load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
interleave_1_s v1, v2, v3, v4, v5
b.gt 24f
- smull_smlal_4 v6, v1, v2, v3, v4
+ smull_smlal_4tap v6, v1, v2, v3, v4
sqrshrun_h 6, v6
umin_h v31, .8h, v6
st_s \d_strd, v6, 2
@@ -1794,8 +1817,8 @@ L(\type\()_8tap_v):
24: // 2x4 v
load_s \sr2, \src, \s_strd, v6, v7
interleave_1_s v5, v6, v7
- smull_smlal_4 v16, v1, v2, v3, v4
- smull_smlal_4 v17, v3, v4, v5, v6
+ smull_smlal_4tap v16, v1, v2, v3, v4
+ smull_smlal_4tap v17, v3, v4, v5, v6
sqrshrun_h 6, v16, v17
umin_h v31, .8h, v16
st_s \d_strd, v16, 4
@@ -1817,8 +1840,8 @@ L(\type\()_8tap_v):
subs \h, \h, #4
load_s \sr2, \src, \s_strd, v16, v17, v18, v19
interleave_1_s v7, v16, v17, v18, v19
- smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
- smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18
+ smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
+ smull_smlal_\taps v25, v3, v4, v5, v6, v7, v16, v17, v18
sqrshrun_h 6, v24, v25
umin_h v31, .8h, v24
st_s \d_strd, v24, 4
@@ -1836,7 +1859,7 @@ L(\type\()_8tap_v):
26:
load_s \sr2, \src, \s_strd, v16, v17
interleave_1_s v7, v16, v17
- smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
+ smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16
sqrshrun_h 6, v24
umin_h v31, .4h, v24
st_s \d_strd, v24, 2
@@ -1860,13 +1883,13 @@ L(\type\()_8tap_v):
sxtl v0.8h, v0.8b
load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
- smull_smlal_4 v6, v1, v2, v3, v4
- smull_smlal_4 v7, v2, v3, v4, v5
+ smull_smlal_4tap v6, v1, v2, v3, v4
+ smull_smlal_4tap v7, v2, v3, v4, v5
shift_store_4 \type, \d_strd, v6, v7
b.le 0f
load_4h \sr2, \src, \s_strd, v6, v7
- smull_smlal_4 v1, v3, v4, v5, v6
- smull_smlal_4 v2, v4, v5, v6, v7
+ smull_smlal_4tap v1, v3, v4, v5, v6
+ smull_smlal_4tap v2, v4, v5, v6, v7
shift_store_4 \type, \d_strd, v1, v2
0:
ret
@@ -1885,10 +1908,10 @@ L(\type\()_8tap_v):
48:
subs \h, \h, #4
load_4h \sr2, \src, \s_strd, v23, v24, v25, v26
- smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
- smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
- smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25
- smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_4 \type, \d_strd, v1, v2, v3, v4
b.le 0f
cmp \h, #2
@@ -1903,8 +1926,8 @@ L(\type\()_8tap_v):
b 48b
46:
load_4h \sr2, \src, \s_strd, v23, v24
- smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
- smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_4 \type, \d_strd, v1, v2
0:
ret
@@ -1925,17 +1948,17 @@ L(\type\()_8tap_v):
sxtl v0.8h, v0.8b
load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
- smull_smlal_4 v16, v1, v2, v3, v4
- smull2_smlal2_4 v17, v1, v2, v3, v4
- smull_smlal_4 v18, v2, v3, v4, v5
- smull2_smlal2_4 v19, v2, v3, v4, v5
+ smull_smlal_4tap v16, v1, v2, v3, v4
+ smull2_smlal2_4tap v17, v1, v2, v3, v4
+ smull_smlal_4tap v18, v2, v3, v4, v5
+ smull2_smlal2_4tap v19, v2, v3, v4, v5
shift_store_8 \type, \d_strd, v16, v17, v18, v19
b.le 0f
load_8h \sr2, \src, \s_strd, v6, v7
- smull_smlal_4 v16, v3, v4, v5, v6
- smull2_smlal2_4 v17, v3, v4, v5, v6
- smull_smlal_4 v18, v4, v5, v6, v7
- smull2_smlal2_4 v19, v4, v5, v6, v7
+ smull_smlal_4tap v16, v3, v4, v5, v6
+ smull2_smlal2_4tap v17, v3, v4, v5, v6
+ smull_smlal_4tap v18, v4, v5, v6, v7
+ smull2_smlal2_4tap v19, v4, v5, v6, v7
shift_store_8 \type, \d_strd, v16, v17, v18, v19
0:
ret
@@ -1962,18 +1985,18 @@ L(\type\()_8tap_v):
88:
subs \h, \h, #2
load_8h \sr2, \src, \s_strd, v23, v24
- smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
- smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23
- smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24
- smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_\taps v3, v17, v18, v19, v20, v21, v22, v23, v24
+ smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.le 9f
subs \h, \h, #2
load_8h \sr2, \src, \s_strd, v25, v26
- smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25
- smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25
- smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26
- smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ smull_smlal_\taps v1, v18, v19, v20, v21, v22, v23, v24, v25
+ smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_\taps v3, v19, v20, v21, v22, v23, v24, v25, v26
+ smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_8 \type, \d_strd, v1, v2, v3, v4
b.le 9f
mov v16.16b, v20.16b
@@ -2013,10 +2036,10 @@ L(\type\()_8tap_v):
16:
load_16h \src, \src, \s_strd, v22, v23
subs \h, \h, #1
- smull_smlal_4 v1, v16, v18, v20, v22
- smull2_smlal2_4 v2, v16, v18, v20, v22
- smull_smlal_4 v3, v17, v19, v21, v23
- smull2_smlal2_4 v4, v17, v19, v21, v23
+ smull_smlal_4tap v1, v16, v18, v20, v22
+ smull2_smlal2_4tap v2, v16, v18, v20, v22
+ smull_smlal_4tap v3, v17, v19, v21, v23
+ smull2_smlal2_4tap v4, v17, v19, v21, v23
shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4
b.le 0f
mov v16.16b, v18.16b
@@ -2029,17 +2052,17 @@ L(\type\()_8tap_v):
0:
ret
-L(\type\()_8tap_v_tbl):
- .hword L(\type\()_8tap_v_tbl) - 1280b
- .hword L(\type\()_8tap_v_tbl) - 640b
- .hword L(\type\()_8tap_v_tbl) - 320b
- .hword L(\type\()_8tap_v_tbl) - 160b
- .hword L(\type\()_8tap_v_tbl) - 80b
- .hword L(\type\()_8tap_v_tbl) - 40b
- .hword L(\type\()_8tap_v_tbl) - 20b
+L(\type\()_\taps\()_v_tbl):
+ .hword L(\type\()_\taps\()_v_tbl) - 1280b
+ .hword L(\type\()_\taps\()_v_tbl) - 640b
+ .hword L(\type\()_\taps\()_v_tbl) - 320b
+ .hword L(\type\()_\taps\()_v_tbl) - 160b
+ .hword L(\type\()_\taps\()_v_tbl) - 80b
+ .hword L(\type\()_\taps\()_v_tbl) - 40b
+ .hword L(\type\()_\taps\()_v_tbl) - 20b
.hword 0
-L(\type\()_8tap_hv):
+L(\type\()_\taps\()_hv):
cmp \h, #4
ubfx w10, \my, #7, #7
and \my, \my, #0x7f
@@ -2048,7 +2071,7 @@ L(\type\()_8tap_hv):
4:
add \xmy, x11, \my, uxtw #3
- adr x10, L(\type\()_8tap_hv_tbl)
+ adr x10, L(\type\()_\taps\()_hv_tbl)
dup v30.4s, w12 // 6 - intermediate_bits
ldrh w9, [x10, x9, lsl #1]
neg v30.4s, v30.4s // -(6-intermediate_bits)
@@ -2089,7 +2112,7 @@ L(\type\()_8tap_hv):
addp v27.4s, v27.4s, v28.4s
addp v16.4s, v27.4s, v27.4s
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
// The intermediates from the horizontal pass fit in 16 bit without
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out of order cores
@@ -2100,7 +2123,7 @@ L(\type\()_8tap_hv):
mov v17.8b, v24.8b
2:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v24.8b, #4
smull v2.4s, v16.4h, v1.h[0]
@@ -2143,20 +2166,28 @@ L(\type\()_8tap_hv):
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
xtn v16.4h, v16.4s
trn1 v16.2s, v16.2s, v24.2s
mov v17.8b, v24.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v18.8b, v17.8b, v24.8b, #4
mov v19.8b, v24.8b
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v20.8b, v19.8b, v24.8b, #4
mov v21.8b, v24.8b
28:
- bl L(\type\()_8tap_filter_2)
+ bl L(\type\()_\taps\()_filter_2)
ext v22.8b, v21.8b, v24.8b, #4
+.ifc \taps, 6tap
+ smull v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+.else // 8tap
smull v3.4s, v16.4h, v1.h[0]
smlal v3.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[2]
@@ -2165,6 +2196,7 @@ L(\type\()_8tap_hv):
smlal v3.4s, v21.4h, v1.h[5]
smlal v3.4s, v22.4h, v1.h[6]
smlal v3.4s, v24.4h, v1.h[7]
+.endif
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
sqxtun v3.4h, v3.4s
@@ -2184,7 +2216,7 @@ L(\type\()_8tap_hv):
0:
ret x15
-L(\type\()_8tap_filter_2):
+L(\type\()_\taps\()_filter_2):
ld1 {v25.8h}, [\sr2], \s_strd
ld1 {v27.8h}, [\src], \s_strd
ext v26.16b, v25.16b, v25.16b, #2
@@ -2234,12 +2266,12 @@ L(\type\()_8tap_filter_2):
// (at the cost of a smaller slowdown on in-order cores such as A53).
xtn v16.4h, v16.4s
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v24.8b
mov v18.8b, v25.8b
4:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -2272,8 +2304,13 @@ L(\type\()_8tap_filter_2):
480: // 4x8, 4x16, 4x32 hv
ld1 {v1.8b}, [\xmy]
sub \src, \src, #2
+.ifc \taps, 6tap
+ sub \sr2, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+.else
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
+.endif
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
@@ -2294,20 +2331,38 @@ L(\type\()_8tap_filter_2):
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
+.ifc \taps, 6tap
+ xtn v18.4h, v16.4s
+.else
xtn v16.4h, v16.4s
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v17.8b, v24.8b
mov v18.8b, v25.8b
- bl L(\type\()_8tap_filter_4)
+.endif
+ bl L(\type\()_\taps\()_filter_4)
mov v19.8b, v24.8b
mov v20.8b, v25.8b
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
mov v21.8b, v24.8b
mov v22.8b, v25.8b
48:
- bl L(\type\()_8tap_filter_4)
+ bl L(\type\()_\taps\()_filter_4)
+.ifc \taps, 6tap
+ smull v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v19.4h, v1.h[2]
+ smlal v3.4s, v20.4h, v1.h[3]
+ smlal v3.4s, v21.4h, v1.h[4]
+ smlal v3.4s, v22.4h, v1.h[5]
+ smlal v3.4s, v24.4h, v1.h[6]
+ smull v4.4s, v19.4h, v1.h[1]
+ smlal v4.4s, v20.4h, v1.h[2]
+ smlal v4.4s, v21.4h, v1.h[3]
+ smlal v4.4s, v22.4h, v1.h[4]
+ smlal v4.4s, v24.4h, v1.h[5]
+ smlal v4.4s, v25.4h, v1.h[6]
+.else // 8tap
smull v3.4s, v16.4h, v1.h[0]
smlal v3.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[2]
@@ -2324,6 +2379,7 @@ L(\type\()_8tap_filter_2):
smlal v4.4s, v22.4h, v1.h[5]
smlal v4.4s, v24.4h, v1.h[6]
smlal v4.4s, v25.4h, v1.h[7]
+.endif
.ifc \type, put
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
@@ -2339,8 +2395,10 @@ L(\type\()_8tap_filter_2):
st1 {v3.d}[0], [\dst], \d_strd
st1 {v3.d}[1], [\ds2], \d_strd
b.le 0f
+.ifc \taps, 8tap
mov v16.8b, v18.8b
mov v17.8b, v19.8b
+.endif
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
@@ -2350,7 +2408,7 @@ L(\type\()_8tap_filter_2):
0:
ret x15
-L(\type\()_8tap_filter_4):
+L(\type\()_\taps\()_filter_4):
ld1 {v24.8h}, [\sr2], \s_strd
ld1 {v25.8h}, [\src], \s_strd
ext v26.16b, v24.16b, v24.16b, #2
@@ -2411,14 +2469,14 @@ L(\type\()_8tap_filter_4):
// and conserves register space (no need to clobber v8-v15).
uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v23.16b
mov v18.16b, v24.16b
8:
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2480,7 +2538,9 @@ L(\type\()_8tap_filter_4):
ld1 {v0.8b}, [\xmx]
ld1 {v1.8b}, [\xmy]
sub \src, \src, #6
+.ifc \taps, 8tap
sub \src, \src, \s_strd
+.endif
sub \src, \src, \s_strd, lsl #1
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
@@ -2494,6 +2554,16 @@ L(\type\()_8tap_filter_4):
lsl \s_strd, \s_strd, #1
ld1 {v27.8h, v28.8h}, [\src], \s_strd
+.ifc \taps, 6tap
+ ext v26.16b, v27.16b, v28.16b, #2
+ smull v24.4s, v26.4h, v0.h[1]
+ smull2 v25.4s, v26.8h, v0.h[1]
+.irpc i, 23456
+ ext v26.16b, v27.16b, v28.16b, #(2*\i)
+ smlal v24.4s, v26.4h, v0.h[\i]
+ smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
+.else // 8tap
smull v24.4s, v27.4h, v0.h[0]
smull2 v25.4s, v27.8h, v0.h[0]
.irpc i, 1234567
@@ -2501,6 +2571,7 @@ L(\type\()_8tap_filter_4):
smlal v24.4s, v26.4h, v0.h[\i]
smlal2 v25.4s, v26.8h, v0.h[\i]
.endr
+.endif
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
// The intermediates from the horizontal pass fit in 16 bit without
@@ -2508,22 +2579,53 @@ L(\type\()_8tap_filter_4):
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53),
// and conserves register space (no need to clobber v8-v15).
+.ifc \taps, 6tap
+ uzp1 v18.8h, v24.8h, v25.8h // Same as xtn, xtn2
+.else
uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v17.16b, v23.16b
mov v18.16b, v24.16b
- bl L(\type\()_8tap_filter_8)
+.endif
+ bl L(\type\()_\taps\()_filter_8)
mov v19.16b, v23.16b
mov v20.16b, v24.16b
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
mov v21.16b, v23.16b
mov v22.16b, v24.16b
88:
+.ifc \taps, 6tap
+ smull v2.4s, v18.4h, v1.h[1]
+ smull2 v3.4s, v18.8h, v1.h[1]
+ bl L(\type\()_\taps\()_filter_8)
+ smull v4.4s, v19.4h, v1.h[1]
+ smull2 v5.4s, v19.8h, v1.h[1]
+ smlal v2.4s, v19.4h, v1.h[2]
+ smlal2 v3.4s, v19.8h, v1.h[2]
+ smlal v4.4s, v20.4h, v1.h[2]
+ smlal2 v5.4s, v20.8h, v1.h[2]
+ smlal v2.4s, v20.4h, v1.h[3]
+ smlal2 v3.4s, v20.8h, v1.h[3]
+ smlal v4.4s, v21.4h, v1.h[3]
+ smlal2 v5.4s, v21.8h, v1.h[3]
+ smlal v2.4s, v21.4h, v1.h[4]
+ smlal2 v3.4s, v21.8h, v1.h[4]
+ smlal v4.4s, v22.4h, v1.h[4]
+ smlal2 v5.4s, v22.8h, v1.h[4]
+ smlal v2.4s, v22.4h, v1.h[5]
+ smlal2 v3.4s, v22.8h, v1.h[5]
+ smlal v4.4s, v23.4h, v1.h[5]
+ smlal2 v5.4s, v23.8h, v1.h[5]
+ smlal v2.4s, v23.4h, v1.h[6]
+ smlal2 v3.4s, v23.8h, v1.h[6]
+ smlal v4.4s, v24.4h, v1.h[6]
+ smlal2 v5.4s, v24.8h, v1.h[6]
+.else // 8tap
smull v2.4s, v16.4h, v1.h[0]
smull2 v3.4s, v16.8h, v1.h[0]
- bl L(\type\()_8tap_filter_8)
+ bl L(\type\()_\taps\()_filter_8)
smull v4.4s, v17.4h, v1.h[0]
smull2 v5.4s, v17.8h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
@@ -2554,6 +2656,7 @@ L(\type\()_8tap_filter_4):
smlal2 v3.4s, v23.8h, v1.h[7]
smlal v4.4s, v24.4h, v1.h[7]
smlal2 v5.4s, v24.8h, v1.h[7]
+.endif
.ifc \type, put
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
@@ -2577,8 +2680,10 @@ L(\type\()_8tap_filter_4):
st1 {v2.8h}, [\dst], \d_strd
st1 {v3.8h}, [\ds2], \d_strd
b.le 9f
+.ifc \taps, 8tap
mov v16.16b, v18.16b
mov v17.16b, v19.16b
+.endif
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v20.16b, v22.16b
@@ -2596,13 +2701,32 @@ L(\type\()_8tap_filter_4):
mov \h, \my
add \src, \src, #16
add \dst, \dst, #16
+.ifc \taps, 6tap
+ add \src, \src, \s_strd, lsl #1
+.endif
b 168b
0:
ret x15
-L(\type\()_8tap_filter_8):
+L(\type\()_\taps\()_filter_8):
ld1 {v4.8h, v5.8h}, [\sr2], \s_strd
ld1 {v6.8h, v7.8h}, [\src], \s_strd
+.ifc \taps, 6tap
+ ext v23.16b, v4.16b, v5.16b, #2
+ ext v24.16b, v6.16b, v7.16b, #2
+ smull v25.4s, v23.4h, v0.h[1]
+ smull2 v26.4s, v23.8h, v0.h[1]
+ smull v27.4s, v24.4h, v0.h[1]
+ smull2 v28.4s, v24.8h, v0.h[1]
+.irpc i, 23456
+ ext v23.16b, v4.16b, v5.16b, #(2*\i)
+ ext v24.16b, v6.16b, v7.16b, #(2*\i)
+ smlal v25.4s, v23.4h, v0.h[\i]
+ smlal2 v26.4s, v23.8h, v0.h[\i]
+ smlal v27.4s, v24.4h, v0.h[\i]
+ smlal2 v28.4s, v24.8h, v0.h[\i]
+.endr
+.else // 8tap
smull v25.4s, v4.4h, v0.h[0]
smull2 v26.4s, v4.8h, v0.h[0]
smull v27.4s, v6.4h, v0.h[0]
@@ -2615,6 +2739,7 @@ L(\type\()_8tap_filter_8):
smlal v27.4s, v24.4h, v0.h[\i]
smlal2 v28.4s, v24.8h, v0.h[\i]
.endr
+.endif
srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
@@ -2623,18 +2748,20 @@ L(\type\()_8tap_filter_8):
uzp1 v24.8h, v27.8h, v28.8h // Ditto
ret
-L(\type\()_8tap_hv_tbl):
- .hword L(\type\()_8tap_hv_tbl) - 1280b
- .hword L(\type\()_8tap_hv_tbl) - 640b
- .hword L(\type\()_8tap_hv_tbl) - 320b
- .hword L(\type\()_8tap_hv_tbl) - 160b
- .hword L(\type\()_8tap_hv_tbl) - 80b
- .hword L(\type\()_8tap_hv_tbl) - 40b
- .hword L(\type\()_8tap_hv_tbl) - 20b
+L(\type\()_\taps\()_hv_tbl):
+ .hword L(\type\()_\taps\()_hv_tbl) - 1280b
+ .hword L(\type\()_\taps\()_hv_tbl) - 640b
+ .hword L(\type\()_\taps\()_hv_tbl) - 320b
+ .hword L(\type\()_\taps\()_hv_tbl) - 160b
+ .hword L(\type\()_\taps\()_hv_tbl) - 80b
+ .hword L(\type\()_\taps\()_hv_tbl) - 40b
+ .hword L(\type\()_\taps\()_hv_tbl) - 20b
.hword 0
endfunc
+.endm
+.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
function \type\()_bilin_16bpc_neon, export=1
.ifc \bdmax, w8
ldr w8, [sp]
@@ -3236,8 +3363,34 @@ L(\type\()_bilin_hv_tbl):
endfunc
.endm
-filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
-filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+make_8tap_fn put, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn put, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn put, sharp, SHARP, SHARP, 8tap
+make_8tap_fn put, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn put, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
+
+make_8tap_fn put, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn put, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn put, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn put, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
+filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+
+make_8tap_fn prep, regular_sharp, REGULAR, SHARP, 8tap
+make_8tap_fn prep, smooth_sharp, SMOOTH, SHARP, 8tap
+make_8tap_fn prep, sharp, SHARP, SHARP, 8tap
+make_8tap_fn prep, sharp_regular, SHARP, REGULAR, 8tap
+make_8tap_fn prep, sharp_smooth, SHARP, SMOOTH, 8tap
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap
+
+make_8tap_fn prep, regular, REGULAR, REGULAR, 6tap
+make_8tap_fn prep, regular_smooth, REGULAR, SMOOTH, 6tap
+make_8tap_fn prep, smooth, SMOOTH, SMOOTH, 6tap
+make_8tap_fn prep, smooth_regular, SMOOTH, REGULAR, 6tap
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap
+filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+
.macro load_filter_row dst, src, inc
asr w13, \src, #10
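The 16 bpc file gets the same treatment. The new smull_smlal_6tap/smull2_smlal2_6tap macros take the same eight source registers as the 8tap versions but only accumulate taps 1-6, and the vertical/hv loops keep six rows of history instead of eight (the mov v16/v17 shuffle is guarded by `.ifc \taps, 8tap` and the source pointer starts one row later for 6tap). Per lane the macro pair reduces to something like this sketch (illustrative names, not dav1d API):

```c
#include <stdint.h>

/* Per-lane equivalent of smull_smlal_6tap vs smull_smlal_8tap: s[] holds
 * the eight vertical source samples, f[] the filter taps. The 6tap form
 * ignores s[0] and s[7] because f[0] == f[7] == 0 for regular/smooth
 * filters; accumulation is widened to 32 bit as in the .4s instructions. */
static int32_t vfilter_lane_sketch(const int16_t s[8], const int16_t f[8],
                                   int taps)
{
    int32_t d = 0;
    const int first = (taps == 6) ? 1 : 0;
    const int last  = (taps == 6) ? 6 : 7;
    for (int k = first; k <= last; k++)
        d += (int32_t)s[k] * f[k];
    return d;
}
```

Keeping the unused s0/s7 arguments lets every smull_smlal_\taps call site stay identical between the two instantiations.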
diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S
index 3a6cf900a9..7bef9243fb 100644
--- a/third_party/dav1d/src/arm/64/msac.S
+++ b/third_party/dav1d/src/arm/64/msac.S
@@ -208,60 +208,66 @@ L(renorm):
sub w4, w4, w3 // rng = u - v
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
- mvn x7, x7 // ~dif
- add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+ sub x7, x7, x3, lsl #48 // dif - (v << 48)
L(renorm2):
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
- lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
- mvn x7, x7 // ~dif
- b.hs 9f
+ b.hs 4f
// refill
ldp x3, x4, [x0] // BUF_POS, BUF_END
add x5, x3, #8
- cmp x5, x4
- b.gt 2f
-
- ldr x3, [x3] // next_bits
- add w8, w6, #23 // shift_bits = cnt + 23
- add w6, w6, #16 // cnt += 16
- rev x3, x3 // next_bits = bswap(next_bits)
- sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
- and w8, w8, #24 // shift_bits &= 24
- lsr x3, x3, x8 // next_bits >>= shift_bits
- sub w8, w8, w6 // shift_bits -= 16 + cnt
- str x5, [x0, #BUF_POS]
- lsl x3, x3, x8 // next_bits <<= shift_bits
- mov w4, #48
- sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
- eor x7, x7, x3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- mov w14, #40
- sub w5, w14, w6 // c = 40 - cnt
-3:
- cmp x3, x4
- b.ge 4f
- ldrb w8, [x3], #1
- lsl x8, x8, x5
- eor x7, x7, x8
- subs w5, w5, #8
- b.ge 3b
-
-4: // refill_eob_end
+ subs x5, x5, x4
+ b.hi 6f
+
+ ldr x8, [x3] // next_bits
+ add w4, w6, #-48 // shift_bits = cnt + 16 (- 64)
+ mvn x8, x8
+ neg w5, w4
+ rev x8, x8 // next_bits = bswap(next_bits)
+ lsr w5, w5, #3 // num_bytes_read
+ lsr x8, x8, x4 // next_bits >>= (shift_bits & 63)
+
+2: // refill_end
+ add x3, x3, x5
+ add w6, w6, w5, lsl #3 // cnt += num_bits_read
str x3, [x0, #BUF_POS]
- sub w6, w14, w5 // cnt = 40 - c
-9:
+3: // refill_end2
+ orr x7, x7, x8 // dif |= next_bits
+
+4: // end
str w6, [x0, #CNT]
str x7, [x0, #DIF]
mov w0, w15
add sp, sp, #48
ret
+
+5: // pad_with_ones
+ add w8, w6, #-16
+ ror x8, x8, x8
+ b 3b
+
+6: // refill_eob
+ cmp x3, x4
+ b.hs 5b
+
+ ldr x8, [x4, #-8]
+ lsl w5, w5, #3
+ lsr x8, x8, x5
+ add w5, w6, #-48
+ mvn x8, x8
+ sub w4, w4, w3 // num_bytes_left
+ rev x8, x8
+ lsr x8, x8, x5
+ neg w5, w5
+ lsr w5, w5, #3
+ cmp w5, w4
+ csel w5, w5, w4, lo // num_bytes_read
+ b 2b
endfunc
function msac_decode_symbol_adapt8_neon, export=1
@@ -334,54 +340,37 @@ function msac_decode_hi_tok_neon, export=1
sub w4, w4, w3 // rng = u - v
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
- mvn x7, x7 // ~dif
- add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+ sub x7, x7, x3, lsl #48 // dif - (v << 48)
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
- lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
dup v3.4h, w4
- mvn x7, x7 // ~dif
- b.hs 9f
+ b.hs 5f
// refill
ldp x3, x4, [x0] // BUF_POS, BUF_END
add x5, x3, #8
- cmp x5, x4
- b.gt 2f
-
- ldr x3, [x3] // next_bits
- add w8, w6, #23 // shift_bits = cnt + 23
- add w6, w6, #16 // cnt += 16
- rev x3, x3 // next_bits = bswap(next_bits)
- sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
- and w8, w8, #24 // shift_bits &= 24
- lsr x3, x3, x8 // next_bits >>= shift_bits
- sub w8, w8, w6 // shift_bits -= 16 + cnt
- str x5, [x0, #BUF_POS]
- lsl x3, x3, x8 // next_bits <<= shift_bits
- mov w4, #48
- sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
- eor x7, x7, x3 // dif ^= next_bits
- b 9f
-
-2: // refill_eob
- mov w14, #40
- sub w5, w14, w6 // c = 40 - cnt
-3:
- cmp x3, x4
- b.ge 4f
- ldrb w8, [x3], #1
- lsl x8, x8, x5
- eor x7, x7, x8
- subs w5, w5, #8
- b.ge 3b
-
-4: // refill_eob_end
+ subs x5, x5, x4
+ b.hi 7f
+
+ ldr x8, [x3] // next_bits
+ add w4, w6, #-48 // shift_bits = cnt + 16 (- 64)
+ mvn x8, x8
+ neg w5, w4
+ rev x8, x8 // next_bits = bswap(next_bits)
+ lsr w5, w5, #3 // num_bytes_read
+ lsr x8, x8, x4 // next_bits >>= (shift_bits & 63)
+
+3: // refill_end
+ add x3, x3, x5
+ add w6, w6, w5, lsl #3 // cnt += num_bits_read
str x3, [x0, #BUF_POS]
- sub w6, w14, w5 // cnt = 40 - c
-9:
+4: // refill_end2
+ orr x7, x7, x8 // dif |= next_bits
+
+5: // end
lsl w15, w15, #1
sub w15, w15, #5
lsr x12, x7, #48
@@ -394,6 +383,29 @@ function msac_decode_hi_tok_neon, export=1
str x7, [x0, #DIF]
lsr w0, w13, #1
ret
+
+6: // pad_with_ones
+ add w8, w6, #-16
+ ror x8, x8, x8
+ b 4b
+
+7: // refill_eob
+ cmp x3, x4
+ b.hs 6b
+
+ ldr x8, [x4, #-8]
+ lsl w5, w5, #3
+ lsr x8, x8, x5
+ add w5, w6, #-48
+ mvn x8, x8
+ sub w4, w4, w3 // num_bytes_left
+ rev x8, x8
+ lsr x8, x8, x5
+ neg w5, w5
+ lsr w5, w5, #3
+ cmp w5, w4
+ csel w5, w5, w4, lo // num_bytes_read
+ b 3b
endfunc
function msac_decode_bool_equi_neon, export=1
@@ -410,7 +422,6 @@ function msac_decode_bool_equi_neon, export=1
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
clz w5, w4 // clz(rng)
- mvn x7, x7 // ~dif
eor w5, w5, #16 // d = clz(rng) ^ 16
b L(renorm2)
endfunc
@@ -431,7 +442,6 @@ function msac_decode_bool_neon, export=1
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
clz w5, w4 // clz(rng)
- mvn x7, x7 // ~dif
eor w5, w5, #16 // d = clz(rng) ^ 16
b L(renorm2)
endfunc
@@ -455,7 +465,6 @@ function msac_decode_bool_adapt_neon, export=1
ldr w10, [x0, #ALLOW_UPDATE_CDF]
clz w5, w4 // clz(rng)
- mvn x7, x7 // ~dif
eor w5, w5, #16 // d = clz(rng) ^ 16
cbz w10, L(renorm2)
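The msac rewrite drops the old ~dif bookkeeping: renormalization subtracts (v << 48) from dif directly, and the refill loads up to eight bytes at once, inverts and byte-swaps them, and ORs them in below the bits still present in dif. Near the end of the buffer the read is clamped to the remaining bytes, and once the buffer is exhausted the window is padded with one bits (the pad_with_ones label); the LoongArch port further down gets the same restructuring. A rough C mirror of the common refill path, using the BUF_POS/BUF_END/DIF/CNT fields the assembly addresses by offset; this is an illustration, not the actual src/msac.c:

```c
#include <stdint.h>
#include <string.h>

typedef struct {                 /* stand-in for the real MsacContext */
    const uint8_t *buf_pos, *buf_end;
    uint64_t dif;
    int cnt;                     /* has gone negative when a refill is needed */
} MsacSketch;

static void msac_refill_sketch(MsacSketch *s) {
    if (s->buf_pos + 8 <= s->buf_end) {
        uint64_t next;
        memcpy(&next, s->buf_pos, 8);        /* ldr x8, [x3] (little endian) */
        next = __builtin_bswap64(~next);     /* mvn + rev (gcc/clang builtin) */
        next >>= (s->cnt + 16) & 63;         /* lsr x8, x8, shift_bits */
        const int num_bytes = (48 - s->cnt) >> 3;
        s->buf_pos += num_bytes;             /* add x3, x3, x5 */
        s->cnt += num_bytes * 8;             /* cnt += num_bits_read */
        s->dif |= next;                      /* orr x7, x7, x8 */
    } else {
        /* refill_eob / pad_with_ones: clamp the read to the bytes left in
         * the buffer, or OR in all-one bits once it is exhausted; omitted
         * here for brevity. */
    }
}
```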
diff --git a/third_party/dav1d/src/arm/64/util.S b/third_party/dav1d/src/arm/64/util.S
index 9013fd4b1e..1b3f319ce5 100644
--- a/third_party/dav1d/src/arm/64/util.S
+++ b/third_party/dav1d/src/arm/64/util.S
@@ -32,6 +32,10 @@
#include "config.h"
#include "src/arm/asm.S"
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
.macro movrel rd, val, offset=0
#if defined(__APPLE__)
.if \offset < 0
@@ -51,6 +55,10 @@
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
.endif
+#elif __has_feature(hwaddress_sanitizer)
+ adrp \rd, :pg_hi21_nc:\val+(\offset)
+ movk \rd, #:prel_g3:\val+0x100000000
+ add \rd, \rd, :lo12:\val+(\offset)
#elif defined(PIC)
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
@@ -149,6 +157,35 @@
trn2 \r7\().2d, \t9\().2d, \r7\().2d
.endm
+.macro transpose_8x8h_mov r0, r1, r2, r3, r4, r5, r6, r7, t8, t9, o0, o1, o2, o3, o4, o5, o6, o7
+ trn1 \t8\().8h, \r0\().8h, \r1\().8h
+ trn2 \t9\().8h, \r0\().8h, \r1\().8h
+ trn1 \r1\().8h, \r2\().8h, \r3\().8h
+ trn2 \r3\().8h, \r2\().8h, \r3\().8h
+ trn1 \r0\().8h, \r4\().8h, \r5\().8h
+ trn2 \r5\().8h, \r4\().8h, \r5\().8h
+ trn1 \r2\().8h, \r6\().8h, \r7\().8h
+ trn2 \r7\().8h, \r6\().8h, \r7\().8h
+
+ trn1 \r4\().4s, \r0\().4s, \r2\().4s
+ trn2 \r2\().4s, \r0\().4s, \r2\().4s
+ trn1 \r6\().4s, \r5\().4s, \r7\().4s
+ trn2 \r7\().4s, \r5\().4s, \r7\().4s
+ trn1 \r5\().4s, \t9\().4s, \r3\().4s
+ trn2 \t9\().4s, \t9\().4s, \r3\().4s
+ trn1 \r3\().4s, \t8\().4s, \r1\().4s
+ trn2 \t8\().4s, \t8\().4s, \r1\().4s
+
+ trn1 \o0\().2d, \r3\().2d, \r4\().2d
+ trn2 \o4\().2d, \r3\().2d, \r4\().2d
+ trn1 \o1\().2d, \r5\().2d, \r6\().2d
+ trn2 \o5\().2d, \r5\().2d, \r6\().2d
+ trn2 \o6\().2d, \t8\().2d, \r2\().2d
+ trn1 \o2\().2d, \t8\().2d, \r2\().2d
+ trn1 \o3\().2d, \t9\().2d, \r7\().2d
+ trn2 \o7\().2d, \t9\().2d, \r7\().2d
+.endm
+
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
trn1 \t8\().16b, \r0\().16b, \r1\().16b
trn2 \t9\().16b, \r0\().16b, \r1\().16b
@@ -226,4 +263,16 @@
trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm
+.macro transpose_4x8h_mov r0, r1, r2, r3, t4, t5, t6, t7, o0, o1, o2, o3
+ trn1 \t4\().8h, \r0\().8h, \r1\().8h
+ trn2 \t5\().8h, \r0\().8h, \r1\().8h
+ trn1 \t6\().8h, \r2\().8h, \r3\().8h
+ trn2 \t7\().8h, \r2\().8h, \r3\().8h
+
+ trn1 \o0\().4s, \t4\().4s, \t6\().4s
+ trn2 \o2\().4s, \t4\().4s, \t6\().4s
+ trn1 \o1\().4s, \t5\().4s, \t7\().4s
+ trn2 \o3\().4s, \t5\().4s, \t7\().4s
+.endm
+
#endif /* DAV1D_SRC_ARM_64_UTIL_S */
diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S
index dc50415f1f..fed73b3048 100644
--- a/third_party/dav1d/src/arm/asm.S
+++ b/third_party/dav1d/src/arm/asm.S
@@ -34,6 +34,50 @@
#define x18 do_not_use_x18
#define w18 do_not_use_w18
+#if HAVE_AS_ARCH_DIRECTIVE
+ .arch AS_ARCH_LEVEL
+#endif
+
+#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
+#define ENABLE_DOTPROD .arch_extension dotprod
+#define DISABLE_DOTPROD .arch_extension nodotprod
+#else
+#define ENABLE_DOTPROD
+#define DISABLE_DOTPROD
+#endif
+#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
+#define ENABLE_I8MM .arch_extension i8mm
+#define DISABLE_I8MM .arch_extension noi8mm
+#else
+#define ENABLE_I8MM
+#define DISABLE_I8MM
+#endif
+#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
+#define ENABLE_SVE .arch_extension sve
+#define DISABLE_SVE .arch_extension nosve
+#else
+#define ENABLE_SVE
+#define DISABLE_SVE
+#endif
+#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE
+#define ENABLE_SVE2 .arch_extension sve2
+#define DISABLE_SVE2 .arch_extension nosve2
+#else
+#define ENABLE_SVE2
+#define DISABLE_SVE2
+#endif
+
+/* If we do support the .arch_extension directives, disable support for all
+ * the extensions that we may use, in case they were implicitly enabled by
+ * the .arch level. This makes it clear if we try to assemble an instruction
+ * from an unintended extension set; we only allow assembling such instructions
+ * within regions where we explicitly enable those extensions. */
+DISABLE_DOTPROD
+DISABLE_I8MM
+DISABLE_SVE
+DISABLE_SVE2
+
+
/* Support macros for
* - Armv8.3-A Pointer Authentication and
* - Armv8.5-A Branch Target Identification
diff --git a/third_party/dav1d/src/arm/cpu.c b/third_party/dav1d/src/arm/cpu.c
index b7a0d3adbc..d9b1751a6a 100644
--- a/third_party/dav1d/src/arm/cpu.c
+++ b/third_party/dav1d/src/arm/cpu.c
@@ -31,22 +31,95 @@
#include "src/arm/cpu.h"
-#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
-// NEON is always available; runtime tests are not needed.
-#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+#if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
#include <sys/auxv.h>
+#if ARCH_AARCH64
+
+#define HWCAP_AARCH64_ASIMDDP (1 << 20)
+#define HWCAP_AARCH64_SVE (1 << 22)
+#define HWCAP2_AARCH64_SVE2 (1 << 1)
+#define HWCAP2_AARCH64_I8MM (1 << 13)
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+#ifdef HAVE_GETAUXVAL
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+ unsigned long hw_cap2 = getauxval(AT_HWCAP2);
+#else
+ unsigned long hw_cap = 0;
+ unsigned long hw_cap2 = 0;
+ elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+ elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2));
+#endif
+
+ unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+ flags |= (hw_cap & HWCAP_AARCH64_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
+ flags |= (hw_cap2 & HWCAP2_AARCH64_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
+ flags |= (hw_cap & HWCAP_AARCH64_SVE) ? DAV1D_ARM_CPU_FLAG_SVE : 0;
+ flags |= (hw_cap2 & HWCAP2_AARCH64_SVE2) ? DAV1D_ARM_CPU_FLAG_SVE2 : 0;
+ return flags;
+}
+#else /* !ARCH_AARCH64 */
+
#ifndef HWCAP_ARM_NEON
-#define HWCAP_ARM_NEON (1 << 12)
+#define HWCAP_ARM_NEON (1 << 12)
#endif
-#define NEON_HWCAP HWCAP_ARM_NEON
+#define HWCAP_ARM_ASIMDDP (1 << 24)
+#define HWCAP_ARM_I8MM (1 << 27)
-#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
-#include <sys/auxv.h>
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+#ifdef HAVE_GETAUXVAL
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+#else
+ unsigned long hw_cap = 0;
+ elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+#endif
+
+ unsigned flags = (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+ flags |= (hw_cap & HWCAP_ARM_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
+ flags |= (hw_cap & HWCAP_ARM_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
+ return flags;
+}
+#endif /* ARCH_AARCH64 */
+
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+
+static int have_feature(const char *feature) {
+ int supported = 0;
+ size_t size = sizeof(supported);
+ if (sysctlbyname(feature, &supported, &size, NULL, 0) != 0) {
+ return 0;
+ }
+ return supported;
+}
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+ unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+ if (have_feature("hw.optional.arm.FEAT_DotProd"))
+ flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
+ if (have_feature("hw.optional.arm.FEAT_I8MM"))
+ flags |= DAV1D_ARM_CPU_FLAG_I8MM;
+ /* No SVE and SVE2 feature detection available on Apple platforms. */
+ return flags;
+}
+
+#elif defined(_WIN32)
+#include <windows.h>
-#define NEON_HWCAP HWCAP_NEON
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+ unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
+ if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE))
+ flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
+#endif
+ /* No I8MM or SVE feature detection available on Windows at the time of
+ * writing. */
+ return flags;
+}
#elif defined(__ANDROID__)
+#include <ctype.h>
#include <stdio.h>
#include <string.h>
@@ -58,18 +131,25 @@ static unsigned parse_proc_cpuinfo(const char *flag) {
char line_buffer[120];
const char *line;
+ size_t flaglen = strlen(flag);
while ((line = fgets(line_buffer, sizeof(line_buffer), file))) {
- if (strstr(line, flag)) {
- fclose(file);
- return 1;
+ // check all occurrences as whole words
+ const char *found = line;
+ while ((found = strstr(found, flag))) {
+ if ((found == line_buffer || !isgraph(found[-1])) &&
+ (isspace(found[flaglen]) || feof(file))) {
+ fclose(file);
+ return 1;
+ }
+ found += flaglen;
}
// if line is incomplete seek back to avoid splitting the search
// string into two buffers
- if (!strchr(line, '\n') && strlen(line) > strlen(flag)) {
+ if (!strchr(line, '\n') && strlen(line) > flaglen) {
// use fseek since the 64 bit fseeko is only available since
// Android API level 24 and meson defines _FILE_OFFSET_BITS
// by default 64
- if (fseek(file, -strlen(flag), SEEK_CUR))
+ if (fseek(file, -flaglen, SEEK_CUR))
break;
}
}
@@ -78,22 +158,23 @@ static unsigned parse_proc_cpuinfo(const char *flag) {
return 0;
}
-#endif
COLD unsigned dav1d_get_cpu_flags_arm(void) {
- unsigned flags = 0;
-#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
- flags |= DAV1D_ARM_CPU_FLAG_NEON;
-#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
- unsigned long hw_cap = getauxval(AT_HWCAP);
- flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
-#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
- unsigned long hw_cap = 0;
- elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
- flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
-#elif defined(__ANDROID__)
- flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
-#endif
-
+ unsigned flags = parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+ flags |= parse_proc_cpuinfo("asimd") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+ flags |= parse_proc_cpuinfo("asimddp") ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
+ flags |= parse_proc_cpuinfo("i8mm") ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
+#if ARCH_AARCH64
+ flags |= parse_proc_cpuinfo("sve") ? DAV1D_ARM_CPU_FLAG_SVE : 0;
+ flags |= parse_proc_cpuinfo("sve2") ? DAV1D_ARM_CPU_FLAG_SVE2 : 0;
+#endif /* ARCH_AARCH64 */
return flags;
}
+
+#else /* Unsupported OS */
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+ return 0;
+}
+
+#endif
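The Android cpuinfo fallback now requires whole-word matches, which matters once the queried flags share prefixes ("sve"/"sve2", "asimd"/"asimddp"). A minimal standalone illustration of the stricter check; has_flag_word is a hypothetical helper mirroring the logic added to parse_proc_cpuinfo, not a dav1d function:

```c
#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* A flag only counts if it is delimited by whitespace (or the start/end
 * of the line), as in the whole-word check added above. */
static int has_flag_word(const char *line, const char *flag) {
    const size_t len = strlen(flag);
    for (const char *p = line; (p = strstr(p, flag)); p += len)
        if ((p == line || !isgraph((unsigned char)p[-1])) &&
            (p[len] == '\0' || isspace((unsigned char)p[len])))
            return 1;
    return 0;
}

int main(void) {
    const char *features = "Features : fp asimd asimddp sve2\n";
    /* A plain strstr() reports "sve" as present because "sve2" contains it;
     * the whole-word check does not. */
    printf("strstr sve: %d  word sve: %d  word sve2: %d  word asimd: %d\n",
           strstr(features, "sve") != NULL,
           has_flag_word(features, "sve"),
           has_flag_word(features, "sve2"),
           has_flag_word(features, "asimd"));
    return 0;
}
```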
diff --git a/third_party/dav1d/src/arm/cpu.h b/third_party/dav1d/src/arm/cpu.h
index 8c10a1b6b0..de9bde6ccf 100644
--- a/third_party/dav1d/src/arm/cpu.h
+++ b/third_party/dav1d/src/arm/cpu.h
@@ -30,6 +30,10 @@
enum CpuFlags {
DAV1D_ARM_CPU_FLAG_NEON = 1 << 0,
+ DAV1D_ARM_CPU_FLAG_DOTPROD = 1 << 1,
+ DAV1D_ARM_CPU_FLAG_I8MM = 1 << 2,
+ DAV1D_ARM_CPU_FLAG_SVE = 1 << 3,
+ DAV1D_ARM_CPU_FLAG_SVE2 = 1 << 4,
};
unsigned dav1d_get_cpu_flags_arm(void);
diff --git a/third_party/dav1d/src/arm/itx.h b/third_party/dav1d/src/arm/itx.h
index 2ecd086b3b..17234e027a 100644
--- a/third_party/dav1d/src/arm/itx.h
+++ b/third_party/dav1d/src/arm/itx.h
@@ -117,9 +117,11 @@ static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+ assign_itx_fn( , 4, 4, wht_wht, WHT_WHT, neon);
+
if (BITDEPTH == 16 && bpc != 10) return;
- assign_itx17_fn( , 4, 4, neon);
+ assign_itx16_fn( , 4, 4, neon);
assign_itx16_fn(R, 4, 8, neon);
assign_itx16_fn(R, 4, 16, neon);
assign_itx16_fn(R, 8, 4, neon);
diff --git a/third_party/dav1d/src/arm/msac.h b/third_party/dav1d/src/arm/msac.h
index 9db0bf86ae..6eee0da424 100644
--- a/third_party/dav1d/src/arm/msac.h
+++ b/third_party/dav1d/src/arm/msac.h
@@ -39,7 +39,7 @@ unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
-#if ARCH_AARCH64 || defined(__ARM_NEON)
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
diff --git a/third_party/dav1d/src/cpu.h b/third_party/dav1d/src/cpu.h
index c9009c7778..d20c5f0168 100644
--- a/third_party/dav1d/src/cpu.h
+++ b/third_party/dav1d/src/cpu.h
@@ -64,6 +64,20 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
flags |= DAV1D_ARM_CPU_FLAG_NEON;
#endif
+#ifdef __ARM_FEATURE_DOTPROD
+ flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
+#endif
+#ifdef __ARM_FEATURE_MATMUL_INT8
+ flags |= DAV1D_ARM_CPU_FLAG_I8MM;
+#endif
+#if ARCH_AARCH64
+#ifdef __ARM_FEATURE_SVE
+ flags |= DAV1D_ARM_CPU_FLAG_SVE;
+#endif
+#ifdef __ARM_FEATURE_SVE2
+ flags |= DAV1D_ARM_CPU_FLAG_SVE2;
+#endif
+#endif /* ARCH_AARCH64 */
#elif ARCH_PPC64LE
#if defined(__VSX__)
flags |= DAV1D_PPC_CPU_FLAG_VSX;
diff --git a/third_party/dav1d/src/ext/x86/x86inc.asm b/third_party/dav1d/src/ext/x86/x86inc.asm
index 68b1f74f4b..d2bd758e67 100644
--- a/third_party/dav1d/src/ext/x86/x86inc.asm
+++ b/third_party/dav1d/src/ext/x86/x86inc.asm
@@ -1,7 +1,7 @@
;*****************************************************************************
;* x86inc.asm: x86 abstraction layer
;*****************************************************************************
-;* Copyright (C) 2005-2022 x264 project
+;* Copyright (C) 2005-2024 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Henrik Gramner <henrik@gramner.com>
@@ -104,7 +104,7 @@
%endif
%define HAVE_PRIVATE_EXTERN 1
-%ifdef __NASM_VER__
+%ifdef __NASM_VERSION_ID__
%use smartalign
%if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
%define HAVE_PRIVATE_EXTERN 0
@@ -386,7 +386,24 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%endif
%endmacro
-%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only)
+%macro RESET_STACK_STATE 0
+ %ifidn rstk, rsp
+ %assign stack_offset stack_offset - stack_size_padded
+ %else
+ %xdefine rstk rsp
+ %endif
+ %assign stack_size 0
+ %assign stack_size_padded 0
+ %assign xmm_regs_used 0
+%endmacro
+
+%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs
+ RESET_STACK_STATE
+ %ifnum %2
+ %if mmsize != 8
+ %assign xmm_regs_used %2
+ %endif
+ %endif
%ifnum %1
%if %1 != 0
%assign %%pad 0
@@ -396,11 +413,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%endif
%if WIN64
%assign %%pad %%pad + 32 ; shadow space
- %if mmsize != 8
- %assign xmm_regs_used %2
- %if xmm_regs_used > 8
- %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
- %endif
+ %if xmm_regs_used > 8
+ %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
%endif
%endif
%if required_stack_alignment <= STACK_ALIGNMENT
@@ -496,35 +510,62 @@ DECLARE_REG 14, R13, 120
%endif
%endmacro
-%macro WIN64_PUSH_XMM 0
- ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
- %if xmm_regs_used > 6 + high_mm_regs
- movaps [rstk + stack_offset + 8], xmm6
- %endif
- %if xmm_regs_used > 7 + high_mm_regs
- movaps [rstk + stack_offset + 24], xmm7
- %endif
- %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
- %if %%xmm_regs_on_stack > 0
- %assign %%i 8
- %rep %%xmm_regs_on_stack
- movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
- %assign %%i %%i+1
- %endrep
+; Push XMM registers to the stack. If no argument is specified, all used
+; registers will be pushed; otherwise only previously unpushed registers are pushed.
+%macro WIN64_PUSH_XMM 0-2 ; new_xmm_regs_used, xmm_regs_pushed
+ %if mmsize != 8
+ %if %0 == 2
+ %assign %%pushed %2
+ %assign xmm_regs_used %1
+ %elif %0 == 1
+ %assign %%pushed xmm_regs_used
+ %assign xmm_regs_used %1
+ %else
+ %assign %%pushed 0
+ %endif
+ ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+ %if %%pushed <= 6 + high_mm_regs && xmm_regs_used > 6 + high_mm_regs
+ movaps [rstk + stack_offset + 8], xmm6
+ %endif
+ %if %%pushed <= 7 + high_mm_regs && xmm_regs_used > 7 + high_mm_regs
+ movaps [rstk + stack_offset + 24], xmm7
+ %endif
+ %assign %%pushed %%pushed - high_mm_regs - 8
+ %if %%pushed < 0
+ %assign %%pushed 0
+ %endif
+ %assign %%regs_to_push xmm_regs_used - %%pushed - high_mm_regs - 8
+ %if %%regs_to_push > 0
+ ASSERT (%%regs_to_push + %%pushed) * 16 <= stack_size_padded - stack_size - 32
+ %assign %%i %%pushed + 8
+ %rep %%regs_to_push
+ movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
%endif
%endmacro
-%macro WIN64_SPILL_XMM 1
- %assign xmm_regs_used %1
- ASSERT xmm_regs_used <= 16 + high_mm_regs
- %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
- %if %%xmm_regs_on_stack > 0
- ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
- %assign %%pad %%xmm_regs_on_stack*16 + 32
- %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
- SUB rsp, stack_size_padded
+; Allocate stack space for XMM registers and push all, or a subset, of those
+%macro WIN64_SPILL_XMM 1-2 ; xmm_regs_used, xmm_regs_reserved
+ RESET_STACK_STATE
+ %if mmsize != 8
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16 + high_mm_regs
+ %if %0 == 2
+ ASSERT %2 >= %1
+ %assign %%xmm_regs_on_stack %2 - high_mm_regs - 8
+ %else
+ %assign %%xmm_regs_on_stack %1 - high_mm_regs - 8
+ %endif
+ %if %%xmm_regs_on_stack > 0
+ ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
+ %assign %%pad %%xmm_regs_on_stack*16 + 32
+ %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+ SUB rsp, stack_size_padded
+ %endif
+ WIN64_PUSH_XMM
%endif
- WIN64_PUSH_XMM
%endmacro
%macro WIN64_RESTORE_XMM_INTERNAL 0
@@ -555,9 +596,7 @@ DECLARE_REG 14, R13, 120
%macro WIN64_RESTORE_XMM 0
WIN64_RESTORE_XMM_INTERNAL
- %assign stack_offset (stack_offset-stack_size_padded)
- %assign stack_size_padded 0
- %assign xmm_regs_used 0
+ RESET_STACK_STATE
%endmacro
%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
@@ -592,12 +631,11 @@ DECLARE_REG 14, R13, 72
%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
- %assign xmm_regs_used %3
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
PUSH_IF_USED 9, 10, 11, 12, 13, 14
- ALLOC_STACK %4
+ ALLOC_STACK %4, %3
LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
%if %0 > 4
%ifnum %4
@@ -661,7 +699,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
SETUP_STACK_POINTER %4
ASSERT regs_used <= 7
PUSH_IF_USED 3, 4, 5, 6
- ALLOC_STACK %4
+ ALLOC_STACK %4, %3
LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
%if %0 > 4
%ifnum %4
@@ -694,13 +732,19 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%endif ;======================================================================
%if WIN64 == 0
- %macro WIN64_SPILL_XMM 1
- %assign xmm_regs_used %1
+ %macro WIN64_SPILL_XMM 1-2
+ RESET_STACK_STATE
+ %if mmsize != 8
+ %assign xmm_regs_used %1
+ %endif
%endmacro
%macro WIN64_RESTORE_XMM 0
- %assign xmm_regs_used 0
+ RESET_STACK_STATE
%endmacro
- %macro WIN64_PUSH_XMM 0
+ %macro WIN64_PUSH_XMM 0-2
+ %if mmsize != 8 && %0 >= 1
+ %assign xmm_regs_used %1
+ %endif
%endmacro
%endif
@@ -845,9 +889,26 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%1: %2
%endmacro
-; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
%if FORMAT_ELF
+ ; The GNU linker assumes the stack is executable by default.
[SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+
+ %ifdef __NASM_VERSION_ID__
+ %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03
+ %if ARCH_X86_64
+ ; Control-flow Enforcement Technology (CET) properties.
+ [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize]
+ dd 0x00000004 ; n_namesz
+ dd gprsize + 8 ; n_descsz
+ dd 0x00000005 ; n_type = NT_GNU_PROPERTY_TYPE_0
+ db "GNU",0 ; n_name
+ dd 0xc0000002 ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND
+ dd 0x00000004 ; pr_datasz
+ dd 0x00000002 ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK
+ dd 0x00000000 ; pr_padding
+ %endif
+ %endif
+ %endif
%endif
; Tell debuggers how large the function was.
@@ -883,21 +944,22 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign cpuflags_sse4 (1<<10) | cpuflags_ssse3
%assign cpuflags_sse42 (1<<11) | cpuflags_sse4
%assign cpuflags_aesni (1<<12) | cpuflags_sse42
-%assign cpuflags_gfni (1<<13) | cpuflags_sse42
-%assign cpuflags_avx (1<<14) | cpuflags_sse42
-%assign cpuflags_xop (1<<15) | cpuflags_avx
-%assign cpuflags_fma4 (1<<16) | cpuflags_avx
-%assign cpuflags_fma3 (1<<17) | cpuflags_avx
-%assign cpuflags_bmi1 (1<<18) | cpuflags_avx|cpuflags_lzcnt
-%assign cpuflags_bmi2 (1<<19) | cpuflags_bmi1
-%assign cpuflags_avx2 (1<<20) | cpuflags_fma3|cpuflags_bmi2
-%assign cpuflags_avx512 (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL
-%assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
-
-%assign cpuflags_cache32 (1<<23)
-%assign cpuflags_cache64 (1<<24)
-%assign cpuflags_aligned (1<<25) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<26)
+%assign cpuflags_clmul (1<<13) | cpuflags_sse42
+%assign cpuflags_gfni (1<<14) | cpuflags_aesni|cpuflags_clmul
+%assign cpuflags_avx (1<<15) | cpuflags_sse42
+%assign cpuflags_xop (1<<16) | cpuflags_avx
+%assign cpuflags_fma4 (1<<17) | cpuflags_avx
+%assign cpuflags_fma3 (1<<18) | cpuflags_avx
+%assign cpuflags_bmi1 (1<<19) | cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<20) | cpuflags_bmi1
+%assign cpuflags_avx2 (1<<21) | cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512 (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL
+%assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
+
+%assign cpuflags_cache32 (1<<24)
+%assign cpuflags_cache64 (1<<25)
+%assign cpuflags_aligned (1<<26) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<27)
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
@@ -939,13 +1001,13 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%endif
%if ARCH_X86_64 || cpuflag(sse2)
- %ifdef __NASM_VER__
+ %ifdef __NASM_VERSION_ID__
ALIGNMODE p6
%else
CPU amdnop
%endif
%else
- %ifdef __NASM_VER__
+ %ifdef __NASM_VERSION_ID__
ALIGNMODE nop
%else
CPU basicnop
@@ -1035,6 +1097,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%if WIN64
AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
%endif
+ %xdefine bcstw 1to8
%xdefine bcstd 1to4
%xdefine bcstq 1to2
%endmacro
@@ -1050,6 +1113,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
INIT_CPUFLAGS %1
DEFINE_MMREGS ymm
AVX512_MM_PERMUTATION
+ %xdefine bcstw 1to16
%xdefine bcstd 1to8
%xdefine bcstq 1to4
%endmacro
@@ -1065,6 +1129,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
INIT_CPUFLAGS %1
DEFINE_MMREGS zmm
AVX512_MM_PERMUTATION
+ %xdefine bcstw 1to32
%xdefine bcstd 1to16
%xdefine bcstq 1to8
%endmacro
@@ -1607,11 +1672,11 @@ AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR pblendw, sse4, 0, 1, 0
-AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
-AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
-AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
-AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
-AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0
+AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0
+AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0
+AVX_INSTR pclmulqdq, clmul, 0, 1, 0
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
@@ -1766,6 +1831,7 @@ GPR_INSTR blsi, bmi1
GPR_INSTR blsmsk, bmi1
GPR_INSTR blsr, bmi1
GPR_INSTR bzhi, bmi2
+GPR_INSTR crc32, sse42
GPR_INSTR mulx, bmi2
GPR_INSTR pdep, bmi2
GPR_INSTR pext, bmi2
diff --git a/third_party/dav1d/src/itx_1d.c b/third_party/dav1d/src/itx_1d.c
index ca14fc8c41..8f75c653af 100644
--- a/third_party/dav1d/src/itx_1d.c
+++ b/third_party/dav1d/src/itx_1d.c
@@ -1016,6 +1016,10 @@ void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
c[stride * i] *= 4;
}
+#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
+ ARCH_AARCH64 || \
+ (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
+))
void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {
assert(stride > 0);
const int in0 = c[0 * stride], in1 = c[1 * stride];
@@ -1032,3 +1036,4 @@ void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {
c[2 * stride] = t1;
c[3 * stride] = t2 + t1;
}
+#endif
diff --git a/third_party/dav1d/src/itx_tmpl.c b/third_party/dav1d/src/itx_tmpl.c
index 8ff245a0de..a226223c96 100644
--- a/third_party/dav1d/src/itx_tmpl.c
+++ b/third_party/dav1d/src/itx_tmpl.c
@@ -159,6 +159,10 @@ inv_txfm_fn64(64, 16, 2)
inv_txfm_fn64(64, 32, 1)
inv_txfm_fn64(64, 64, 2)
+#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
+ ARCH_AARCH64 || \
+ (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
+))
static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
coef *const coeff, const int eob
HIGHBD_DECL_SUFFIX)
@@ -179,6 +183,7 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
for (int x = 0; x < 4; x++)
dst[x] = iclip_pixel(dst[x] + *c++);
}
+#endif
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
@@ -236,7 +241,12 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
inv_txfm_add_identity_adst_##w##x##h##_c; \
+#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
+ ARCH_AARCH64 || \
+ (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
+))
c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
+#endif
assign_itx_all_fn84( 4, 4, );
assign_itx_all_fn84( 4, 8, R);
assign_itx_all_fn84( 4, 16, R);
diff --git a/third_party/dav1d/src/loongarch/msac.S b/third_party/dav1d/src/loongarch/msac.S
index c371eba4de..5bf18250a5 100644
--- a/third_party/dav1d/src/loongarch/msac.S
+++ b/third_party/dav1d/src/loongarch/msac.S
@@ -133,55 +133,58 @@ endconst
slli.d t4, t4, 48
vpickve2gr.d t6, vr2, 0
sub.d t6, t6, t4 // dif
- addi.d t6, t6, 1
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
- addi.d t6, t6, -1 // dif
addi.d a5, a0, 28 // cnt
- ld.w t7, a5, 0
- sub.w t7, t7, t4 // cnt-d
+ ld.w t0, a5, 0
sll.w t5, t5, t4
+ sub.w t7, t0, t4 // cnt-d
st.w t5, a4, 0 // store rng
- bge t7, zero, 9f
+ bgeu t0, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
- addi.d t1, a0, 8
- ld.d t1, t1, 0 // buf_end
+ ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
- blt t1, t2, 1f
+ bltu t1, t2, 2f
- ld.d t0, t0, 0 // next_bits
- addi.w t3, t7, 23 // shift_bits = cnt + 23
- addi.w t7, t7, 16 // cnt += 16
- revb.d t0, t0 // next_bits = bswap(next_bits)
- srli.w t4, t3, 3
- sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
- st.d t2, a0, 0
- andi t3, t3, 24 // shift_bits &= 24
- srl.d t0, t0, t3 // next_bits >>= shift_bits
- sub.w t3, t3, t7 // shift_bits -= 16 + cnt
- sll.d t0, t0, t3 // next_bits <<= shift_bits
- li.w t5, 48
- sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
- xor t6, t6, t0 // dif ^= next_bits
- b 9f
+ ld.d t3, t0, 0 // next_bits
+ addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
+ nor t3, t3, t3
+ sub.w t2, zero, t1
+ revb.d t3, t3 // next_bits = bswap(next_bits)
+ srli.w t2, t2, 3 // num_bytes_read
+ srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
+ b 3f
1:
- li.w t4, 40
- sub.w t5, t4, t7 // c = 40 - cnt
+ addi.w t3, t7, -48
+ srl.d t3, t3, t3 // pad with ones
+ b 4f
2:
- bge t0, t1, 3f
- ld.bu t2, t0, 0
- addi.d t0, t0, 1
- sll.d t2, t2, t5
- xor t6, t6, t2
- addi.w t5, t5, -8
- bge t5, zero, 2b
- // refill_eob_end
+ bgeu t0, t1, 1b
+ ld.d t3, t1, -8 // next_bits
+ sub.w t2, t2, t1
+ sub.w t1, t1, t0 // num_bytes_left
+ slli.w t2, t2, 3
+ srl.d t3, t3, t2
+ addi.w t2, t7, -48
+ nor t3, t3, t3
+ sub.w t4, zero, t2
+ revb.d t3, t3
+ srli.w t4, t4, 3
+ srl.d t3, t3, t2
+ sltu t2, t1, t4
+ maskeqz t1, t1, t2
+ masknez t2, t4, t2
+ or t2, t2, t1 // num_bytes_read
3:
- st.d t0, a0, 0 // s->buf_pos = buf_pos
- sub.w t7, t4, t5 // cnt = 40 - c
+ slli.w t1, t2, 3
+ add.d t0, t0, t2
+ add.w t7, t7, t1 // cnt += num_bits_read
+ st.d t0, a0, 0
+4:
+ or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a5, 0 // store cnt
st.d t6, a6, 0 // store dif
@@ -208,7 +211,6 @@ function msac_decode_bool_lsx
srli.w t2, t0, 8 // r >> 8
mul.w t2, t2, a1
ld.w a5, a0, 28 // cnt
- addi.d t1, t1, 1 // dif + 1
srli.w t2, t2, 1
addi.w t2, t2, 4 // v
slli.d t3, t2, 48 // vw
@@ -226,49 +228,53 @@ function msac_decode_bool_lsx
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
- addi.d t6, t6, -1 // dif
- sub.w t7, a5, t4 // cnt-d
sll.w t5, t5, t4
+ sub.w t7, a5, t4 // cnt-d
st.w t5, a0, 24 // store rng
- bge t7, zero, 9f
+ bgeu a5, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
- addi.d t1, a0, 8
- ld.d t1, t1, 0 // buf_end
+ ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
- blt t1, t2, 1f
+ bltu t1, t2, 2f
- ld.d t0, t0, 0 // next_bits
- addi.w t3, t7, 23 // shift_bits = cnt + 23
- addi.w t7, t7, 16 // cnt += 16
- revb.d t0, t0 // next_bits = bswap(next_bits)
- srli.w t4, t3, 3
- sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
- st.d t2, a0, 0
- andi t3, t3, 24 // shift_bits &= 24
- srl.d t0, t0, t3 // next_bits >>= shift_bits
- sub.w t3, t3, t7 // shift_bits -= 16 + cnt
- sll.d t0, t0, t3 // next_bits <<= shift_bits
- li.w t5, 48
- sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
- xor t6, t6, t0 // dif ^= next_bits
- b 9f
+ ld.d t3, t0, 0 // next_bits
+ addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
+ nor t3, t3, t3
+ sub.w t2, zero, t1
+ revb.d t3, t3 // next_bits = bswap(next_bits)
+ srli.w t2, t2, 3 // num_bytes_read
+ srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
+ b 3f
1:
- li.w t4, 40
- sub.w t5, t4, t7 // c = 40 - cnt
+ addi.w t3, t7, -48
+ srl.d t3, t3, t3 // pad with ones
+ b 4f
2:
- bge t0, t1, 3f
- ld.bu t2, t0, 0
- addi.d t0, t0, 1
- sll.d t2, t2, t5
- xor t6, t6, t2
- addi.w t5, t5, -8
- bge t5, zero, 2b
- // refill_eob_end
+ bgeu t0, t1, 1b
+ ld.d t3, t1, -8 // next_bits
+ sub.w t2, t2, t1
+ sub.w t1, t1, t0 // num_bytes_left
+ slli.w t2, t2, 3
+ srl.d t3, t3, t2
+ addi.w t2, t7, -48
+ nor t3, t3, t3
+ sub.w t4, zero, t2
+ revb.d t3, t3
+ srli.w t4, t4, 3
+ srl.d t3, t3, t2
+ sltu t2, t1, t4
+ maskeqz t1, t1, t2
+ masknez t2, t4, t2
+ or t2, t2, t1 // num_bytes_read
3:
- st.d t0, a0, 0 // s->buf_pos = buf_pos
- sub.w t7, t4, t5 // cnt = 40 - c
+ slli.w t1, t2, 3
+ add.d t0, t0, t2
+ add.w t7, t7, t1 // cnt += num_bits_read
+ st.d t0, a0, 0
+4:
+ or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
@@ -313,54 +319,56 @@ function msac_decode_bool_adapt_lsx
st.h t0, a1, 2
.renorm:
- // renorm
- addi.d t6, t6, 1
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
- addi.d t6, t6, -1 // dif
- sub.w t7, a5, t4 // cnt-d
sll.w t5, t5, t4
+ sub.w t7, a5, t4 // cnt-d
st.w t5, a0, 24 // store rng
- bge t7, zero, 9f
+ bgeu a5, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
- addi.d t1, a0, 8
- ld.d t1, t1, 0 // buf_end
+ ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
- blt t1, t2, 1f
+ bltu t1, t2, 2f
- ld.d t0, t0, 0 // next_bits
- addi.w t3, t7, 23 // shift_bits = cnt + 23
- addi.w t7, t7, 16 // cnt += 16
- revb.d t0, t0 // next_bits = bswap(next_bits)
- srli.w t4, t3, 3
- sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
- st.d t2, a0, 0
- andi t3, t3, 24 // shift_bits &= 24
- srl.d t0, t0, t3 // next_bits >>= shift_bits
- sub.w t3, t3, t7 // shift_bits -= 16 + cnt
- sll.d t0, t0, t3 // next_bits <<= shift_bits
- li.w t5, 48
- sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
- xor t6, t6, t0 // dif ^= next_bits
- b 9f
+ ld.d t3, t0, 0 // next_bits
+ addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
+ nor t3, t3, t3
+ sub.w t2, zero, t1
+ revb.d t3, t3 // next_bits = bswap(next_bits)
+ srli.w t2, t2, 3 // num_bytes_read
+ srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
+ b 3f
1:
- li.w t4, 40
- sub.w t5, t4, t7 // c = 40 - cnt
+ addi.w t3, t7, -48
+ srl.d t3, t3, t3 // pad with ones
+ b 4f
2:
- bge t0, t1, 3f
- ld.bu t2, t0, 0
- addi.d t0, t0, 1
- sll.d t2, t2, t5
- xor t6, t6, t2
- addi.w t5, t5, -8
- bge t5, zero, 2b
- // refill_eob_end
+ bgeu t0, t1, 1b
+ ld.d t3, t1, -8 // next_bits
+ sub.w t2, t2, t1
+ sub.w t1, t1, t0 // num_bytes_left
+ slli.w t2, t2, 3
+ srl.d t3, t3, t2
+ addi.w t2, t7, -48
+ nor t3, t3, t3
+ sub.w t4, zero, t2
+ revb.d t3, t3
+ srli.w t4, t4, 3
+ srl.d t3, t3, t2
+ sltu t2, t1, t4
+ maskeqz t1, t1, t2
+ masknez t2, t4, t2
+ or t2, t2, t1 // num_bytes_read
3:
- st.d t0, a0, 0 // s->buf_pos = buf_pos
- sub.w t7, t4, t5 // cnt = 40 - c
+ slli.w t1, t2, 3
+ add.d t0, t0, t2
+ add.w t7, t7, t1 // cnt += num_bits_read
+ st.d t0, a0, 0
+4:
+ or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
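
The rewritten LoongArch refill above follows the same scheme as the portable msac.c change further down: input bytes are complemented (`nor`) and OR-ed into `dif`, the end of the buffer is padded with one bits, and the number of bytes actually consumed is picked branchlessly with `sltu`/`maskeqz`/`masknez`. A small C model of that selection step (function and variable names are mine, not from the patch):

#include <stdint.h>

/* Branchless num_bytes_read = min(num_bytes_left, num_bytes_wanted),
 * modelling the sltu/maskeqz/masknez/or sequence in the assembly above. */
static inline uint64_t branchless_min_u64(const uint64_t left, const uint64_t wanted)
{
    const uint64_t lt   = left < wanted;   /* sltu                 */
    const uint64_t keep = lt ? left : 0;   /* maskeqz (cond != 0)  */
    const uint64_t alt  = lt ? 0 : wanted; /* masknez (cond == 0)  */
    return keep | alt;                     /* or                   */
}
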
diff --git a/third_party/dav1d/src/msac.c b/third_party/dav1d/src/msac.c
index 43d8ae5d07..971ba85e29 100644
--- a/third_party/dav1d/src/msac.c
+++ b/third_party/dav1d/src/msac.c
@@ -43,15 +43,40 @@ static inline void ctx_refill(MsacContext *const s) {
const uint8_t *buf_end = s->buf_end;
int c = EC_WIN_SIZE - s->cnt - 24;
ec_win dif = s->dif;
- while (c >= 0 && buf_pos < buf_end) {
- dif ^= ((ec_win)*buf_pos++) << c;
+ do {
+ if (buf_pos >= buf_end) {
+ // set remaining bits to 1;
+ dif |= ~(~(ec_win)0xff << c);
+ break;
+ }
+ dif |= (ec_win)(*buf_pos++ ^ 0xff) << c;
c -= 8;
- }
+ } while (c >= 0);
s->dif = dif;
s->cnt = EC_WIN_SIZE - c - 24;
s->buf_pos = buf_pos;
}
+int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
+ const int n, unsigned k)
+{
+ assert(n >> k == 8);
+
+ unsigned a = 0;
+ if (dav1d_msac_decode_bool_equi(s)) {
+ if (dav1d_msac_decode_bool_equi(s))
+ k += dav1d_msac_decode_bool_equi(s) + 1;
+ a = 1 << k;
+ }
+ const unsigned v = dav1d_msac_decode_bools(s, k) + a;
+ return ref * 2 <= n ? inv_recenter(ref, v) :
+ n - 1 - inv_recenter(n - 1 - ref, v);
+}
+
+#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
+ ARCH_AARCH64 || \
+ (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
+))
/* Takes updated dif and range values, renormalizes them so that
* 32768 <= rng < 65536 (reading more bytes from the stream into dif if
* necessary), and stores them back in the decoder context.
@@ -61,11 +86,13 @@ static inline void ctx_norm(MsacContext *const s, const ec_win dif,
const unsigned rng)
{
const int d = 15 ^ (31 ^ clz(rng));
+ const int cnt = s->cnt;
assert(rng <= 65535U);
- s->cnt -= d;
- s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
+ s->dif = dif << d;
s->rng = rng << d;
- if (s->cnt < 0)
+ s->cnt = cnt - d;
+ // unsigned compare avoids redundant refills at eob
+ if ((unsigned)cnt < (unsigned)d)
ctx_refill(s);
}
@@ -100,22 +127,6 @@ unsigned dav1d_msac_decode_bool_c(MsacContext *const s, const unsigned f) {
return !ret;
}
-int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
- const int n, unsigned k)
-{
- assert(n >> k == 8);
-
- unsigned a = 0;
- if (dav1d_msac_decode_bool_equi(s)) {
- if (dav1d_msac_decode_bool_equi(s))
- k += dav1d_msac_decode_bool_equi(s) + 1;
- a = 1 << k;
- }
- const unsigned v = dav1d_msac_decode_bools(s, k) + a;
- return ref * 2 <= n ? inv_recenter(ref, v) :
- n - 1 - inv_recenter(n - 1 - ref, v);
-}
-
/* Decodes a symbol given an inverse cumulative distribution function (CDF)
* table in Q15. */
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
@@ -188,13 +199,14 @@ unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf) {
}
return tok;
}
+#endif
void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
const size_t sz, const int disable_cdf_update_flag)
{
s->buf_pos = data;
s->buf_end = data + sz;
- s->dif = ((ec_win)1 << (EC_WIN_SIZE - 1)) - 1;
+ s->dif = 0;
s->rng = 0x8000;
s->cnt = -15;
s->allow_update_cdf = !disable_cdf_update_flag;
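
In the portable code above, `dif` now holds the bitstream window in complemented form: `dav1d_msac_init` starts it at 0, `ctx_refill` ORs in bytes XOR-ed with 0xff and pads with ones past the end of the buffer, and `ctx_norm` no longer needs the `((dif + 1) << d) - 1` trick to shift ones into the low bits. The unsigned compare on `cnt` also keeps the decoder from refilling again once `cnt` has gone negative at the end of the buffer. A condensed model of the new renormalization step (a sketch assuming GCC/Clang's `__builtin_clz`, not the library API):

#include <stdint.h>

typedef uint64_t ec_win;

typedef struct {
    ec_win dif;   /* complemented bitstream window */
    unsigned rng;
    int cnt;
} NormModel;

/* Renormalize so that 32768 <= rng < 65536; refill() stands in for ctx_refill. */
static void norm(NormModel *const s, const ec_win dif, const unsigned rng,
                 void (*refill)(NormModel *))
{
    const int d = 15 ^ (31 ^ __builtin_clz(rng)); /* shift needed to renormalize */
    const int cnt = s->cnt;
    s->dif = dif << d;                  /* previously ((dif + 1) << d) - 1 */
    s->rng = rng << d;
    s->cnt = cnt - d;
    if ((unsigned)cnt < (unsigned)d)    /* false when cnt is already negative (eob) */
        refill(s);
}
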
diff --git a/third_party/dav1d/src/ppc/cdef_tmpl.c b/third_party/dav1d/src/ppc/cdef_tmpl.c
index e2e759810f..6ef87ad448 100644
--- a/third_party/dav1d/src/ppc/cdef_tmpl.c
+++ b/third_party/dav1d/src/ppc/cdef_tmpl.c
@@ -29,11 +29,10 @@
#if BITDEPTH == 8
static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
- const int damping)
+ const uint16_t shift)
{
const i16x8 zero = vec_splat_s16(0);
if (!threshold) return zero;
- const uint16_t shift = imax(0, damping - ulog2(threshold));
const i16x8 abs_diff = vec_abs(diff);
const b16x8 mask = vec_cmplt(diff, zero);
const i16x8 thr = vec_splats(threshold);
@@ -44,7 +43,7 @@ static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
return vec_sel(min, neg, mask);
}
-static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+static inline void copy4xN(uint16_t *tmp,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], const uint8_t *const top,
const uint8_t *const bottom, const int w, const int h,
@@ -114,7 +113,7 @@ static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
}
}
-static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+static inline void copy8xN(uint16_t *tmp,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], const uint8_t *const top,
const uint8_t *const bottom, const int w, const int h,
@@ -218,16 +217,12 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
#define LOAD_PIX(addr) \
const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \
- i16x8 max = px; \
- i16x8 min = px; \
i16x8 sum = vec_splat_s16(0);
#define LOAD_PIX4(addr) \
const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \
- const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \
+ const i16x8 b = (i16x8)vec_vsx_ld(0, addr + 8); \
const i16x8 px = vec_xxpermdi(a, b, 0); \
- i16x8 max = px; \
- i16x8 min = px; \
i16x8 sum = vec_splat_s16(0);
#define LOAD_DIR(p, addr, o0, o1) \
@@ -238,22 +233,26 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
#define LOAD_DIR4(p, addr, o0, o1) \
LOAD_DIR(p ## a, addr, o0, o1) \
- LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \
+ LOAD_DIR(p ## b, addr + 8, o0, o1) \
const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);
-#define CONSTRAIN(p, strength) \
+#define CONSTRAIN(p, strength, shift) \
const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
\
- i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \
- i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \
- i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \
- i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping);
+ i16x8 p ## _c0 = vconstrain(p ## _d0, strength, shift); \
+ i16x8 p ## _c1 = vconstrain(p ## _d1, strength, shift); \
+ i16x8 p ## _c2 = vconstrain(p ## _d2, strength, shift); \
+ i16x8 p ## _c3 = vconstrain(p ## _d3, strength, shift);
+
+#define SETUP_MINMAX \
+ i16x8 max = px; \
+ i16x8 min = px; \
#define MIN_MAX(p) \
max = max_mask(p ## 0, max); \
@@ -265,19 +264,16 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
max = max_mask(p ## 3, max); \
min = vec_min(p ## 3, min);
-#define PRI_0(p) \
- p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \
- p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even)));
+#define MAKE_TAPS \
+ const int16_t tap_odd = (pri_strength >> bitdepth_min_8) & 1; \
+ const i16x8 tap0 = vec_splats((int16_t)(4 - tap_odd)); \
+ const i16x8 tap1 = vec_splats((int16_t)(2 + tap_odd));
-#define PRI_1(p) \
- p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \
- p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even)));
-
-#define SEC_0(p) \
- p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \
- p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \
- p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \
- p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1));
+#define PRI_0_UPDATE_SUM(p) \
+ sum = vec_madd(tap0, p ## _c0, sum); \
+ sum = vec_madd(tap0, p ## _c1, sum); \
+ sum = vec_madd(tap1, p ## _c2, sum); \
+ sum = vec_madd(tap1, p ## _c3, sum);
#define UPDATE_SUM(p) \
const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
@@ -285,92 +281,198 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
sum = vec_add(sum, p ## sum0); \
sum = vec_add(sum, p ## sum1);
+#define SEC_0_UPDATE_SUM(p) \
+ sum = vec_madd(vec_splat_s16(2), p ## _c0, sum); \
+ sum = vec_madd(vec_splat_s16(2), p ## _c1, sum); \
+ sum = vec_madd(vec_splat_s16(2), p ## _c2, sum); \
+ sum = vec_madd(vec_splat_s16(2), p ## _c3, sum);
+
+#define BIAS \
+ i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); \
+ bias = vec_sub(vec_splat_s16(8), bias); \
+
+#define STORE4 \
+ dst[0] = vdst[0]; \
+ dst[1] = vdst[1]; \
+ dst[2] = vdst[2]; \
+ dst[3] = vdst[3]; \
+\
+ tmp += 8; \
+ dst += PXSTRIDE(dst_stride); \
+ dst[0] = vdst[4]; \
+ dst[1] = vdst[5]; \
+ dst[2] = vdst[6]; \
+ dst[3] = vdst[7]; \
+\
+ tmp += 8; \
+ dst += PXSTRIDE(dst_stride);
+
+#define STORE4_CLAMPED \
+ BIAS \
+ i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+ i16x8 vdst = vec_max(vec_min(unclamped, max), min); \
+ STORE4
+
+#define STORE4_UNCLAMPED \
+ BIAS \
+ i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+ STORE4
+
+#define STORE8 \
+ dst[0] = vdst[0]; \
+ dst[1] = vdst[1]; \
+ dst[2] = vdst[2]; \
+ dst[3] = vdst[3]; \
+ dst[4] = vdst[4]; \
+ dst[5] = vdst[5]; \
+ dst[6] = vdst[6]; \
+ dst[7] = vdst[7]; \
+\
+ tmp += 16; \
+ dst += PXSTRIDE(dst_stride);
+
+#define STORE8_CLAMPED \
+ BIAS \
+ i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+ i16x8 vdst = vec_max(vec_min(unclamped, max), min); \
+ STORE8
+
+#define STORE8_UNCLAMPED \
+ BIAS \
+ i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+ STORE8
+
+#define DIRECTIONS(w, tmp_stride) \
+ static const int8_t cdef_directions##w[8 /* dir */][2 /* pass */] = { \
+ { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, \
+ { 0 * tmp_stride + 1, -1 * tmp_stride + 2 }, \
+ { 0 * tmp_stride + 1, 0 * tmp_stride + 2 }, \
+ { 0 * tmp_stride + 1, 1 * tmp_stride + 2 }, \
+ { 1 * tmp_stride + 1, 2 * tmp_stride + 2 }, \
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 1 }, \
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 0 }, \
+ { 1 * tmp_stride + 0, 2 * tmp_stride - 1 } \
+ };
+
+DIRECTIONS(4, 8)
+DIRECTIONS(8, 16)
+
static inline void
filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2], const pixel *const top,
const pixel *const bottom, const int w, const int h,
const int pri_strength, const int sec_strength, const int dir,
- const int damping, const enum CdefEdgeFlags edges,
- const ptrdiff_t tmp_stride, uint16_t *tmp)
+ const int pri_shift, const int sec_shift,
+ const enum CdefEdgeFlags edges, uint16_t *tmp)
{
- const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
- { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
- { 0 * tmp_stride + 1, -1 * tmp_stride + 2 },
- { 0 * tmp_stride + 1, 0 * tmp_stride + 2 },
- { 0 * tmp_stride + 1, 1 * tmp_stride + 2 },
- { 1 * tmp_stride + 1, 2 * tmp_stride + 2 },
- { 1 * tmp_stride + 0, 2 * tmp_stride + 1 },
- { 1 * tmp_stride + 0, 2 * tmp_stride + 0 },
- { 1 * tmp_stride + 0, 2 * tmp_stride - 1 }
- };
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
- const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
- const int off1 = cdef_directions[dir][0];
- const int off1_1 = cdef_directions[dir][1];
+ const int off1 = cdef_directions4[dir][0];
+ const int off1_1 = cdef_directions4[dir][1];
- const int off2 = cdef_directions[(dir + 2) & 7][0];
- const int off3 = cdef_directions[(dir + 6) & 7][0];
+ const int off2 = cdef_directions4[(dir + 2) & 7][0];
+ const int off3 = cdef_directions4[(dir + 6) & 7][0];
- const int off2_1 = cdef_directions[(dir + 2) & 7][1];
- const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+ const int off2_1 = cdef_directions4[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions4[(dir + 6) & 7][1];
- copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+ MAKE_TAPS
for (int y = 0; y < h / 2; y++) {
LOAD_PIX4(tmp)
+ SETUP_MINMAX
+
// Primary pass
LOAD_DIR4(p, tmp, off1, off1_1)
- CONSTRAIN(p, pri_strength)
+ CONSTRAIN(p, pri_strength, pri_shift)
MIN_MAX(p)
- PRI_0(p)
- PRI_1(p)
-
- UPDATE_SUM(p)
+ PRI_0_UPDATE_SUM(p)
// Secondary pass 1
LOAD_DIR4(s, tmp, off2, off3)
- CONSTRAIN(s, sec_strength)
+ CONSTRAIN(s, sec_strength, sec_shift)
MIN_MAX(s)
- SEC_0(s)
-
- UPDATE_SUM(s)
+ SEC_0_UPDATE_SUM(s)
// Secondary pass 2
LOAD_DIR4(s2, tmp, off2_1, off3_1)
- CONSTRAIN(s2, sec_strength)
+ CONSTRAIN(s2, sec_strength, sec_shift)
MIN_MAX(s2)
UPDATE_SUM(s2)
// Store
- i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
- bias = vec_sub(vec_splat_s16(8), bias);
- i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
- i16x8 vdst = vec_max(vec_min(unclamped, max), min);
-
- dst[0] = vdst[0];
- dst[1] = vdst[1];
- dst[2] = vdst[2];
- dst[3] = vdst[3];
-
- tmp += tmp_stride;
- dst += PXSTRIDE(dst_stride);
- dst[0] = vdst[4];
- dst[1] = vdst[5];
- dst[2] = vdst[6];
- dst[3] = vdst[7];
-
- tmp += tmp_stride;
- dst += PXSTRIDE(dst_stride);
+ STORE4_CLAMPED
+ }
+}
+
+static inline void
+filter_4xN_pri(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int pri_strength, const int dir,
+ const int pri_shift, const enum CdefEdgeFlags edges,
+ uint16_t *tmp)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int off1 = cdef_directions4[dir][0];
+ const int off1_1 = cdef_directions4[dir][1];
+
+ MAKE_TAPS
+
+ for (int y = 0; y < h / 2; y++) {
+ LOAD_PIX4(tmp)
+
+ // Primary pass
+ LOAD_DIR4(p, tmp, off1, off1_1)
+
+ CONSTRAIN(p, pri_strength, pri_shift)
+
+ PRI_0_UPDATE_SUM(p)
+
+ STORE4_UNCLAMPED
+ }
+}
+
+static inline void
+filter_4xN_sec(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int sec_strength, const int dir,
+ const int sec_shift, const enum CdefEdgeFlags edges,
+ uint16_t *tmp)
+{
+ const int off2 = cdef_directions4[(dir + 2) & 7][0];
+ const int off3 = cdef_directions4[(dir + 6) & 7][0];
+
+ const int off2_1 = cdef_directions4[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions4[(dir + 6) & 7][1];
+
+ for (int y = 0; y < h / 2; y++) {
+ LOAD_PIX4(tmp)
+ // Secondary pass 1
+ LOAD_DIR4(s, tmp, off2, off3)
+
+ CONSTRAIN(s, sec_strength, sec_shift)
+
+ SEC_0_UPDATE_SUM(s)
+
+ // Secondary pass 2
+ LOAD_DIR4(s2, tmp, off2_1, off3_1)
+
+ CONSTRAIN(s2, sec_strength, sec_shift)
+
+ UPDATE_SUM(s2)
+
+ STORE4_UNCLAMPED
}
}
@@ -379,88 +481,121 @@ filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2], const pixel *const top,
const pixel *const bottom, const int w, const int h,
const int pri_strength, const int sec_strength, const int dir,
- const int damping, const enum CdefEdgeFlags edges,
- const ptrdiff_t tmp_stride, uint16_t *tmp)
+ const int pri_shift, const int sec_shift, const enum CdefEdgeFlags edges,
+ uint16_t *tmp)
{
- const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
- { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
- { 0 * tmp_stride + 1, -1 * tmp_stride + 2 },
- { 0 * tmp_stride + 1, 0 * tmp_stride + 2 },
- { 0 * tmp_stride + 1, 1 * tmp_stride + 2 },
- { 1 * tmp_stride + 1, 2 * tmp_stride + 2 },
- { 1 * tmp_stride + 0, 2 * tmp_stride + 1 },
- { 1 * tmp_stride + 0, 2 * tmp_stride + 0 },
- { 1 * tmp_stride + 0, 2 * tmp_stride - 1 }
- };
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int off1 = cdef_directions8[dir][0];
+ const int off1_1 = cdef_directions8[dir][1];
- const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
- const int off1 = cdef_directions[dir][0];
- const int off1_1 = cdef_directions[dir][1];
+ const int off2 = cdef_directions8[(dir + 2) & 7][0];
+ const int off3 = cdef_directions8[(dir + 6) & 7][0];
- const int off2 = cdef_directions[(dir + 2) & 7][0];
- const int off3 = cdef_directions[(dir + 6) & 7][0];
+ const int off2_1 = cdef_directions8[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions8[(dir + 6) & 7][1];
- const int off2_1 = cdef_directions[(dir + 2) & 7][1];
- const int off3_1 = cdef_directions[(dir + 6) & 7][1];
-
- copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+ MAKE_TAPS
for (int y = 0; y < h; y++) {
LOAD_PIX(tmp)
+ SETUP_MINMAX
+
// Primary pass
LOAD_DIR(p, tmp, off1, off1_1)
- CONSTRAIN(p, pri_strength)
+ CONSTRAIN(p, pri_strength, pri_shift)
MIN_MAX(p)
- PRI_0(p)
- PRI_1(p)
-
- UPDATE_SUM(p)
+ PRI_0_UPDATE_SUM(p)
// Secondary pass 1
LOAD_DIR(s, tmp, off2, off3)
- CONSTRAIN(s, sec_strength)
+ CONSTRAIN(s, sec_strength, sec_shift)
MIN_MAX(s)
- SEC_0(s)
-
- UPDATE_SUM(s)
+ SEC_0_UPDATE_SUM(s)
// Secondary pass 2
LOAD_DIR(s2, tmp, off2_1, off3_1)
- CONSTRAIN(s2, sec_strength)
+ CONSTRAIN(s2, sec_strength, sec_shift)
MIN_MAX(s2)
UPDATE_SUM(s2)
// Store
- i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
- bias = vec_sub(vec_splat_s16(8), bias);
- i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
- i16x8 vdst = vec_max(vec_min(unclamped, max), min);
-
- dst[0] = vdst[0];
- dst[1] = vdst[1];
- dst[2] = vdst[2];
- dst[3] = vdst[3];
- dst[4] = vdst[4];
- dst[5] = vdst[5];
- dst[6] = vdst[6];
- dst[7] = vdst[7];
-
- tmp += tmp_stride;
- dst += PXSTRIDE(dst_stride);
+ STORE8_CLAMPED
+ }
+
+}
+
+static inline void
+filter_8xN_pri(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int pri_strength, const int dir,
+ const int pri_shift, const enum CdefEdgeFlags edges,
+ uint16_t *tmp)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int off1 = cdef_directions8[dir][0];
+ const int off1_1 = cdef_directions8[dir][1];
+
+ MAKE_TAPS
+
+ for (int y = 0; y < h; y++) {
+ LOAD_PIX(tmp)
+
+ // Primary pass
+ LOAD_DIR(p, tmp, off1, off1_1)
+
+ CONSTRAIN(p, pri_strength, pri_shift)
+
+ PRI_0_UPDATE_SUM(p)
+
+ STORE8_UNCLAMPED
}
+}
+
+static inline void
+filter_8xN_sec(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int sec_strength, const int dir,
+ const int sec_shift, const enum CdefEdgeFlags edges,
+ uint16_t *tmp)
+{
+ const int off2 = cdef_directions8[(dir + 2) & 7][0];
+ const int off3 = cdef_directions8[(dir + 6) & 7][0];
+
+ const int off2_1 = cdef_directions8[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions8[(dir + 6) & 7][1];
+
+ for (int y = 0; y < h; y++) {
+ LOAD_PIX(tmp)
+
+ // Secondary pass 1
+ LOAD_DIR(s, tmp, off2, off3)
+ CONSTRAIN(s, sec_strength, sec_shift)
+
+ SEC_0_UPDATE_SUM(s)
+
+ // Secondary pass 2
+ LOAD_DIR(s2, tmp, off2_1, off3_1)
+
+ CONSTRAIN(s2, sec_strength, sec_shift)
+
+ UPDATE_SUM(s2)
+
+ STORE8_UNCLAMPED
+ }
}
#define cdef_fn(w, h, tmp_stride) \
@@ -477,8 +612,22 @@ void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
{ \
ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
- filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
- sec_strength, dir, damping, edges, tmp_stride, tmp); \
+ copy##w##xN(tmp - 2, dst, dst_stride, left, top, bottom, w, h, edges); \
+ if (pri_strength) { \
+ const int pri_shift = imax(0, damping - ulog2(pri_strength)); \
+ if (sec_strength) { \
+ const int sec_shift = damping - ulog2(sec_strength); \
+ filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
+ sec_strength, dir, pri_shift, sec_shift, edges, tmp); \
+ } else { \
+ filter_##w##xN_pri(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
+ dir, pri_shift, edges, tmp); \
+ } \
+ } else { \
+ const int sec_shift = damping - ulog2(sec_strength); \
+ filter_##w##xN_sec(dst, dst_stride, left, top, bottom, w, h, sec_strength, \
+ dir, sec_shift, edges, tmp); \
+ } \
}
cdef_fn(4, 4, 8);
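
The VSX CDEF rework above hoists the `damping`-derived shifts out of the per-pixel loops (the new `vconstrain` takes a precomputed `shift` instead of `damping`) and adds dedicated primary-only and secondary-only filters so the unused min/max clamping can be dropped. A scalar sketch of the constrain step with the hoisted shift, assuming the usual `ulog2` = floor(log2) helper:

#include <stdlib.h>

static inline int ulog2(const unsigned v) { return 31 - __builtin_clz(v); }
static inline int imax(const int a, const int b) { return a > b ? a : b; }

/* CDEF constrain with the shift computed once per block, matching the new
 * vconstrain(diff, threshold, shift) contract. */
static inline int constrain(const int diff, const int threshold, const int shift)
{
    if (!threshold) return 0;
    const int adiff = abs(diff);
    const int lim = imax(0, threshold - (adiff >> shift));
    const int v = adiff < lim ? adiff : lim;
    return diff < 0 ? -v : v;
}

/* Hoisted once per call site, as in the cdef_fn macro above:
 *   pri_shift = imax(0, damping - ulog2(pri_strength));
 *   sec_shift = damping - ulog2(sec_strength);
 */
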
diff --git a/third_party/dav1d/src/riscv/64/itx.S b/third_party/dav1d/src/riscv/64/itx.S
index 60d045150d..dfec548e40 100644
--- a/third_party/dav1d/src/riscv/64/itx.S
+++ b/third_party/dav1d/src/riscv/64/itx.S
@@ -163,48 +163,48 @@ endfunc
vssub.vv \o3, v16, v20
.endm
-.macro iadst_4 o0, o1, o2, o3
+.macro iadst_4 o0, o1, o2, o3, lm2, lm
li t1, 1321
li t2, 3803
li t3, 2482
- vwmul.vx v4, v0, t1
- vwmul.vx v5, v0, t3
+ vwmul.vx v16, v0, t1
+ vwmul.vx v18, v0, t3
neg t1, t1
- vwmacc.vx v4, t2, v2
- vwmacc.vx v5, t1, v2
+ vwmacc.vx v16, t2, v2
+ vwmacc.vx v18, t1, v2
neg t2, t2
- vwmacc.vx v4, t3, v3
- vwmacc.vx v5, t2, v3
+ vwmacc.vx v16, t3, v3
+ vwmacc.vx v18, t2, v3
- vwsub.vv v6, v0, v2
- vwadd.wv v6, v6, v3
+ vwsub.vv v20, v0, v2
+ vwadd.wv v20, v20, v3
li t1, 3344
- vwmul.vx v7, v1, t1
+ vwmul.vx v22, v1, t1
- vsetvli zero, zero, e32, m1, ta, ma
+ vsetvli zero, zero, e32, \lm2, ta, ma
- vmul.vx v6, v6, t1
+ vmul.vx v20, v20, t1
- vadd.vv v8, v4, v5
- vadd.vv v4, v4, v7
- vadd.vv v5, v5, v7
- vsub.vv v7, v8, v7
+ vadd.vv v24, v16, v18
+ vadd.vv v16, v16, v22
+ vadd.vv v18, v18, v22
+ vsub.vv v22, v24, v22
li t1, 2048
- vadd.vx v4, v4, t1
- vadd.vx v5, v5, t1
- vadd.vx v6, v6, t1
- vadd.vx v7, v7, t1
+ vadd.vx v16, v16, t1
+ vadd.vx v18, v18, t1
+ vadd.vx v20, v20, t1
+ vadd.vx v22, v22, t1
- vsetvli zero, zero, e16, mf2, ta, ma
+ vsetvli zero, zero, e16, \lm, ta, ma
- vnsra.wi \o0, v4, 12
- vnsra.wi \o1, v5, 12
- vnsra.wi \o2, v6, 12
- vnsra.wi \o3, v7, 12
+ vnsra.wi \o0, v16, 12
+ vnsra.wi \o1, v18, 12
+ vnsra.wi \o2, v20, 12
+ vnsra.wi \o3, v22, 12
.endm
function inv_dct_e16_x4_rvv, export=1, ext=v
@@ -213,12 +213,22 @@ function inv_dct_e16_x4_rvv, export=1, ext=v
endfunc
function inv_adst_e16_x4_rvv, export=1, ext=v
- iadst_4 v0, v1, v2, v3
+ iadst_4 v0, v1, v2, v3, m1, mf2
jr t0
endfunc
function inv_flipadst_e16_x4_rvv, export=1, ext=v
- iadst_4 v3, v2, v1, v0
+ iadst_4 v3, v2, v1, v0, m1, mf2
+ jr t0
+endfunc
+
+function inv_adst_e16_x4w_rvv, export=1, ext=v
+ iadst_4 v0, v1, v2, v3, m2, m1
+ jr t0
+endfunc
+
+function inv_flipadst_e16_x4w_rvv, export=1, ext=v
+ iadst_4 v3, v2, v1, v0, m2, m1
jr t0
endfunc
@@ -328,6 +338,8 @@ function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
.ifc \variant, identity_
// The identity vsadd.vv and downshift vssra.vi 1 cancel out
+
+ j L(itx_8x8_epilog)
.else
jalr t0, a4
@@ -339,8 +351,8 @@ function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
vssra.vi v5, v5, 1
vssra.vi v6, v6, 1
vssra.vi v7, v7, 1
-.endif
+L(itx_8x8_epilog):
vsseg8e16.v v0, (a2)
vle16.v v0, (a2)
addi t0, a2, 16
@@ -374,9 +386,7 @@ function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
vmv.v.x v8, zero
vse16.v v8, (a2)
-.ifc \variant, identity_
itx_8x8_end:
-.endif
vsetivli zero, 8, e8, mf2, ta, ma
vle8.v v8, (a0)
add t0, a0, a1
@@ -441,11 +451,12 @@ itx_8x8_end:
vse8.v v15, (a0)
ret
+.endif
endfunc
.endm
-def_fn_8x8_base
def_fn_8x8_base identity_
+def_fn_8x8_base
function inv_identity_e16_x8_rvv, export=1, ext=v
vsadd.vv v0, v0, v0
@@ -530,23 +541,23 @@ endfunc
li t5, 2598
li t6, 3166
- vwmul.vx v8, v7, t1
+ vwmul.vx v16, v7, t1
neg t1, t1
- vwmul.vx v10, v7, t2
- vwmacc.vx v8, t2, v0
- vwmacc.vx v10, t1, v0
+ vwmul.vx v18, v7, t2
+ vwmacc.vx v16, t2, v0
+ vwmacc.vx v18, t1, v0
- vwmul.vx v12, v5, t3
+ vwmul.vx v20, v5, t3
neg t3, t3
- vwmul.vx v14, v5, t4
- vwmacc.vx v12, t4, v2
- vwmacc.vx v14, t3, v2
+ vwmul.vx v22, v5, t4
+ vwmacc.vx v20, t4, v2
+ vwmacc.vx v22, t3, v2
- vwmul.vx v16, v3, t5
+ vwmul.vx v24, v3, t5
neg t5, t5
- vwmul.vx v18, v3, t6
- vwmacc.vx v16, t6, v4
- vwmacc.vx v18, t5, v4
+ vwmul.vx v26, v3, t6
+ vwmacc.vx v24, t6, v4
+ vwmacc.vx v26, t5, v4
li t1, 2048
li t2, 1189
@@ -555,95 +566,95 @@ endfunc
li t5, 3784
li t6, 2896
- vwmul.vx v20, v1, t2
+ vwmul.vx v28, v1, t2
neg t2, t2
- vwmul.vx v22, v1, t3
- vwmacc.vx v20, t3, v6
- vwmacc.vx v22, t2, v6
-
- vwadd.wx v8, v8, t1
- vwadd.wx v10, v10, t1
- vwadd.wx v12, v12, t1
- vwadd.wx v14, v14, t1
+ vwmul.vx v30, v1, t3
+ vwmacc.vx v28, t3, v6
+ vwmacc.vx v30, t2, v6
+
vwadd.wx v16, v16, t1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
+ vwadd.wx v24, v24, t1
+ vwadd.wx v26, v26, t1
+ vwadd.wx v28, v28, t1
+ vwadd.wx v30, v30, t1
- vnsra.wi v8, v8, 12
- vnsra.wi v10, v10, 12
- vnsra.wi v12, v12, 12
- vnsra.wi v14, v14, 12
vnsra.wi v16, v16, 12
vnsra.wi v18, v18, 12
vnsra.wi v20, v20, 12
vnsra.wi v22, v22, 12
+ vnsra.wi v24, v24, 12
+ vnsra.wi v26, v26, 12
+ vnsra.wi v28, v28, 12
+ vnsra.wi v30, v30, 12
- vssub.vv v4, v8, v16
- vsadd.vv v8, v8, v16
- vsadd.vv v1, v10, v18
- vsadd.vv v2, v12, v20
- vsadd.vv v3, v14, v22
- vssub.vv v5, v10, v18
- vssub.vv v6, v12, v20
- vssub.vv v22, v14, v22
-
- vsadd.vv \o0, v8, v2
- vsadd.vv \o7, v1, v3
- vssub.vv v2, v8, v2
- vssub.vv v3, v1, v3
-
- vwmul.vx v8, v4, t5
- vwmul.vx v10, v4, t4
- vwmul.vx v12, v22, t5
- vwmul.vx v14, v22, t4
- vwmacc.vx v8, t4, v5
+ vssub.vv v4, v16, v24
+ vsadd.vv v16, v16, v24
+ vsadd.vv v1, v18, v26
+ vsadd.vv v2, v20, v28
+ vsadd.vv v3, v22, v30
+ vssub.vv v5, v18, v26
+ vssub.vv v6, v20, v28
+ vssub.vv v30, v22, v30
+
+ vsadd.vv \o0, v16, v2
+ vsadd.vv \o7, v1, v3
+ vssub.vv v2, v16, v2
+ vssub.vv v3, v1, v3
+
+ vwmul.vx v16, v4, t5
+ vwmul.vx v18, v4, t4
+ vwmul.vx v20, v30, t5
+ vwmul.vx v22, v30, t4
+ vwmacc.vx v16, t4, v5
neg t4, t4
- vwmacc.vx v14, t5, v6
+ vwmacc.vx v22, t5, v6
neg t5, t5
- vwmacc.vx v12, t4, v6
- vwmacc.vx v10, t5, v5
-
- vwadd.wx v8, v8, t1
- vwadd.wx v10, v10, t1
- vwadd.wx v12, v12, t1
- vwadd.wx v14, v14, t1
-
- vnsra.wi v8, v8, 12
- vnsra.wi v10, v10, 12
- vnsra.wi v12, v12, 12
- vnsra.wi v14, v14, 12
-
- vsadd.vv \o1, v8, v12
- vsadd.vv \o6, v10, v14
- vssub.vv v8, v8, v12
- vssub.vv v9, v10, v14
-
- vwmul.vx v10, v2, t6
- vwmul.vx v12, v2, t6
- vwmul.vx v14, v8, t6
- vwmul.vx v16, v8, t6
- vwmacc.vx v10, t6, v3
- vwmacc.vx v14, t6, v9
- neg t6, t6
- vwmacc.vx v12, t6, v3
- vwmacc.vx v16, t6, v9
+ vwmacc.vx v20, t4, v6
+ vwmacc.vx v18, t5, v5
- vwadd.wx v10, v10, t1
- vwadd.wx v12, v12, t1
- vwadd.wx v14, v14, t1
vwadd.wx v16, v16, t1
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+
+ vnsra.wi v16, v16, 12
+ vnsra.wi v18, v18, 12
+ vnsra.wi v20, v20, 12
+ vnsra.wi v22, v22, 12
- vnsra.wi \o3, v10, 12
- vnsra.wi \o4, v12, 12
- vnsra.wi \o2, v14, 12
- vnsra.wi \o5, v16, 12
+ vsadd.vv \o1, v16, v20
+ vsadd.vv \o6, v18, v22
+ vssub.vv v16, v16, v20
+ vssub.vv v17, v18, v22
+
+ vwmul.vx v18, v2, t6
+ vwmul.vx v20, v2, t6
+ vwmul.vx v22, v16, t6
+ vwmul.vx v24, v16, t6
+ vwmacc.vx v18, t6, v3
+ vwmacc.vx v22, t6, v17
+ neg t6, t6
+ vwmacc.vx v20, t6, v3
+ vwmacc.vx v24, t6, v17
- vmv.v.x v8, zero
- vssub.vv \o1, v8, \o1
- vssub.vv \o3, v8, \o3
- vssub.vv \o5, v8, \o5
- vssub.vv \o7, v8, \o7
+ vwadd.wx v18, v18, t1
+ vwadd.wx v20, v20, t1
+ vwadd.wx v22, v22, t1
+ vwadd.wx v24, v24, t1
+
+ vnsra.wi \o3, v18, 12
+ vnsra.wi \o4, v20, 12
+ vnsra.wi \o2, v22, 12
+ vnsra.wi \o5, v24, 12
+
+ vmv.v.x v16, zero
+ vssub.vv \o1, v16, \o1
+ vssub.vv \o3, v16, \o3
+ vssub.vv \o5, v16, \o5
+ vssub.vv \o7, v16, \o7
.endm
function inv_dct_e16_x8_rvv, export=1, ext=v
@@ -714,6 +725,206 @@ def_fn_8x8 flipadst, identity
def_fn_8x8 identity, adst
def_fn_8x8 identity, flipadst
+function inv_txfm_add_4x8_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 8, e16, m1, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 16
+ vle16.v v1, (t0)
+ addi t0, t0, 16
+ vle16.v v2, (t0)
+ addi t0, t0, 16
+ vle16.v v3, (t0)
+
+ li t1, 2896*8
+.irp i, 0, 1, 2, 3
+ vsmul.vx v\i, v\i, t1
+.endr
+
+ jalr t0, a4
+
+ vsseg4e16.v v0, (a2)
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vmv.v.x v8, zero
+ vle16.v v0, (a2)
+ vse16.v v8, (a2)
+.irp i, 1, 2, 3, 4, 5, 6, 7
+ addi a2, a2, 8
+ vle16.v v\i, (a2)
+ vse16.v v8, (a2)
+.endr
+
+ jalr t0, a5
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vssra.vi v\i, v\i, 4
+.endr
+
+ vsetvli zero, zero, e8, mf4, ta, ma
+ vle8.v v8, (a0)
+ add t0, a0, a1
+ vle8.v v9, (t0)
+.irp i, 10, 11, 12, 13, 14, 15
+ add t0, t0, a1
+ vle8.v v\i, (t0)
+.endr
+
+ vwaddu.wv v0, v0, v8
+ vwaddu.wv v1, v1, v9
+ vwaddu.wv v2, v2, v10
+ vwaddu.wv v3, v3, v11
+ vwaddu.wv v4, v4, v12
+ vwaddu.wv v5, v5, v13
+ vwaddu.wv v6, v6, v14
+ vwaddu.wv v7, v7, v15
+
+ vsetvli zero, zero, e16, mf2, ta, ma
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vmax.vx v\i, v\i, zero
+.endr
+
+ vsetvli zero, zero, e8, mf4, ta, ma
+
+ vnclipu.wi v8, v0, 0
+ vnclipu.wi v9, v1, 0
+ vnclipu.wi v10, v2, 0
+ vnclipu.wi v11, v3, 0
+ vnclipu.wi v12, v4, 0
+ vnclipu.wi v13, v5, 0
+ vnclipu.wi v14, v6, 0
+ vnclipu.wi v15, v7, 0
+
+ vse8.v v8, (a0)
+.irp i, 9, 10, 11, 12, 13, 14, 15
+ add a0, a0, a1
+ vse8.v v\i, (a0)
+.endr
+
+ ret
+endfunc
+
+function inv_txfm_add_8x4_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 8
+ vle16.v v1, (t0)
+.irp i, 2, 3, 4, 5, 6, 7
+ addi t0, t0, 8
+ vle16.v v\i, (t0)
+.endr
+
+ li t1, 2896*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vsmul.vx v\i, v\i, t1
+.endr
+
+ jalr t0, a4
+
+ vsseg8e16.v v0, (a2)
+
+ vsetivli zero, 8, e16, m1, ta, ma
+ vmv.v.x v4, zero
+ vle16.v v0, (a2)
+ vse16.v v4, (a2)
+.irp i, 1, 2, 3
+ addi a2, a2, 16
+ vle16.v v\i, (a2)
+ vse16.v v4, (a2)
+.endr
+
+ jalr t0, a5
+
+ vssra.vi v0, v0, 4
+ vssra.vi v1, v1, 4
+ vssra.vi v2, v2, 4
+ vssra.vi v3, v3, 4
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+ vle8.v v4, (a0)
+ add t0, a0, a1
+ vle8.v v5, (t0)
+ add t0, t0, a1
+ vle8.v v6, (t0)
+ add t0, t0, a1
+ vle8.v v7, (t0)
+
+ vwaddu.wv v0, v0, v4
+ vwaddu.wv v1, v1, v5
+ vwaddu.wv v2, v2, v6
+ vwaddu.wv v3, v3, v7
+
+ vsetvli zero, zero, e16, m1, ta, ma
+ vmax.vx v0, v0, zero
+ vmax.vx v1, v1, zero
+ vmax.vx v2, v2, zero
+ vmax.vx v3, v3, zero
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+
+ vnclipu.wi v4, v0, 0
+ vnclipu.wi v5, v1, 0
+ vnclipu.wi v6, v2, 0
+ vnclipu.wi v7, v3, 0
+
+ vse8.v v4, (a0)
+ add a0, a0, a1
+ vse8.v v5, (a0)
+ add a0, a0, a1
+ vse8.v v6, (a0)
+ add a0, a0, a1
+ vse8.v v7, (a0)
+
+ ret
+endfunc
+
+/* Define symbols added in .if statement */
+.equ dct, 1
+.equ identity, 2
+.equ adst, 3
+.equ flipadst, 4
+
+.macro def_fn_48 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
+.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
+ la a4, inv_\txfm1\()_e16_x\w\()w_rvv
+.else
+ la a4, inv_\txfm1\()_e16_x\w\()_rvv
+.endif
+.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
+ la a5, inv_\txfm2\()_e16_x\h\()w_rvv
+.else
+ la a5, inv_\txfm2\()_e16_x\h\()_rvv
+.endif
+ j inv_txfm_add_\w\()x\h\()_rvv
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
function inv_identity_e16_x16_rvv, export=1, ext=v
li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -1196,10 +1407,12 @@ endfunc
.macro def_horz_16 variant
function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
vmv.v.x v16, zero
-.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- vle16.v v\i, (t4)
+ vle16.v v0, (t4)
vse16.v v16, (t4)
+.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
add t4, t4, t6
+ vle16.v v\i, (t4)
+ vse16.v v16, (t4)
.endr
.ifc \variant, _identity
li t1, 2*(5793-4096)*8
@@ -1208,29 +1421,35 @@ function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
vsra.vi v16, v16, 1
vaadd.vv v\i, v\i, v16
.endr
+ j L(horz_16x8_epilog)
.else
jalr t0, a4
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
vssra.vi v\i, v\i, 2
.endr
-.endif
-.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- vsse16.v v\i, (t5), t6
+L(horz_16x8_epilog):
+ vsse16.v v0, (t5), t6
+.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
addi t5, t5, 2
+ vsse16.v v\i, (t5), t6
.endr
jr a7
+.endif
endfunc
.endm
-def_horz_16
def_horz_16 _identity
+def_horz_16
function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
vsetivli zero, 8, e16, m1, ta, ma
-.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- vle16.v v\i, (t4)
+
+ vle16.v v0, (t4)
+.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
add t4, t4, t6
+ vle16.v v\i, (t4)
.endr
+
jalr t0, a5
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -1238,10 +1457,13 @@ function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
.endr
vsetivli zero, 8, e8, mf2, ta, ma
- mv t0, t5
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- vle8.v v\i, (t0)
+
+ vle8.v v16, (t5)
+ add t0, t5, a1
+ vle8.v v17, (t0)
+.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
add t0, t0, a1
+ vle8.v v\i, (t0)
.endr
vwaddu.wv v0, v0, v16
@@ -1284,9 +1506,10 @@ function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
vnclipu.wi v30, v14, 0
vnclipu.wi v31, v15, 0
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- vse8.v v\i, (t5)
+ vse8.v v16, (t5)
+.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
add t5, t5, a1
+ vse8.v v\i, (t5)
.endr
jr a7
@@ -1296,11 +1519,26 @@ function inv_txfm_add_16x16_rvv, export=1, ext=v
csrw vxrm, zero
vsetivli zero, 8, e16, m1, ta, ma
addi sp, sp, -16*32
-.irp i, 0, 8
+.irp i, 8, 0
addi t4, a2, \i*2
addi t5, sp, \i*16*2
+.if \i == 8
+ blt a3, a7, 1f
+.endif
li t6, 16*2
jalr a7, a6
+.if \i == 8
+ j 2f
+1:
+ li t1, 64
+ vsetvli zero, t1, e16, m8, ta, ma
+ vmv.v.x v0, zero
+ vse16.v v0, (t5)
+ addi t5, t5, 128
+ vse16.v v0, (t5)
+ vsetivli zero, 8, e16, m1, ta, ma
+2:
+.endif
.endr
.irp i, 0, 8
addi t4, sp, \i*2
@@ -1312,7 +1550,7 @@ function inv_txfm_add_16x16_rvv, export=1, ext=v
ret
endfunc
-.macro def_fn_16x16 txfm1, txfm2
+.macro def_fn_16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
.ifc \txfm1, identity
la a6, inv_txfm_horz_identity_16x8_rvv
@@ -1321,19 +1559,558 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
la a4, inv_\txfm1\()_e16_x16_rvv
.endif
la a5, inv_\txfm2\()_e16_x16_rvv
+ li a7, \eob_half
j inv_txfm_add_16x16_rvv
endfunc
.endm
-def_fn_16x16 dct, dct
-def_fn_16x16 identity, identity
-def_fn_16x16 dct, adst
-def_fn_16x16 dct, flipadst
-def_fn_16x16 dct, identity
-def_fn_16x16 adst, dct
-def_fn_16x16 adst, adst
-def_fn_16x16 adst, flipadst
-def_fn_16x16 flipadst, dct
-def_fn_16x16 flipadst, adst
-def_fn_16x16 flipadst, flipadst
-def_fn_16x16 identity, dct
+def_fn_16x16 dct, dct, 36
+def_fn_16x16 identity, identity, 36
+def_fn_16x16 dct, adst, 36
+def_fn_16x16 dct, flipadst, 36
+def_fn_16x16 dct, identity, 8
+def_fn_16x16 adst, dct, 36
+def_fn_16x16 adst, adst, 36
+def_fn_16x16 adst, flipadst, 36
+def_fn_16x16 flipadst, dct, 36
+def_fn_16x16 flipadst, adst, 36
+def_fn_16x16 flipadst, flipadst, 36
+def_fn_16x16 identity, dct, 8
+
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_4x16_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 8, e16, m1, ta, ma
+
+ blt a3, a6, 1f
+
+ addi t0, a2, 16
+ vle16.v v0, (t0)
+ addi t0, t0, 32
+ vle16.v v1, (t0)
+ addi t0, t0, 32
+ vle16.v v2, (t0)
+ addi t0, t0, 32
+ vle16.v v3, (t0)
+
+.ifc \variant, identity_
+ li t1, (5793-4096)*8
+ vsmul.vx v8, v0, t1
+ vaadd.vv v4, v0, v8
+ vsmul.vx v8, v1, t1
+ vaadd.vv v5, v1, v8
+ vsmul.vx v8, v2, t1
+ vaadd.vv v6, v2, v8
+ vsmul.vx v8, v3, t1
+ vaadd.vv v7, v3, v8
+.else
+ jalr t0, a4
+
+ vssra.vi v4, v0, 1
+ vssra.vi v5, v1, 1
+ vssra.vi v6, v2, 1
+ vssra.vi v7, v3, 1
+.endif
+
+ j 2f
+
+1:
+.irp i, 4, 5, 6, 7
+ vmv.v.x v\i, zero
+.endr
+
+2:
+ vle16.v v0, (a2)
+ addi t0, a2, 32
+ vle16.v v1, (t0)
+ addi t0, t0, 32
+ vle16.v v2, (t0)
+ addi t0, t0, 32
+ vle16.v v3, (t0)
+
+.ifc \variant, identity_
+ li t1, (5793-4096)*8
+.irp i, 0, 1, 2, 3
+ vsmul.vx v8, v\i, t1
+ vaadd.vv v\i, v\i, v8
+.endr
+
+ j L(itx_4x16_epilog)
+.else
+ jalr t0, a4
+
+ vssra.vi v0, v0, 1
+ vssra.vi v1, v1, 1
+ vssra.vi v2, v2, 1
+ vssra.vi v3, v3, 1
+
+L(itx_4x16_epilog):
+ vsseg4e16.v v0, (a2)
+ addi t0, a2, 64
+ vsseg4e16.v v4, (t0)
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+
+ vmv.v.x v16, zero
+ vle16.v v0, (a2)
+ vse16.v v16, (a2)
+ addi t0, a2, 8
+ vle16.v v1, (t0)
+ vse16.v v16, (t0)
+.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ addi t0, t0, 8
+ vle16.v v\i, (t0)
+ vse16.v v16, (t0)
+.endr
+
+ jalr t0, a5
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vssra.vi v\i, v\i, 4
+.endr
+
+ vsetvli zero, zero, e8, mf4, ta, ma
+
+ vle8.v v16, (a0)
+ add t0, a0, a1
+ vle8.v v17, (t0)
+.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ add t0, t0, a1
+ vle8.v v\i, (t0)
+.endr
+
+ vwaddu.wv v0, v0, v16
+ vwaddu.wv v1, v1, v17
+ vwaddu.wv v2, v2, v18
+ vwaddu.wv v3, v3, v19
+ vwaddu.wv v4, v4, v20
+ vwaddu.wv v5, v5, v21
+ vwaddu.wv v6, v6, v22
+ vwaddu.wv v7, v7, v23
+ vwaddu.wv v8, v8, v24
+ vwaddu.wv v9, v9, v25
+ vwaddu.wv v10, v10, v26
+ vwaddu.wv v11, v11, v27
+ vwaddu.wv v12, v12, v28
+ vwaddu.wv v13, v13, v29
+ vwaddu.wv v14, v14, v30
+ vwaddu.wv v15, v15, v31
+
+ vsetvli zero, zero, e16, mf2, ta, ma
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vmax.vx v\i, v\i, zero
+.endr
+
+ vsetvli zero, zero, e8, mf4, ta, ma
+
+ vnclipu.wi v16, v0, 0
+ vnclipu.wi v17, v1, 0
+ vnclipu.wi v18, v2, 0
+ vnclipu.wi v19, v3, 0
+ vnclipu.wi v20, v4, 0
+ vnclipu.wi v21, v5, 0
+ vnclipu.wi v22, v6, 0
+ vnclipu.wi v23, v7, 0
+ vnclipu.wi v24, v8, 0
+ vnclipu.wi v25, v9, 0
+ vnclipu.wi v26, v10, 0
+ vnclipu.wi v27, v11, 0
+ vnclipu.wi v28, v12, 0
+ vnclipu.wi v29, v13, 0
+ vnclipu.wi v30, v14, 0
+ vnclipu.wi v31, v15, 0
+
+ vse8.v v16, (a0)
+.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ add a0, a0, a1
+ vse8.v v\i, (a0)
+.endr
+
+ ret
+.endif
+endfunc
+
+function inv_txfm_\variant\()add_16x4_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 8
+ vle16.v v1, (t0)
+.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ addi t0, t0, 8
+ vle16.v v\i, (t0)
+.endr
+
+.ifc \variant, identity_
+ li t1, 2*(5793-4096)*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vsmul.vx v16, v\i, t1
+ vssra.vi v16, v16, 1
+ vsadd.vv v\i, v\i, v16
+.endr
+
+ j L(itx_16x4_epilog)
+.else
+ jalr t0, a4
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vssra.vi v\i, v\i, 1
+.endr
+
+L(itx_16x4_epilog):
+ li t0, 32
+ vssseg8e16.v v0, (a2), t0
+ addi t1, a2, 16
+ vssseg8e16.v v8, (t1), t0
+
+.irp j, 0, 8
+ vsetivli zero, 8, e16, m1, ta, ma
+
+ vmv.v.x v4, zero
+ addi t0, a2, \j*2
+ vle16.v v0, (t0)
+ vse16.v v4, (t0)
+.irp i, 1, 2, 3
+ addi t0, t0, 32
+ vle16.v v\i, (t0)
+ vse16.v v4, (t0)
+.endr
+
+ jalr t0, a5
+
+ vssra.vi v0, v0, 4
+ vssra.vi v1, v1, 4
+ vssra.vi v2, v2, 4
+ vssra.vi v3, v3, 4
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+ addi t0, a0, \j
+ vle8.v v4, (t0)
+ add t0, t0, a1
+ vle8.v v5, (t0)
+ add t0, t0, a1
+ vle8.v v6, (t0)
+ add t0, t0, a1
+ vle8.v v7, (t0)
+
+ vwaddu.wv v0, v0, v4
+ vwaddu.wv v1, v1, v5
+ vwaddu.wv v2, v2, v6
+ vwaddu.wv v3, v3, v7
+
+ vsetvli zero, zero, e16, m1, ta, ma
+ vmax.vx v0, v0, zero
+ vmax.vx v1, v1, zero
+ vmax.vx v2, v2, zero
+ vmax.vx v3, v3, zero
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+
+ vnclipu.wi v4, v0, 0
+ vnclipu.wi v5, v1, 0
+ vnclipu.wi v6, v2, 0
+ vnclipu.wi v7, v3, 0
+
+ addi t0, a0, \j
+ vse8.v v4, (t0)
+ add t0, t0, a1
+ vse8.v v5, (t0)
+ add t0, t0, a1
+ vse8.v v6, (t0)
+ add t0, t0, a1
+ vse8.v v7, (t0)
+.endr
+
+ ret
+.endif
+endfunc
+.endm
+
+def_fn_416_base identity_
+def_fn_416_base
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
+.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
+ la a4, inv_\txfm1\()_e16_x\w\()w_rvv
+.elseif \txfm1 != identity
+ la a4, inv_\txfm1\()_e16_x\w\()_rvv
+.endif
+.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
+ la a5, inv_\txfm2\()_e16_x\h\()w_rvv
+.else
+ la a5, inv_\txfm2\()_e16_x\h\()_rvv
+.endif
+.if \w == 4
+ li a6, \eob_half
+.endif
+.ifc \txfm1, identity
+ j inv_txfm_identity_add_\w\()x\h\()_rvv
+.else
+ j inv_txfm_add_\w\()x\h\()_rvv
+.endif
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_8x16_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 8, e16, m1, ta, ma
+
+ blt a3, a6, 1f
+
+ vmv.v.x v16, zero
+ addi t0, a2, 16
+ vle16.v v0, (t0)
+ vse16.v v16, (t0)
+.irp i, 1, 2, 3, 4, 5, 6, 7
+ addi t0, t0, 32
+ vle16.v v\i, (t0)
+ vse16.v v16, (t0)
+.endr
+
+ li t1, 2896*8
+.ifc \variant, identity_
+ vsmul.vx v8, v0, t1
+ vsmul.vx v9, v1, t1
+ vsmul.vx v10, v2, t1
+ vsmul.vx v11, v3, t1
+ vsmul.vx v12, v4, t1
+ vsmul.vx v13, v5, t1
+ vsmul.vx v14, v6, t1
+ vsmul.vx v15, v7, t1
+.else
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vsmul.vx v\i, v\i, t1
+.endr
+
+ jalr t0, a4
+
+ vssra.vi v8, v0, 1
+ vssra.vi v9, v1, 1
+ vssra.vi v10, v2, 1
+ vssra.vi v11, v3, 1
+ vssra.vi v12, v4, 1
+ vssra.vi v13, v5, 1
+ vssra.vi v14, v6, 1
+ vssra.vi v15, v7, 1
+.endif
+
+ j 2f
+
+1:
+.irp i, 8, 9, 10, 11, 12, 13, 14, 15
+ vmv.v.x v\i, zero
+.endr
+
+2:
+ vmv.v.x v16, zero
+ vle16.v v0, (a2)
+ vse16.v v16, (a2)
+ addi t0, a2, 32
+ vle16.v v1, (t0)
+ vse16.v v16, (t0)
+.irp i, 2, 3, 4, 5, 6, 7
+ addi t0, t0, 32
+ vle16.v v\i, (t0)
+ vse16.v v16, (t0)
+.endr
+
+ li t1, 2896*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vsmul.vx v\i, v\i, t1
+.endr
+
+.ifc \variant, identity_
+ j L(itx_8x16_epilog)
+.else
+ jalr t0, a4
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vssra.vi v\i, v\i, 1
+.endr
+
+L(itx_8x16_epilog):
+ addi t4, sp, -8*32
+ vsseg8e16.v v0, (t4)
+ addi t0, t4, 8*16
+ vsseg8e16.v v8, (t0)
+
+ mv t5, a0
+ li t6, 16
+ jal a7, inv_txfm_add_vert_8x16_rvv
+
+ ret
+.endif
+endfunc
+
+function inv_txfm_\variant\()add_16x8_rvv, export=1, ext=v
+ csrw vxrm, zero
+
+ vsetivli zero, 8, e16, m1, ta, ma
+ vle16.v v0, (a2)
+ addi t0, a2, 16
+ vle16.v v1, (t0)
+.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ addi t0, t0, 16
+ vle16.v v\i, (t0)
+.endr
+
+ li t1, 2896*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vsmul.vx v\i, v\i, t1
+.endr
+
+.ifc \variant, identity_
+ li t1, 2*(5793-4096)*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vsmul.vx v16, v\i, t1
+ vssra.vi v16, v16, 1
+ vsadd.vv v\i, v\i, v16
+.endr
+
+ j L(itx_16x8_epilog)
+.else
+ jalr t0, a4
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ vssra.vi v\i, v\i, 1
+.endr
+
+L(itx_16x8_epilog):
+ li t0, 32
+ vssseg8e16.v v0, (a2), t0
+ addi t1, a2, 16
+ vssseg8e16.v v8, (t1), t0
+
+.irp j, 0, 8
+ vsetivli zero, 8, e16, m1, ta, ma
+
+ vmv.v.x v8, zero
+ addi t0, a2, \j*2
+ vle16.v v0, (t0)
+ vse16.v v8, (t0)
+.irp i, 1, 2, 3, 4, 5, 6, 7
+ addi t0, t0, 32
+ vle16.v v\i, (t0)
+ vse16.v v8, (t0)
+.endr
+
+ jalr t0, a5
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vssra.vi v\i, v\i, 4
+.endr
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+ addi t0, a0, \j
+ vle8.v v8, (t0)
+.irp i, 9, 10, 11, 12, 13, 14, 15
+ add t0, t0, a1
+ vle8.v v\i, (t0)
+.endr
+
+ vwaddu.wv v0, v0, v8
+ vwaddu.wv v1, v1, v9
+ vwaddu.wv v2, v2, v10
+ vwaddu.wv v3, v3, v11
+ vwaddu.wv v4, v4, v12
+ vwaddu.wv v5, v5, v13
+ vwaddu.wv v6, v6, v14
+ vwaddu.wv v7, v7, v15
+
+ vsetvli zero, zero, e16, m1, ta, ma
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+ vmax.vx v\i, v\i, zero
+.endr
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+
+ vnclipu.wi v8, v0, 0
+ vnclipu.wi v9, v1, 0
+ vnclipu.wi v10, v2, 0
+ vnclipu.wi v11, v3, 0
+ vnclipu.wi v12, v4, 0
+ vnclipu.wi v13, v5, 0
+ vnclipu.wi v14, v6, 0
+ vnclipu.wi v15, v7, 0
+
+ addi t0, a0, \j
+ vse8.v v8, (t0)
+.irp i, 9, 10, 11, 12, 13, 14, 15
+ add t0, t0, a1
+ vse8.v v\i, (t0)
+.endr
+.endr
+
+ ret
+.endif
+endfunc
+.endm
+
+def_fn_816_base identity_
+def_fn_816_base
+
+.macro def_fn_816 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
+.ifnc \txfm1, identity
+ la a4, inv_\txfm1\()_e16_x\w\()_rvv
+.endif
+ la a5, inv_\txfm2\()_e16_x\h\()_rvv
+.if \w == 8
+ li a6, \eob_half
+.endif
+.ifc \txfm1, identity
+ j inv_txfm_identity_add_\w\()x\h\()_rvv
+.else
+ j inv_txfm_add_\w\()x\h\()_rvv
+.endif
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct, 43
+def_fn_816 \w, \h, identity, identity, 43
+def_fn_816 \w, \h, dct, adst, 43
+def_fn_816 \w, \h, dct, flipadst, 43
+def_fn_816 \w, \h, dct, identity, 8
+def_fn_816 \w, \h, adst, dct, 43
+def_fn_816 \w, \h, adst, adst, 43
+def_fn_816 \w, \h, adst, flipadst, 43
+def_fn_816 \w, \h, flipadst, dct, 43
+def_fn_816 \w, \h, flipadst, adst, 43
+def_fn_816 \w, \h, flipadst, flipadst, 43
+def_fn_816 \w, \h, identity, dct, 64
+def_fn_816 \w, \h, adst, identity, 8
+def_fn_816 \w, \h, flipadst, identity, 8
+def_fn_816 \w, \h, identity, adst, 64
+def_fn_816 \w, \h, identity, flipadst, 64
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
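
The rectangular RVV transforms above skip work based on an `eob_half` threshold (loaded into `a6`/`a7` by the `def_fn_*` wrappers): when the end-of-block index stays below the threshold, the second group of input rows is known to be all zero, so it is cleared or skipped instead of being transformed. A toy C model of that gate, with hypothetical names and a 16x16 layout assumed:

#include <stdint.h>

/* Toy model of the eob gate used by the 16x16 and rectangular transforms:
 * the second half of the horizontal pass only runs when the end-of-block
 * index says it can contain non-zero coefficients. */
static void itx_first_pass(int16_t *const coeffs, const int eob, const int eob_half,
                           void (*horz_pass)(int16_t *half),
                           void (*zero_half)(int16_t *half))
{
    if (eob >= eob_half)
        horz_pass(coeffs + 8 * 16); /* upper half holds coefficients */
    else
        zero_half(coeffs + 8 * 16); /* known zero: just clear the buffer */
    horz_pass(coeffs);              /* lower half is always processed */
}
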
diff --git a/third_party/dav1d/src/riscv/asm.S b/third_party/dav1d/src/riscv/asm.S
index 2435170acb..eed4d67bf5 100644
--- a/third_party/dav1d/src/riscv/asm.S
+++ b/third_party/dav1d/src/riscv/asm.S
@@ -123,4 +123,6 @@ EXTERN\name:
end_thread_local
.endm
+#define L(x) .L ## x
+
#endif /* DAV1D_SRC_RISCV_ASM_S */
diff --git a/third_party/dav1d/src/riscv/itx.h b/third_party/dav1d/src/riscv/itx.h
index 28c5e54d42..d3f9a03a03 100644
--- a/third_party/dav1d/src/riscv/itx.h
+++ b/third_party/dav1d/src/riscv/itx.h
@@ -58,7 +58,13 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
#define decl_itx_fns(ext) \
decl_itx17_fns( 4, 4, ext); \
+decl_itx16_fns( 4, 8, ext); \
+decl_itx16_fns( 4, 16, ext); \
+decl_itx16_fns( 8, 4, ext); \
decl_itx16_fns( 8, 8, ext); \
+decl_itx16_fns( 8, 16, ext); \
+decl_itx16_fns(16, 4, ext); \
+decl_itx16_fns(16, 8, ext); \
decl_itx16_fns(16, 16, ext)
decl_itx_fns(rvv);
@@ -105,7 +111,13 @@ static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, in
#if BITDEPTH == 8
assign_itx17_fn( , 4, 4, rvv);
+ assign_itx16_fn(R, 4, 8, rvv);
+ assign_itx16_fn(R, 4, 16, rvv);
+ assign_itx16_fn(R, 8, 4, rvv);
assign_itx16_fn( , 8, 8, rvv);
+ assign_itx16_fn(R, 8, 16, rvv);
+ assign_itx16_fn(R, 16, 4, rvv);
+ assign_itx16_fn(R, 16, 8, rvv);
assign_itx12_fn( , 16, 16, rvv);
#endif
}
diff --git a/third_party/dav1d/src/x86/cdef_avx2.asm b/third_party/dav1d/src/x86/cdef_avx2.asm
index 1f30f8a3b7..95d35fc1c8 100644
--- a/third_party/dav1d/src/x86/cdef_avx2.asm
+++ b/third_party/dav1d/src/x86/cdef_avx2.asm
@@ -398,7 +398,6 @@ SECTION .text
INIT_YMM avx2
cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
pri, sec, dir, damping, edge
-%assign stack_offset_entry stack_offset
mov edged, edgem
cmp edged, 0xf
jne .border_block
@@ -1195,9 +1194,9 @@ cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
.border_block:
DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
-%define rstk rsp
-%assign stack_offset stack_offset_entry
-%assign regs_used 11
+ RESET_STACK_STATE
+ %assign stack_offset stack_offset - (regs_used - 11) * gprsize
+ %assign regs_used 11
ALLOC_STACK 2*16+(%2+4)*32, 16
%define px rsp+2*16+2*32
diff --git a/third_party/dav1d/src/x86/filmgrain16_avx2.asm b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
index a1d4c41f27..eda6035923 100644
--- a/third_party/dav1d/src/x86/filmgrain16_avx2.asm
+++ b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
@@ -646,18 +646,9 @@ INIT_XMM avx2
INIT_YMM avx2
.ar2:
%if WIN64
- ; xmm6 and xmm7 already saved
- %assign xmm_regs_used 13 + %2
%assign stack_size_padded 136
SUB rsp, stack_size_padded
- movaps [rsp+16*2], xmm8
- movaps [rsp+16*3], xmm9
- movaps [rsp+16*4], xmm10
- movaps [rsp+16*5], xmm11
- movaps [rsp+16*6], xmm12
-%if %2
- movaps [rsp+16*7], xmm13
-%endif
+ WIN64_PUSH_XMM 13 + %2, 8
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
@@ -747,20 +738,10 @@ INIT_YMM avx2
.ar3:
%if WIN64
- ; xmm6 and xmm7 already saved
%assign stack_offset 32
- %assign xmm_regs_used 14 + %2
%assign stack_size_padded 152
SUB rsp, stack_size_padded
- movaps [rsp+16*2], xmm8
- movaps [rsp+16*3], xmm9
- movaps [rsp+16*4], xmm10
- movaps [rsp+16*5], xmm11
- movaps [rsp+16*6], xmm12
- movaps [rsp+16*7], xmm13
-%if %2
- movaps [rsp+16*8], xmm14
-%endif
+ WIN64_PUSH_XMM 14 + %2, 8
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
diff --git a/third_party/dav1d/src/x86/filmgrain16_sse.asm b/third_party/dav1d/src/x86/filmgrain16_sse.asm
index 6b0daaac0b..25d01caa19 100644
--- a/third_party/dav1d/src/x86/filmgrain16_sse.asm
+++ b/third_party/dav1d/src/x86/filmgrain16_sse.asm
@@ -275,7 +275,6 @@ cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
.ar2:
%if ARCH_X86_32
-%assign stack_offset_old stack_offset
ALLOC_STACK -16*8
%endif
DEFINE_ARGS buf, fg_data, bdmax, shift
@@ -428,7 +427,6 @@ cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
%elif ARCH_X86_64
%define tmp rsp+stack_offset-72
%else
-%assign stack_offset stack_offset_old
ALLOC_STACK -16*12
%define tmp rsp
mov bdmaxd, bdmaxm
@@ -715,7 +713,6 @@ cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
%else
DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
-%assign stack_offset_old stack_offset
ALLOC_STACK -16*2
mov bufyq, r1m
mov uvd, r3m
@@ -831,9 +828,7 @@ cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
%if ARCH_X86_64
DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x
%else
-%assign stack_offset stack_offset_old
-%xdefine rstk rsp
-%assign stack_size_padded 0
+ RESET_STACK_STATE
DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3
mov bufyq, r1m
mov uvd, r3m
@@ -1159,7 +1154,6 @@ cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
%endif
%else
DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
-%assign stack_offset stack_offset_old
ALLOC_STACK -16*14
mov bufyq, r1m
mov uvd, r3m
diff --git a/third_party/dav1d/src/x86/filmgrain_avx2.asm b/third_party/dav1d/src/x86/filmgrain_avx2.asm
index 55445cf593..91d8ca5c14 100644
--- a/third_party/dav1d/src/x86/filmgrain_avx2.asm
+++ b/third_party/dav1d/src/x86/filmgrain_avx2.asm
@@ -204,18 +204,9 @@ cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
.ar2:
%if WIN64
- ; xmm6 and xmm7 already saved
- %assign xmm_regs_used 16
%assign stack_size_padded 168
SUB rsp, stack_size_padded
- movaps [rsp+16*2], xmm8
- movaps [rsp+16*3], xmm9
- movaps [rsp+16*4], xmm10
- movaps [rsp+16*5], xmm11
- movaps [rsp+16*6], xmm12
- movaps [rsp+16*7], xmm13
- movaps [rsp+16*8], xmm14
- movaps [rsp+16*9], xmm15
+ WIN64_PUSH_XMM 16, 8
%endif
DEFINE_ARGS buf, fg_data, h, x
mov r6d, [fg_dataq+FGData.ar_coeff_shift]
@@ -287,15 +278,9 @@ cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
INIT_YMM avx2
.ar3:
%if WIN64
- ; xmm6 and xmm7 already saved
- %assign stack_offset 16
ALLOC_STACK 16*14
%assign stack_size stack_size - 16*4
- %assign xmm_regs_used 12
- movaps [rsp+16*12], xmm8
- movaps [rsp+16*13], xmm9
- movaps [rsp+16*14], xmm10
- movaps [rsp+16*15], xmm11
+ WIN64_PUSH_XMM 12, 8
%else
ALLOC_STACK 16*12
%endif
diff --git a/third_party/dav1d/src/x86/filmgrain_sse.asm b/third_party/dav1d/src/x86/filmgrain_sse.asm
index 0172f98760..d06e349a8c 100644
--- a/third_party/dav1d/src/x86/filmgrain_sse.asm
+++ b/third_party/dav1d/src/x86/filmgrain_sse.asm
@@ -232,7 +232,6 @@ cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
.ar2:
%if ARCH_X86_32
-%assign stack_offset_old stack_offset
ALLOC_STACK -16*8
%endif
DEFINE_ARGS buf, fg_data, shift
@@ -333,7 +332,6 @@ cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
.ar3:
DEFINE_ARGS buf, fg_data, shift
%if ARCH_X86_32
-%assign stack_offset stack_offset_old
ALLOC_STACK -16*14
%elif WIN64
SUB rsp, 16*6
@@ -601,7 +599,6 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
movifnidn bufyq, bufymp
%if ARCH_X86_32
-%assign stack_offset_old stack_offset
ALLOC_STACK -2*16
%endif
imul uvd, 28
@@ -738,9 +735,7 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat
.ar1:
%if ARCH_X86_32
-%assign stack_offset stack_offset_old
-%assign stack_size_padded 0
-%xdefine rstk rsp
+ RESET_STACK_STATE
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
imul uvd, 28
@@ -881,9 +876,6 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat
.ar2:
%if ARCH_X86_32
-%assign stack_offset stack_offset_old
-%assign stack_size_padded 0
-%xdefine rstk rsp
ALLOC_STACK -8*16
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
@@ -1014,9 +1006,7 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat
.ar3:
%if ARCH_X86_32
-%assign stack_offset stack_offset_old
-%assign stack_size_padded 0
-%xdefine rstk rsp
+ RESET_STACK_STATE
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
movifnidn bufyq, bufymp
diff --git a/third_party/dav1d/src/x86/ipred16_avx2.asm b/third_party/dav1d/src/x86/ipred16_avx2.asm
index f4931e977b..7b52abaa10 100644
--- a/third_party/dav1d/src/x86/ipred16_avx2.asm
+++ b/third_party/dav1d/src/x86/ipred16_avx2.asm
@@ -946,7 +946,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
jg .w4_loop
RET
.w8:
-%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 12
vpbroadcastw m0, [tlq] ; bottom
vbroadcasti128 m7, [tlq+hq*2+2]
@@ -974,7 +973,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
jg .w8_loop
RET
.w16:
-%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 11
vpbroadcastw m0, [tlq] ; bottom
movu m7, [tlq+hq*2+2]
@@ -1005,7 +1003,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
jg .w16_loop
RET
.w32:
-%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 15
vpbroadcastw m0, [tlq] ; bottom
movu m7, [tlq+hq*2+ 2]
@@ -1047,7 +1044,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
jg .w32_loop
RET
.w64:
-%assign stack_offset stack_offset - stack_size_padded
PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base
mov dst_baseq, dstq
mov tl_baseq, tlq
@@ -1104,7 +1100,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
RET
cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
- %assign org_stack_offset stack_offset
lea r6, [ipred_z1_16bpc_avx2_table]
tzcnt wd, wm
movifnidn angled, anglem
@@ -1312,7 +1307,6 @@ ALIGN function_align
.w4_end:
RET
.w8:
- %assign stack_offset org_stack_offset
ALLOC_STACK -64, 7
lea r3d, [angleq+216]
mov r3b, hb
@@ -1476,7 +1470,6 @@ ALIGN function_align
or maxbased, 16 ; imin(h+15, 31)
jmp .w16_main
.w16:
- %assign stack_offset org_stack_offset
ALLOC_STACK -96, 7
lea maxbased, [hq+15]
test angled, 0x400
@@ -1622,7 +1615,6 @@ ALIGN function_align
.w16_end:
RET
.w32:
- %assign stack_offset org_stack_offset
ALLOC_STACK -160, 8
lea maxbased, [hq+31]
mov r3d, 63
@@ -1737,7 +1729,6 @@ ALIGN function_align
.w32_end:
RET
.w64:
- %assign stack_offset org_stack_offset
ALLOC_STACK -256, 10
lea maxbased, [hq+63]
test angled, 0x400
@@ -2691,7 +2682,6 @@ ALIGN function_align
jmp .w32_filter_above
cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
- %assign org_stack_offset stack_offset
lea r6, [ipred_z3_16bpc_avx2_table]
tzcnt hd, hm
movifnidn angled, anglem
@@ -2907,7 +2897,6 @@ ALIGN function_align
RET
.h8:
lea r4d, [angleq+216]
- %assign stack_offset org_stack_offset
ALLOC_STACK -64, 8
mov r4b, wb
lea r7, [strideq*3]
@@ -3155,7 +3144,6 @@ ALIGN function_align
jmp .h16_main
ALIGN function_align
.h16:
- %assign stack_offset org_stack_offset
ALLOC_STACK -96, 10
lea maxbased, [wq+15]
lea r7, [strideq*3]
@@ -3372,7 +3360,6 @@ ALIGN function_align
.h16_end:
RET
.h32:
- %assign stack_offset org_stack_offset
ALLOC_STACK -160, 9
lea maxbased, [wq+31]
and maxbased, 31
@@ -3557,7 +3544,6 @@ ALIGN function_align
.h32_end:
RET
.h64:
- %assign stack_offset org_stack_offset
ALLOC_STACK -256, 10
lea maxbased, [wq+63]
test angled, 0x400
@@ -3804,7 +3790,6 @@ ALIGN function_align
; 5 8 8 i
cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
-%assign org_stack_offset stack_offset
%define base r6-ipred_filter_16bpc_avx2_table
lea r6, [filter_intra_taps]
tzcnt wd, wm
@@ -3846,7 +3831,6 @@ cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
RET
ALIGN function_align
.w8:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 16
vbroadcasti128 m14, [base+filter_shuf3]
vpbroadcastw m15, r8m ; bitdepth_max
@@ -3883,7 +3867,6 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
- %assign stack_offset stack_offset - stack_size_padded
ALLOC_STACK 32, 16
vpbroadcastw m15, r8m ; bitdepth_max
sub hd, 2
@@ -3977,7 +3960,6 @@ ALIGN function_align
ret
ALIGN function_align
.w32:
- %assign stack_offset org_stack_offset
ALLOC_STACK 64, 16
vpbroadcastw m15, r8m ; bitdepth_max
sub hd, 2
diff --git a/third_party/dav1d/src/x86/ipred_avx2.asm b/third_party/dav1d/src/x86/ipred_avx2.asm
index 58e40935ac..35738e7c0b 100644
--- a/third_party/dav1d/src/x86/ipred_avx2.asm
+++ b/third_party/dav1d/src/x86/ipred_avx2.asm
@@ -772,7 +772,6 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 6
movu m3, [tlq+1]
punpcklbw m2, m3, m5
@@ -823,29 +822,17 @@ ALIGN function_align
jl .w64_loop
RET
-%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used
- %assign stack_offset 0
- %assign stack_size_padded 0
- %assign regs_used %2
- %xdefine rstk rsp
- SETUP_STACK_POINTER %1
- %if regs_used != %2 && WIN64
- PUSH r%2
- %endif
- ALLOC_STACK %1, %3
-%endmacro
-
cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
-%define base r6-ipred_smooth_h_avx2_table
- lea r6, [ipred_smooth_h_avx2_table]
+%define base r5-ipred_smooth_h_avx2_table
+ lea r5, [ipred_smooth_h_avx2_table]
mov wd, wm
vpbroadcastb m3, [tlq+wq] ; right
tzcnt wd, wd
mov hd, hm
- movsxd wq, [r6+wq*4]
+ movsxd wq, [r5+wq*4]
vpbroadcastd m4, [base+pb_127_m127]
vpbroadcastd m5, [base+pw_128]
- add wq, r6
+ add wq, r5
jmp wq
.w4:
WIN64_SPILL_XMM 8
@@ -891,7 +878,6 @@ cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
RET
ALIGN function_align
.w8:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
vbroadcasti128 m6, [base+smooth_weights+8*2]
mova m7, [base+ipred_h_shuf]
@@ -927,7 +913,7 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
- SETUP_STACK_FRAME 32*4, 7, 8
+ ALLOC_STACK 32*4, 8
lea r3, [rsp+64*2-4]
     call .prep                   ; only worthwhile for w16 and above
sub tlq, 2
@@ -951,7 +937,7 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
- SETUP_STACK_FRAME 32*4, 7, 6
+ ALLOC_STACK 32*4
lea r3, [rsp+64*2-2]
call .prep
dec tlq
@@ -971,19 +957,19 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
- SETUP_STACK_FRAME 32*4, 7, 9
+ ALLOC_STACK 32*4, 9
lea r3, [rsp+64*2-2]
call .prep
- add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table
+ add r5, smooth_weights+16*15-ipred_smooth_h_avx2_table
dec tlq
- mova xm5, [r6-16*7]
- vinserti128 m5, [r6-16*5], 1
- mova xm6, [r6-16*6]
- vinserti128 m6, [r6-16*4], 1
- mova xm7, [r6-16*3]
- vinserti128 m7, [r6-16*1], 1
- mova xm8, [r6-16*2]
- vinserti128 m8, [r6-16*0], 1
+ mova xm5, [r5-16*7]
+ vinserti128 m5, [r5-16*5], 1
+ mova xm6, [r5-16*6]
+ vinserti128 m6, [r5-16*4], 1
+ mova xm7, [r5-16*3]
+ vinserti128 m7, [r5-16*1], 1
+ mova xm8, [r5-16*2]
+ vinserti128 m8, [r5-16*0], 1
.w64_loop:
vpbroadcastb m2, [tlq+hq]
punpcklbw m2, m3
@@ -1113,7 +1099,6 @@ cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights
RET
ALIGN function_align
.w8:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 12
mova m10, [base+ipred_h_shuf]
vbroadcasti128 m11, [base+smooth_weights+8*2]
@@ -1157,7 +1142,9 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
- SETUP_STACK_FRAME 32*4, 7, 14
+ %assign regs_used 4
+ ALLOC_STACK -32*4, 14
+ %assign regs_used 7
vbroadcasti128 m11, [tlq+1]
lea r3, [rsp+64*2-4]
punpcklbw m10, m11, m0 ; top, bottom
@@ -1197,7 +1184,9 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
- SETUP_STACK_FRAME 32*4, 7, 11
+ %assign regs_used 4
+ ALLOC_STACK -32*4, 11
+ %assign regs_used 7
movu m8, [tlq+1]
lea r3, [rsp+64*2-2]
punpcklbw m7, m8, m0
@@ -1232,7 +1221,9 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
- SETUP_STACK_FRAME 32*8, 7, 16
+ %assign regs_used 4
+ ALLOC_STACK -32*8, 16
+ %assign regs_used 7
movu m13, [tlq+1 ]
movu m15, [tlq+33]
add r6, smooth_weights+16*15-ipred_smooth_avx2_table
@@ -1316,7 +1307,6 @@ ALIGN function_align
ret
cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
- %assign org_stack_offset stack_offset
lea r6, [ipred_z1_avx2_table]
tzcnt wd, wm
movifnidn angled, anglem
@@ -1415,7 +1405,6 @@ ALIGN function_align
pmovmskb r5d, m1
ret
.w4_no_upsample:
- %assign stack_offset org_stack_offset
ALLOC_STACK -16, 11
mov maxbased, 7
test angled, 0x400 ; !enable_intra_edge_filter
@@ -1522,7 +1511,6 @@ ALIGN function_align
mov r3b, hb
cmp r3d, 8
ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
- %assign stack_offset org_stack_offset
ALLOC_STACK -32, 8
movu xm2, [z_filter_s+6]
mova xm0, [tlq-1]
@@ -1592,7 +1580,6 @@ ALIGN function_align
or maxbased, 8 ; imin(h+7, 15)
jmp .w8_main
.w8_no_upsample:
- %assign stack_offset org_stack_offset
ALLOC_STACK -32, 10
lea maxbased, [hq+7]
test angled, 0x400
@@ -1696,7 +1683,6 @@ ALIGN function_align
jmp .w16_main
ALIGN function_align
.w16:
- %assign stack_offset org_stack_offset
ALLOC_STACK -64, 12
lea maxbased, [hq+15]
test angled, 0x400
@@ -1816,7 +1802,6 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
- %assign stack_offset org_stack_offset
ALLOC_STACK -96, 15
lea r3d, [hq+31]
mov maxbased, 63
@@ -1960,7 +1945,6 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
- %assign stack_offset org_stack_offset
ALLOC_STACK -128, 16
lea maxbased, [hq+63]
test angled, 0x400 ; !enable_intra_edge_filter
@@ -3001,7 +2985,6 @@ ALIGN function_align
jmp .w32_filter_above
cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
- %assign org_stack_offset stack_offset
lea r6, [ipred_z3_avx2_table]
tzcnt hd, hm
movifnidn angled, anglem
@@ -3102,7 +3085,6 @@ ALIGN function_align
pmovmskb r5d, m1
ret
.h4_no_upsample:
- %assign stack_offset org_stack_offset
ALLOC_STACK -16, 12
mov maxbased, 7
test angled, 0x400 ; !enable_intra_edge_filter
@@ -3215,7 +3197,6 @@ ALIGN function_align
mov r4b, wb
cmp r4d, 8
ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
- %assign stack_offset org_stack_offset
ALLOC_STACK -32, 8
and r4d, 4
mova xm0, [tlq-15]
@@ -3297,7 +3278,6 @@ ALIGN function_align
or maxbased, 8 ; imin(w+7, 15)
jmp .h8_main
.h8_no_upsample:
- %assign stack_offset org_stack_offset
ALLOC_STACK -32, 10
lea maxbased, [wq+7]
test angled, 0x400
@@ -3455,7 +3435,6 @@ ALIGN function_align
jmp .h16_main
ALIGN function_align
.h16:
- %assign stack_offset org_stack_offset
ALLOC_STACK -64, 12
lea maxbased, [wq+15]
test angled, 0x400
@@ -3661,7 +3640,6 @@ ALIGN function_align
RET
ALIGN function_align
.h32:
- %assign stack_offset org_stack_offset
ALLOC_STACK -96, 15
lea maxbased, [wq+31]
and maxbased, 31
@@ -3890,7 +3868,6 @@ ALIGN function_align
RET
ALIGN function_align
.h64:
- %assign stack_offset org_stack_offset
ALLOC_STACK -128, 16
lea maxbased, [wq+63]
test angled, 0x400 ; !enable_intra_edge_filter
@@ -4221,6 +4198,7 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
movzx filterd, byte filterm
%endif
shl filterd, 6
+ WIN64_SPILL_XMM 9, 15
add filterq, r6
lea r6, [ipred_filter_avx2_table]
movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4
@@ -4234,7 +4212,6 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
mov hd, hm
jmp wq
.w4:
- WIN64_SPILL_XMM 9
mova xm8, [base+filter_shuf2]
sub tlq, 3
sub tlq, hq
@@ -4251,8 +4228,7 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
RET
ALIGN function_align
.w8:
- %assign stack_offset stack_offset - stack_size_padded
- WIN64_SPILL_XMM 10
+ WIN64_PUSH_XMM 10
mova m8, [base+filter_shuf1]
FILTER_XMM 7, 0, 6, [base+filter_shuf2]
vpbroadcastd m0, [tlq+4]
@@ -4278,26 +4254,18 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
-%if WIN64
- %assign stack_offset stack_offset - stack_size_padded
- %assign xmm_regs_used 15
- %assign stack_size_padded 0x98
- SUB rsp, stack_size_padded
-%endif
sub hd, 2
- TAIL_CALL .w16_main, 0
-.w16_main:
+ call .w16_main
%if WIN64
- movaps [rsp+0xa8], xmm6
- movaps [rsp+0xb8], xmm7
- movaps [rsp+0x28], xmm8
- movaps [rsp+0x38], xmm9
- movaps [rsp+0x48], xmm10
- movaps [rsp+0x58], xmm11
- movaps [rsp+0x68], xmm12
- movaps [rsp+0x78], xmm13
- movaps [rsp+0x88], xmm14
+ jmp .end
+%else
+ RET
%endif
+.w16_main:
+    ; The spills are into the caller's stack frame
+ %assign stack_size stack_size + gprsize
+ WIN64_PUSH_XMM 15, 9
+ %assign stack_size stack_size - gprsize
FILTER_XMM 12, 0, 7, [base+filter_shuf2]
vpbroadcastd m0, [tlq+5]
vpblendd m0, [tlq-12], 0x14
@@ -4350,7 +4318,6 @@ ALIGN function_align
ret
ALIGN function_align
.w32:
- sub rsp, stack_size_padded
sub hd, 2
lea r3, [dstq+16]
lea r5d, [hq-2]
@@ -4415,6 +4382,7 @@ ALIGN function_align
shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
mova [dstq+strideq*0], xm0
mova [dstq+strideq*1], xm6
+.end:
RET
ALIGN function_align
.main:
diff --git a/third_party/dav1d/src/x86/ipred_sse.asm b/third_party/dav1d/src/x86/ipred_sse.asm
index 976f33a24b..f6b0cad001 100644
--- a/third_party/dav1d/src/x86/ipred_sse.asm
+++ b/third_party/dav1d/src/x86/ipred_sse.asm
@@ -670,10 +670,7 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
-%if WIN64
- movaps [rsp+24], xmm7
- %define xmm_regs_used 8
-%endif
+ WIN64_PUSH_XMM 8, 7
mova m7, m5
.w32_loop_init:
mov r3d, 2
@@ -705,10 +702,7 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
-%if WIN64
- movaps [rsp+24], xmm7
- %define xmm_regs_used 8
-%endif
+ WIN64_PUSH_XMM 8, 7
mova m7, m5
.w64_loop_init:
mov r3d, 4
diff --git a/third_party/dav1d/src/x86/looprestoration_sse.asm b/third_party/dav1d/src/x86/looprestoration_sse.asm
index 01eb6fa348..b5c73a51d4 100644
--- a/third_party/dav1d/src/x86/looprestoration_sse.asm
+++ b/third_party/dav1d/src/x86/looprestoration_sse.asm
@@ -42,7 +42,6 @@ pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pb_right_ext_mask: times 24 db 0xff
times 8 db 0
pb_1: times 16 db 1
-pb_3: times 16 db 3
pw_256: times 8 dw 256
pw_2056: times 8 dw 2056
pw_m16380: times 8 dw -16380
@@ -290,7 +289,7 @@ cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstrid
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
jmp .v1
.extend_right:
- movd m2, [lpfq-4]
+ movd m2, [lpfq-1]
%if ARCH_X86_64
push r0
lea r0, [pb_right_ext_mask+21]
@@ -302,10 +301,11 @@ cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstrid
movu m1, [r6+xq+8]
%endif
%if cpuflag(ssse3)
- pshufb m2, [base+pb_3]
+ pxor m3, m3
+ pshufb m2, m3
%else
punpcklbw m2, m2
- pshuflw m2, m2, q3333
+ pshuflw m2, m2, q0000
punpcklqdq m2, m2
%endif
pand m4, m0
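
Note on the .extend_right change above: the broadcast source pixel is unchanged (the last pixel before lpfq); only the load address moves, so the dword load now starts at that pixel instead of three bytes before it, and the broadcast uses pshufb against a zeroed register (or pshuflw q0000 without SSSE3). A minimal scalar sketch of the intended result, with illustrative names that are not part of dav1d's API:

    #include <stdint.h>
    #include <string.h>

    /* Replicate the last valid pixel of a row into the padding to its right,
     * which is what the SIMD code computes for the trailing partial vector. */
    static void extend_right(uint8_t *const row, const int valid_w,
                             const int padded_w)
    {
        memset(row + valid_w, row[valid_w - 1], (size_t)(padded_w - valid_w));
    }
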
diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm
index 61eeaa1007..42e2a5525e 100644
--- a/third_party/dav1d/src/x86/mc16_avx2.asm
+++ b/third_party/dav1d/src/x86/mc16_avx2.asm
@@ -1337,7 +1337,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
cmp wd, 4
je .h_w4
jl .h_w2
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 13
shr mxd, 16
sub srcq, 6
@@ -1415,7 +1414,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
cmp hd, 4
cmovle myd, mxd
vpbroadcastq m0, [base+subpel_filters+myq*8]
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 15
vpbroadcastd m6, [pd_32]
vpbroadcastw m7, r8m
@@ -1590,7 +1588,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
jg .v_w8_loop0
RET
.hv:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 16
vpbroadcastw m15, r8m
cmp wd, 4
@@ -2046,7 +2043,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
shr mxd, 16
sub srcq, 6
vpbroadcastq m0, [base+subpel_filters+mxq*8]
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 12
vbroadcasti128 m6, [subpel_h_shufA]
vbroadcasti128 m7, [subpel_h_shufB]
@@ -2125,7 +2121,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
cmp hd, 4
cmovle myd, mxd
vpbroadcastq m0, [base+subpel_filters+myq*8]
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 15
vpbroadcastd m7, [prep_8tap_1d_rnd]
lea r6, [strideq*3]
@@ -2264,7 +2259,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
%endif
RET
.hv:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 16
vpbroadcastd m15, [prep_8tap_2d_rnd]
cmp wd, 4
diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm
index 585ba53e08..e5de7ecd96 100644
--- a/third_party/dav1d/src/x86/mc16_avx512.asm
+++ b/third_party/dav1d/src/x86/mc16_avx512.asm
@@ -2377,7 +2377,6 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my
jg .hv_w16_loop
RET
.hv_w32:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 32
vbroadcasti32x4 m20, [spel_h_shufA]
vbroadcasti32x4 m21, [spel_h_shufB]
@@ -3175,7 +3174,6 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3
jg .hv_w8_loop
RET
.hv_w16:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 27
vbroadcasti32x8 m5, [srcq+strideq*0+ 8]
vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0
@@ -3313,7 +3311,6 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3
RET
.hv_w32:
%if WIN64
- %assign stack_offset stack_offset - stack_size_padded
PUSH r8
%assign regs_used regs_used + 1
WIN64_SPILL_XMM 32
diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm
index fde8e372a3..b0c42597f7 100644
--- a/third_party/dav1d/src/x86/mc16_sse.asm
+++ b/third_party/dav1d/src/x86/mc16_sse.asm
@@ -1302,10 +1302,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
jg .h_w4_loop
RET
.h_w8:
-%if WIN64
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 12
-%endif
shr mxd, 16
movq m3, [base+subpel_filters+mxq*8]
movifnidn dstq, dstmp
@@ -1383,14 +1380,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
cmp hd, 6
cmovb myd, mxd
movq m3, [base+subpel_filters+myq*8]
-%if STACK_ALIGNMENT < 16
- %xdefine rstk rsp
-%else
- %assign stack_offset stack_offset - stack_size_padded
-%endif
-%if WIN64
WIN64_SPILL_XMM 15
-%endif
movd m7, r8m
movifnidn dstq, dstmp
movifnidn dsq, dsmp
@@ -1604,11 +1594,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
jg .v_w4_loop0
RET
.hv:
-%if STACK_ALIGNMENT < 16
- %xdefine rstk rsp
-%else
- %assign stack_offset stack_offset - stack_size_padded
-%endif
+ RESET_STACK_STATE
%if ARCH_X86_32
movd m4, r8m
mova m6, [base+pd_512]
@@ -1750,11 +1736,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
cmovb myd, mxd
movq m3, [base+subpel_filters+myq*8]
%if ARCH_X86_32
-%if STACK_ALIGNMENT < 16
- %xdefine rstk rsp
-%else
- %assign stack_offset stack_offset - stack_size_padded
-%endif
+ RESET_STACK_STATE
mov dstq, dstmp
mov dsq, dsmp
mova m0, [base+spel_h_shufA]
@@ -2182,11 +2164,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
cmp hd, 4
cmove myd, mxd
movq m3, [base+subpel_filters+myq*8]
-%if STACK_ALIGNMENT < 16
- %xdefine rstk rsp
-%else
- %assign stack_offset stack_offset - stack_size_padded
-%endif
WIN64_SPILL_XMM 15
movddup m7, [base+prep_8tap_1d_rnd]
movifnidn ssq, r2mp
@@ -2339,11 +2316,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
jg .v_loop0
RET
.hv:
-%if STACK_ALIGNMENT < 16
- %xdefine rstk rsp
-%else
- %assign stack_offset stack_offset - stack_size_padded
-%endif
+ RESET_STACK_STATE
movzx t3d, mxb
shr mxd, 16
cmp wd, 4
diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm
index 3b208033bd..58e3cb5af1 100644
--- a/third_party/dav1d/src/x86/mc_avx2.asm
+++ b/third_party/dav1d/src/x86/mc_avx2.asm
@@ -1259,7 +1259,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 7
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
shl mxyd, 11
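
The rearrangement in the comment above is exact under flooring (arithmetic) right shift, because the 16*src[x] term contributes exactly src[x] to the division by 16. A small standalone self-check, intended only as an illustration of the identity (it assumes arithmetic right shift for negative values):

    #include <assert.h>

    /* (16*a + d + 8) >> 4 == a + ((d + 8) >> 4) for any ints a and d,
     * given arithmetic right shift (i.e. floor division by 16). */
    int main(void)
    {
        for (int a = 0; a < 256; a++)
            for (int d = -16 * 255; d <= 16 * 255; d++)
                assert(((16 * a + d + 8) >> 4) == a + ((d + 8) >> 4));
        return 0;
    }
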
@@ -1620,7 +1619,6 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .h_loop
RET
.v:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 16
movzx mxd, myb
shr myd, 16
@@ -1834,7 +1832,6 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .v_w16_loop0
RET
.hv:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 16
cmp wd, 4
jg .hv_w8
@@ -2247,7 +2244,6 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
jg .h_loop
RET
.v:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 16
movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
shr myd, 16 ; Note that the code is 8-tap only, having
@@ -2430,8 +2426,6 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
jg .v_w16_loop0
RET
.hv:
- %assign stack_offset stack_offset - stack_size_padded
- %assign stack_size_padded 0
WIN64_SPILL_XMM 16
cmp wd, 4
je .hv_w4
@@ -4108,10 +4102,9 @@ cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
beta, filter, tmp1, delta, my, gamma
%if WIN64
- sub rsp, 0xa0
%assign xmm_regs_used 16
%assign stack_size_padded 0xa0
- %assign stack_offset stack_offset+stack_size_padded
+ SUB rsp, stack_size_padded
%endif
call .main
jmp .start
@@ -4134,21 +4127,13 @@ cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha,
RET
ALIGN function_align
.main:
- ; Stack args offset by one (r4m -> r5m etc.) due to call
-%if WIN64
- mov abcdq, r5m
- mov mxd, r6m
- movaps [rsp+stack_offset+0x10], xmm6
- movaps [rsp+stack_offset+0x20], xmm7
- movaps [rsp+0x28], xmm8
- movaps [rsp+0x38], xmm9
- movaps [rsp+0x48], xmm10
- movaps [rsp+0x58], xmm11
- movaps [rsp+0x68], xmm12
- movaps [rsp+0x78], xmm13
- movaps [rsp+0x88], xmm14
- movaps [rsp+0x98], xmm15
-%endif
+    ; The stack pointer is offset by the return address due to the call
+ %assign stack_offset stack_offset + gprsize
+ %assign stack_size stack_size + gprsize
+ %assign stack_size_padded stack_size_padded + gprsize
+ movifnidn abcdq, abcdmp
+ movifnidn mxd, mxm
+ WIN64_PUSH_XMM
movsx alphad, word [abcdq+2*0]
movsx betad, word [abcdq+2*1]
mova m12, [warp_8x8_shufA]
@@ -4162,7 +4147,7 @@ ALIGN function_align
lea tmp2d, [alphaq*3]
sub srcq, tmp1q ; src -= src_stride*3 + 3
sub betad, tmp2d ; beta -= alpha*3
- mov myd, r7m
+ mov myd, r6m
call .h
psrld m1, m0, 16
call .h
diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm
index 7897f1decc..f9043f1ad3 100644
--- a/third_party/dav1d/src/x86/mc_avx512.asm
+++ b/third_party/dav1d/src/x86/mc_avx512.asm
@@ -1276,7 +1276,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 7
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
shl mxyd, 11
@@ -2853,8 +2852,6 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
jg .v_loop0
RET
.hv:
- %assign stack_offset stack_offset - stack_size_padded
- %assign stack_size_padded 0
WIN64_SPILL_XMM 16
cmp wd, 4
je .hv_w4
diff --git a/third_party/dav1d/src/x86/mc_sse.asm b/third_party/dav1d/src/x86/mc_sse.asm
index 54939c647a..a447a80161 100644
--- a/third_party/dav1d/src/x86/mc_sse.asm
+++ b/third_party/dav1d/src/x86/mc_sse.asm
@@ -1199,7 +1199,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
RET
.v:
%if notcpuflag(ssse3)
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
%endif
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
@@ -1375,7 +1374,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
-%assign stack_offset stack_offset - stack_size_padded
%if cpuflag(ssse3)
imul mxyd, 0x08000800
WIN64_SPILL_XMM 8
@@ -1592,7 +1590,6 @@ FN put_8tap, regular, REGULAR, REGULAR
%endif
cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
-%assign org_stack_offset stack_offset
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
%if ARCH_X86_64
@@ -1618,7 +1615,6 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
movzx wd, word [base_reg+wq*2+table_offset(put,)]
add wq, base_reg
; put_bilin mangling jump
-%assign stack_offset org_stack_offset
movifnidn dsq, dsmp
movifnidn ssq, ssmp
%if WIN64
@@ -1792,7 +1788,6 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
cmovs ssd, mxd
movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
- %assign stack_offset org_stack_offset
WIN64_SPILL_XMM 16
movzx mxd, myb
shr myd, 16
@@ -2048,7 +2043,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%undef subpel2
%undef subpel3
.hv:
- %assign stack_offset org_stack_offset
+ RESET_STACK_STATE
cmp wd, 4
jg .hv_w8
%if ARCH_X86_32
@@ -2369,7 +2364,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%undef subpelv2
%undef subpelv3
.hv_w8:
- %assign stack_offset org_stack_offset
+ RESET_STACK_STATE
%define hv8_line_1 0
%define hv8_line_2 1
%define hv8_line_3 2
@@ -2843,7 +2838,6 @@ FN prep_8tap, regular, REGULAR, REGULAR
%define base 0
%endif
cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
-%assign org_stack_offset stack_offset
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
@@ -2862,7 +2856,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
add wq, base_reg
movifnidn strided, stridem
lea r6, [strideq*3]
- %assign stack_offset org_stack_offset
%if WIN64
pop r8
pop r7
@@ -3095,7 +3088,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
mov mxd, myd
and mxd, 0x7f
%else
- %assign stack_offset org_stack_offset
WIN64_SPILL_XMM 16
movzx mxd, myb
%endif
@@ -3359,7 +3351,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%undef subpel2
%undef subpel3
.hv:
- %assign stack_offset org_stack_offset
+ RESET_STACK_STATE
cmp wd, 4
jg .hv_w8
and mxd, 0x7f
@@ -3659,7 +3651,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%undef subpelv2
%undef subpelv3
.hv_w8:
- %assign stack_offset org_stack_offset
+ RESET_STACK_STATE
%define hv8_line_1 0
%define hv8_line_2 1
%define hv8_line_3 2
diff --git a/third_party/dav1d/src/x86/msac.asm b/third_party/dav1d/src/x86/msac.asm
index 9f05c921a6..4156efe914 100644
--- a/third_party/dav1d/src/x86/msac.asm
+++ b/third_party/dav1d/src/x86/msac.asm
@@ -143,10 +143,9 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
mov esp, [esp]
%endif
%endif
- not t4
sub t2d, t1d ; rng
shl t1, gprsize*8-16
- add t4, t1 ; ~dif
+ sub t4, t1 ; dif - v
.renorm3:
mov t1d, [t0+msac.cnt]
movifnidn t7, t0
@@ -157,33 +156,31 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
shl t2d, cl
shl t4, cl
mov [t7+msac.rng], t2d
- not t4
sub t1d, ecx
jae .end ; no refill required
; refill:
- mov t2, [t7+msac.buf]
- mov rcx, [t7+msac.end]
%if ARCH_X86_64 == 0
push t5
%endif
- lea t5, [t2+gprsize]
- cmp t5, rcx
+ mov t2, [t7+msac.buf]
+ mov t5, [t7+msac.end]
+ lea rcx, [t2+gprsize]
+ sub rcx, t5
ja .refill_eob
- mov t2, [t2]
- lea ecx, [t1+23]
- add t1d, 16
- shr ecx, 3 ; shift_bytes
- bswap t2
- sub t5, rcx
- shl ecx, 3 ; shift_bits
- shr t2, cl
- sub ecx, t1d ; shift_bits - 16 - cnt
- mov t1d, gprsize*8-16
- shl t2, cl
- mov [t7+msac.buf], t5
- sub t1d, ecx ; cnt + gprsize*8 - shift_bits
- xor t4, t2
+ mov t5, [t2]
+ lea ecx, [t1+16-gprsize*8]
+ not t5
+ bswap t5
+ shr t5, cl
+ neg ecx
+ shr ecx, 3 ; num_bytes_read
+ or t4, t5
+.refill_end:
+ add t2, rcx
+ lea t1d, [t1+rcx*8] ; cnt += num_bits_read
+ mov [t7+msac.buf], t2
+.refill_end2:
%if ARCH_X86_64 == 0
pop t5
%endif
@@ -191,29 +188,35 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
mov [t7+msac.cnt], t1d
mov [t7+msac.dif], t4
RET
+.pad_with_ones:
+ lea ecx, [t1-16]
+%if ARCH_X86_64
+ ror rcx, cl
+%else
+ shr ecx, cl
+%endif
+ or t4, rcx
+ jmp .refill_end2
.refill_eob: ; avoid overreading the input buffer
- mov t5, rcx
- mov ecx, gprsize*8-24
- sub ecx, t1d ; c
-.refill_eob_loop:
cmp t2, t5
- jae .refill_eob_end ; eob reached
- movzx t1d, byte [t2]
- inc t2
- shl t1, cl
- xor t4, t1
- sub ecx, 8
- jge .refill_eob_loop
-.refill_eob_end:
- mov t1d, gprsize*8-24
-%if ARCH_X86_64 == 0
- pop t5
-%endif
- sub t1d, ecx
- mov [t7+msac.buf], t2
- mov [t7+msac.dif], t4
- mov [t7+msac.cnt], t1d
- RET
+ jae .pad_with_ones ; eob reached
+ ; We can safely do a register-sized load of the last bytes of the buffer
+ ; as this code is only reached if the msac buffer size is >= gprsize.
+ mov t5, [t5-gprsize]
+ shl ecx, 3
+ shr t5, cl
+ lea ecx, [t1+16-gprsize*8]
+ not t5
+ bswap t5
+ shr t5, cl
+ neg ecx
+ or t4, t5
+ mov t5d, [t7+msac.end]
+ shr ecx, 3
+ sub t5d, t2d ; num_bytes_left
+ cmp ecx, t5d
+ cmovae ecx, t5d ; num_bytes_read
+ jmp .refill_end
cglobal msac_decode_symbol_adapt8, 0, 6, 6
DECODE_SYMBOL_ADAPT_INIT
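
For orientation, the reworked refill above keeps dif in its natural form (the register no longer holds ~dif, hence the dropped not instructions and the "dif - v" update): stream bytes are complemented as they are read and OR-ed into the low bits of dif, and the end-of-buffer path now re-reads the final gprsize bytes of the buffer (safe because the buffer is at least gprsize bytes) with num_bytes_read clamped, padding dif with one bits once the input is exhausted. A simplified byte-at-a-time C model of these semantics, with illustrative names only; the actual bit layout and counters live in msac.asm and msac.c:

    #include <stdint.h>

    typedef struct {
        const uint8_t *pos, *end; /* current read position / end of buffer */
        uint64_t dif;             /* held directly now, not as ~dif */
        int cnt;                  /* stream bits currently buffered in dif */
    } refill_model;

    static void refill(refill_model *const s)
    {
        /* Keep 48 bits (gprsize*8 - 16) of look-ahead below the 16-bit range. */
        while (s->cnt < 48) {
            /* Past the end of the buffer, dif is padded with one bits. */
            const uint8_t b = s->pos < s->end ? (uint8_t)~*s->pos++ : 0xff;
            s->dif |= (uint64_t)b << (40 - s->cnt);
            s->cnt += 8;
        }
    }
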
@@ -366,7 +369,6 @@ cglobal msac_decode_bool_adapt, 0, 6, 0
%if ARCH_X86_64 == 0
movzx eax, al
%endif
- not t4
test t3d, t3d
jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
%if UNIX64 == 0
@@ -420,7 +422,6 @@ cglobal msac_decode_bool_equi, 0, 6, 0
mov ecx, 0xbfff
setb al ; the upper 32 bits contains garbage but that's OK
sub ecx, t2d
- not t4
     ; In the case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
; i.e. (0 <= d <= 2) and v < (3 << 14)
shr ecx, 14 ; d
@@ -447,7 +448,6 @@ cglobal msac_decode_bool, 0, 6, 0
cmovb t2d, t1d
cmovb t4, t3
setb al
- not t4
%if ARCH_X86_64 == 0
movzx eax, al
%endif
@@ -497,48 +497,45 @@ cglobal msac_decode_bool, 0, 6, 0
tzcnt eax, eax
movzx ecx, word [buf+rax+16]
movzx t2d, word [buf+rax+14]
- not t4
%if ARCH_X86_64
add t6d, 5
%endif
sub eax, 5 ; setup for merging the tok_br and tok branches
sub t2d, ecx
shl rcx, gprsize*8-16
- add t4, rcx
+ sub t4, rcx
bsr ecx, t2d
xor ecx, 15
shl t2d, cl
shl t4, cl
movd m2, t2d
mov [t7+msac.rng], t2d
- not t4
sub t5d, ecx
jae %%end
- mov t2, [t7+msac.buf]
- mov rcx, [t7+msac.end]
%if UNIX64 == 0
push t8
%endif
- lea t8, [t2+gprsize]
- cmp t8, rcx
+ mov t2, [t7+msac.buf]
+ mov t8, [t7+msac.end]
+ lea rcx, [t2+gprsize]
+ sub rcx, t8
ja %%refill_eob
- mov t2, [t2]
- lea ecx, [t5+23]
- add t5d, 16
+ mov t8, [t2]
+ lea ecx, [t5+16-gprsize*8]
+ not t8
+ bswap t8
+ shr t8, cl
+ neg ecx
shr ecx, 3
- bswap t2
- sub t8, rcx
- shl ecx, 3
- shr t2, cl
- sub ecx, t5d
- mov t5d, gprsize*8-16
- shl t2, cl
- mov [t7+msac.buf], t8
+ or t4, t8
+%%refill_end:
+ add t2, rcx
+ lea t5d, [t5+rcx*8]
+ mov [t7+msac.buf], t2
+%%refill_end2:
%if UNIX64 == 0
pop t8
%endif
- sub t5d, ecx
- xor t4, t2
%%end:
movp m3, t4
%if ARCH_X86_64
@@ -559,27 +556,34 @@ cglobal msac_decode_bool, 0, 6, 0
shr eax, 1
mov [t7+msac.cnt], t5d
RET
+%%pad_with_ones:
+ ; ensure that dif is padded with at least 15 bits of ones at the end
+ lea ecx, [t5-16]
+%if ARCH_X86_64
+ ror rcx, cl
+%else
+ shr ecx, cl
+%endif
+ or t4, rcx
+ jmp %%refill_end2
%%refill_eob:
- mov t8, rcx
- mov ecx, gprsize*8-24
- sub ecx, t5d
-%%refill_eob_loop:
cmp t2, t8
- jae %%refill_eob_end
- movzx t5d, byte [t2]
- inc t2
- shl t5, cl
- xor t4, t5
- sub ecx, 8
- jge %%refill_eob_loop
-%%refill_eob_end:
-%if UNIX64 == 0
- pop t8
-%endif
- mov t5d, gprsize*8-24
- mov [t7+msac.buf], t2
- sub t5d, ecx
- jmp %%end
+ jae %%pad_with_ones
+ mov t8, [t8-gprsize]
+ shl ecx, 3
+ shr t8, cl
+ lea ecx, [t5+16-gprsize*8]
+ not t8
+ bswap t8
+ shr t8, cl
+ neg ecx
+ or t4, t8
+ mov t8d, [t7+msac.end]
+ shr ecx, 3
+ sub t8d, t2d
+ cmp ecx, t8d
+ cmovae ecx, t8d
+ jmp %%refill_end
%endmacro
cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6