author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-15 03:35:49 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-15 03:35:49 +0000
commit     d8bbc7858622b6d9c278469aab701ca0b609cddf (patch)
tree       eff41dc61d9f714852212739e6b3738b82a2af87 /third_party/dav1d/src/x86
parent     Releasing progress-linux version 125.0.3-1~progress7.99u1. (diff)
Merging upstream version 126.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86')
-rw-r--r--  third_party/dav1d/src/x86/cdef_avx2.asm           |   7
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_avx2.asm    |  23
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_sse.asm     |   8
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_avx2.asm      |  19
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_sse.asm       |  14
-rw-r--r--  third_party/dav1d/src/x86/ipred16_avx2.asm        |  18
-rw-r--r--  third_party/dav1d/src/x86/ipred_avx2.asm          | 106
-rw-r--r--  third_party/dav1d/src/x86/ipred_sse.asm           |  10
-rw-r--r--  third_party/dav1d/src/x86/looprestoration_sse.asm |   8
-rw-r--r--  third_party/dav1d/src/x86/mc16_avx2.asm           |   6
-rw-r--r--  third_party/dav1d/src/x86/mc16_avx512.asm         |   3
-rw-r--r--  third_party/dav1d/src/x86/mc16_sse.asm            |  33
-rw-r--r--  third_party/dav1d/src/x86/mc_avx2.asm             |  33
-rw-r--r--  third_party/dav1d/src/x86/mc_avx512.asm           |   3
-rw-r--r--  third_party/dav1d/src/x86/mc_sse.asm              |  16
-rw-r--r--  third_party/dav1d/src/x86/msac.asm                | 172
16 files changed, 157 insertions(+), 322 deletions(-)
diff --git a/third_party/dav1d/src/x86/cdef_avx2.asm b/third_party/dav1d/src/x86/cdef_avx2.asm
index 1f30f8a3b7..95d35fc1c8 100644
--- a/third_party/dav1d/src/x86/cdef_avx2.asm
+++ b/third_party/dav1d/src/x86/cdef_avx2.asm
@@ -398,7 +398,6 @@ SECTION .text
INIT_YMM avx2
cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
pri, sec, dir, damping, edge
-%assign stack_offset_entry stack_offset
mov edged, edgem
cmp edged, 0xf
jne .border_block
@@ -1195,9 +1194,9 @@ cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
.border_block:
DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
-%define rstk rsp
-%assign stack_offset stack_offset_entry
-%assign regs_used 11
+ RESET_STACK_STATE
+ %assign stack_offset stack_offset - (regs_used - 11) * gprsize
+ %assign regs_used 11
ALLOC_STACK 2*16+(%2+4)*32, 16
%define px rsp+2*16+2*32
diff --git a/third_party/dav1d/src/x86/filmgrain16_avx2.asm b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
index a1d4c41f27..eda6035923 100644
--- a/third_party/dav1d/src/x86/filmgrain16_avx2.asm
+++ b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
@@ -646,18 +646,9 @@ INIT_XMM avx2
INIT_YMM avx2
.ar2:
%if WIN64
- ; xmm6 and xmm7 already saved
- %assign xmm_regs_used 13 + %2
%assign stack_size_padded 136
SUB rsp, stack_size_padded
- movaps [rsp+16*2], xmm8
- movaps [rsp+16*3], xmm9
- movaps [rsp+16*4], xmm10
- movaps [rsp+16*5], xmm11
- movaps [rsp+16*6], xmm12
-%if %2
- movaps [rsp+16*7], xmm13
-%endif
+ WIN64_PUSH_XMM 13 + %2, 8
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
@@ -747,20 +738,10 @@ INIT_YMM avx2
.ar3:
%if WIN64
- ; xmm6 and xmm7 already saved
%assign stack_offset 32
- %assign xmm_regs_used 14 + %2
%assign stack_size_padded 152
SUB rsp, stack_size_padded
- movaps [rsp+16*2], xmm8
- movaps [rsp+16*3], xmm9
- movaps [rsp+16*4], xmm10
- movaps [rsp+16*5], xmm11
- movaps [rsp+16*6], xmm12
- movaps [rsp+16*7], xmm13
-%if %2
- movaps [rsp+16*8], xmm14
-%endif
+ WIN64_PUSH_XMM 14 + %2, 8
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
diff --git a/third_party/dav1d/src/x86/filmgrain16_sse.asm b/third_party/dav1d/src/x86/filmgrain16_sse.asm
index 6b0daaac0b..25d01caa19 100644
--- a/third_party/dav1d/src/x86/filmgrain16_sse.asm
+++ b/third_party/dav1d/src/x86/filmgrain16_sse.asm
@@ -275,7 +275,6 @@ cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
.ar2:
%if ARCH_X86_32
-%assign stack_offset_old stack_offset
ALLOC_STACK -16*8
%endif
DEFINE_ARGS buf, fg_data, bdmax, shift
@@ -428,7 +427,6 @@ cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
%elif ARCH_X86_64
%define tmp rsp+stack_offset-72
%else
-%assign stack_offset stack_offset_old
ALLOC_STACK -16*12
%define tmp rsp
mov bdmaxd, bdmaxm
@@ -715,7 +713,6 @@ cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
%else
DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
-%assign stack_offset_old stack_offset
ALLOC_STACK -16*2
mov bufyq, r1m
mov uvd, r3m
@@ -831,9 +828,7 @@ cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
%if ARCH_X86_64
DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x
%else
-%assign stack_offset stack_offset_old
-%xdefine rstk rsp
-%assign stack_size_padded 0
+ RESET_STACK_STATE
DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3
mov bufyq, r1m
mov uvd, r3m
@@ -1159,7 +1154,6 @@ cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
%endif
%else
DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
-%assign stack_offset stack_offset_old
ALLOC_STACK -16*14
mov bufyq, r1m
mov uvd, r3m
diff --git a/third_party/dav1d/src/x86/filmgrain_avx2.asm b/third_party/dav1d/src/x86/filmgrain_avx2.asm
index 55445cf593..91d8ca5c14 100644
--- a/third_party/dav1d/src/x86/filmgrain_avx2.asm
+++ b/third_party/dav1d/src/x86/filmgrain_avx2.asm
@@ -204,18 +204,9 @@ cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
.ar2:
%if WIN64
- ; xmm6 and xmm7 already saved
- %assign xmm_regs_used 16
%assign stack_size_padded 168
SUB rsp, stack_size_padded
- movaps [rsp+16*2], xmm8
- movaps [rsp+16*3], xmm9
- movaps [rsp+16*4], xmm10
- movaps [rsp+16*5], xmm11
- movaps [rsp+16*6], xmm12
- movaps [rsp+16*7], xmm13
- movaps [rsp+16*8], xmm14
- movaps [rsp+16*9], xmm15
+ WIN64_PUSH_XMM 16, 8
%endif
DEFINE_ARGS buf, fg_data, h, x
mov r6d, [fg_dataq+FGData.ar_coeff_shift]
@@ -287,15 +278,9 @@ cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
INIT_YMM avx2
.ar3:
%if WIN64
- ; xmm6 and xmm7 already saved
- %assign stack_offset 16
ALLOC_STACK 16*14
%assign stack_size stack_size - 16*4
- %assign xmm_regs_used 12
- movaps [rsp+16*12], xmm8
- movaps [rsp+16*13], xmm9
- movaps [rsp+16*14], xmm10
- movaps [rsp+16*15], xmm11
+ WIN64_PUSH_XMM 12, 8
%else
ALLOC_STACK 16*12
%endif
diff --git a/third_party/dav1d/src/x86/filmgrain_sse.asm b/third_party/dav1d/src/x86/filmgrain_sse.asm
index 0172f98760..d06e349a8c 100644
--- a/third_party/dav1d/src/x86/filmgrain_sse.asm
+++ b/third_party/dav1d/src/x86/filmgrain_sse.asm
@@ -232,7 +232,6 @@ cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
.ar2:
%if ARCH_X86_32
-%assign stack_offset_old stack_offset
ALLOC_STACK -16*8
%endif
DEFINE_ARGS buf, fg_data, shift
@@ -333,7 +332,6 @@ cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
.ar3:
DEFINE_ARGS buf, fg_data, shift
%if ARCH_X86_32
-%assign stack_offset stack_offset_old
ALLOC_STACK -16*14
%elif WIN64
SUB rsp, 16*6
@@ -601,7 +599,6 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
movifnidn bufyq, bufymp
%if ARCH_X86_32
-%assign stack_offset_old stack_offset
ALLOC_STACK -2*16
%endif
imul uvd, 28
@@ -738,9 +735,7 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat
.ar1:
%if ARCH_X86_32
-%assign stack_offset stack_offset_old
-%assign stack_size_padded 0
-%xdefine rstk rsp
+ RESET_STACK_STATE
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
imul uvd, 28
@@ -881,9 +876,6 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat
.ar2:
%if ARCH_X86_32
-%assign stack_offset stack_offset_old
-%assign stack_size_padded 0
-%xdefine rstk rsp
ALLOC_STACK -8*16
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
@@ -1014,9 +1006,7 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat
.ar3:
%if ARCH_X86_32
-%assign stack_offset stack_offset_old
-%assign stack_size_padded 0
-%xdefine rstk rsp
+ RESET_STACK_STATE
%endif
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
movifnidn bufyq, bufymp
diff --git a/third_party/dav1d/src/x86/ipred16_avx2.asm b/third_party/dav1d/src/x86/ipred16_avx2.asm
index f4931e977b..7b52abaa10 100644
--- a/third_party/dav1d/src/x86/ipred16_avx2.asm
+++ b/third_party/dav1d/src/x86/ipred16_avx2.asm
@@ -946,7 +946,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
jg .w4_loop
RET
.w8:
-%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 12
vpbroadcastw m0, [tlq] ; bottom
vbroadcasti128 m7, [tlq+hq*2+2]
@@ -974,7 +973,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
jg .w8_loop
RET
.w16:
-%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 11
vpbroadcastw m0, [tlq] ; bottom
movu m7, [tlq+hq*2+2]
@@ -1005,7 +1003,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
jg .w16_loop
RET
.w32:
-%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 15
vpbroadcastw m0, [tlq] ; bottom
movu m7, [tlq+hq*2+ 2]
@@ -1047,7 +1044,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
jg .w32_loop
RET
.w64:
-%assign stack_offset stack_offset - stack_size_padded
PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base
mov dst_baseq, dstq
mov tl_baseq, tlq
@@ -1104,7 +1100,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
RET
cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
- %assign org_stack_offset stack_offset
lea r6, [ipred_z1_16bpc_avx2_table]
tzcnt wd, wm
movifnidn angled, anglem
@@ -1312,7 +1307,6 @@ ALIGN function_align
.w4_end:
RET
.w8:
- %assign stack_offset org_stack_offset
ALLOC_STACK -64, 7
lea r3d, [angleq+216]
mov r3b, hb
@@ -1476,7 +1470,6 @@ ALIGN function_align
or maxbased, 16 ; imin(h+15, 31)
jmp .w16_main
.w16:
- %assign stack_offset org_stack_offset
ALLOC_STACK -96, 7
lea maxbased, [hq+15]
test angled, 0x400
@@ -1622,7 +1615,6 @@ ALIGN function_align
.w16_end:
RET
.w32:
- %assign stack_offset org_stack_offset
ALLOC_STACK -160, 8
lea maxbased, [hq+31]
mov r3d, 63
@@ -1737,7 +1729,6 @@ ALIGN function_align
.w32_end:
RET
.w64:
- %assign stack_offset org_stack_offset
ALLOC_STACK -256, 10
lea maxbased, [hq+63]
test angled, 0x400
@@ -2691,7 +2682,6 @@ ALIGN function_align
jmp .w32_filter_above
cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
- %assign org_stack_offset stack_offset
lea r6, [ipred_z3_16bpc_avx2_table]
tzcnt hd, hm
movifnidn angled, anglem
@@ -2907,7 +2897,6 @@ ALIGN function_align
RET
.h8:
lea r4d, [angleq+216]
- %assign stack_offset org_stack_offset
ALLOC_STACK -64, 8
mov r4b, wb
lea r7, [strideq*3]
@@ -3155,7 +3144,6 @@ ALIGN function_align
jmp .h16_main
ALIGN function_align
.h16:
- %assign stack_offset org_stack_offset
ALLOC_STACK -96, 10
lea maxbased, [wq+15]
lea r7, [strideq*3]
@@ -3372,7 +3360,6 @@ ALIGN function_align
.h16_end:
RET
.h32:
- %assign stack_offset org_stack_offset
ALLOC_STACK -160, 9
lea maxbased, [wq+31]
and maxbased, 31
@@ -3557,7 +3544,6 @@ ALIGN function_align
.h32_end:
RET
.h64:
- %assign stack_offset org_stack_offset
ALLOC_STACK -256, 10
lea maxbased, [wq+63]
test angled, 0x400
@@ -3804,7 +3790,6 @@ ALIGN function_align
; 5 8 8 i
cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
-%assign org_stack_offset stack_offset
%define base r6-ipred_filter_16bpc_avx2_table
lea r6, [filter_intra_taps]
tzcnt wd, wm
@@ -3846,7 +3831,6 @@ cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
RET
ALIGN function_align
.w8:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 16
vbroadcasti128 m14, [base+filter_shuf3]
vpbroadcastw m15, r8m ; bitdepth_max
@@ -3883,7 +3867,6 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
- %assign stack_offset stack_offset - stack_size_padded
ALLOC_STACK 32, 16
vpbroadcastw m15, r8m ; bitdepth_max
sub hd, 2
@@ -3977,7 +3960,6 @@ ALIGN function_align
ret
ALIGN function_align
.w32:
- %assign stack_offset org_stack_offset
ALLOC_STACK 64, 16
vpbroadcastw m15, r8m ; bitdepth_max
sub hd, 2
diff --git a/third_party/dav1d/src/x86/ipred_avx2.asm b/third_party/dav1d/src/x86/ipred_avx2.asm
index 58e40935ac..35738e7c0b 100644
--- a/third_party/dav1d/src/x86/ipred_avx2.asm
+++ b/third_party/dav1d/src/x86/ipred_avx2.asm
@@ -772,7 +772,6 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 6
movu m3, [tlq+1]
punpcklbw m2, m3, m5
@@ -823,29 +822,17 @@ ALIGN function_align
jl .w64_loop
RET
-%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used
- %assign stack_offset 0
- %assign stack_size_padded 0
- %assign regs_used %2
- %xdefine rstk rsp
- SETUP_STACK_POINTER %1
- %if regs_used != %2 && WIN64
- PUSH r%2
- %endif
- ALLOC_STACK %1, %3
-%endmacro
-
cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
-%define base r6-ipred_smooth_h_avx2_table
- lea r6, [ipred_smooth_h_avx2_table]
+%define base r5-ipred_smooth_h_avx2_table
+ lea r5, [ipred_smooth_h_avx2_table]
mov wd, wm
vpbroadcastb m3, [tlq+wq] ; right
tzcnt wd, wd
mov hd, hm
- movsxd wq, [r6+wq*4]
+ movsxd wq, [r5+wq*4]
vpbroadcastd m4, [base+pb_127_m127]
vpbroadcastd m5, [base+pw_128]
- add wq, r6
+ add wq, r5
jmp wq
.w4:
WIN64_SPILL_XMM 8
@@ -891,7 +878,6 @@ cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
RET
ALIGN function_align
.w8:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
vbroadcasti128 m6, [base+smooth_weights+8*2]
mova m7, [base+ipred_h_shuf]
@@ -927,7 +913,7 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
- SETUP_STACK_FRAME 32*4, 7, 8
+ ALLOC_STACK 32*4, 8
lea r3, [rsp+64*2-4]
call .prep ; only worthwhile for for w16 and above
sub tlq, 2
@@ -951,7 +937,7 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
- SETUP_STACK_FRAME 32*4, 7, 6
+ ALLOC_STACK 32*4
lea r3, [rsp+64*2-2]
call .prep
dec tlq
@@ -971,19 +957,19 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
- SETUP_STACK_FRAME 32*4, 7, 9
+ ALLOC_STACK 32*4, 9
lea r3, [rsp+64*2-2]
call .prep
- add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table
+ add r5, smooth_weights+16*15-ipred_smooth_h_avx2_table
dec tlq
- mova xm5, [r6-16*7]
- vinserti128 m5, [r6-16*5], 1
- mova xm6, [r6-16*6]
- vinserti128 m6, [r6-16*4], 1
- mova xm7, [r6-16*3]
- vinserti128 m7, [r6-16*1], 1
- mova xm8, [r6-16*2]
- vinserti128 m8, [r6-16*0], 1
+ mova xm5, [r5-16*7]
+ vinserti128 m5, [r5-16*5], 1
+ mova xm6, [r5-16*6]
+ vinserti128 m6, [r5-16*4], 1
+ mova xm7, [r5-16*3]
+ vinserti128 m7, [r5-16*1], 1
+ mova xm8, [r5-16*2]
+ vinserti128 m8, [r5-16*0], 1
.w64_loop:
vpbroadcastb m2, [tlq+hq]
punpcklbw m2, m3
@@ -1113,7 +1099,6 @@ cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights
RET
ALIGN function_align
.w8:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 12
mova m10, [base+ipred_h_shuf]
vbroadcasti128 m11, [base+smooth_weights+8*2]
@@ -1157,7 +1142,9 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
- SETUP_STACK_FRAME 32*4, 7, 14
+ %assign regs_used 4
+ ALLOC_STACK -32*4, 14
+ %assign regs_used 7
vbroadcasti128 m11, [tlq+1]
lea r3, [rsp+64*2-4]
punpcklbw m10, m11, m0 ; top, bottom
@@ -1197,7 +1184,9 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
- SETUP_STACK_FRAME 32*4, 7, 11
+ %assign regs_used 4
+ ALLOC_STACK -32*4, 11
+ %assign regs_used 7
movu m8, [tlq+1]
lea r3, [rsp+64*2-2]
punpcklbw m7, m8, m0
@@ -1232,7 +1221,9 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
- SETUP_STACK_FRAME 32*8, 7, 16
+ %assign regs_used 4
+ ALLOC_STACK -32*8, 16
+ %assign regs_used 7
movu m13, [tlq+1 ]
movu m15, [tlq+33]
add r6, smooth_weights+16*15-ipred_smooth_avx2_table
@@ -1316,7 +1307,6 @@ ALIGN function_align
ret
cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
- %assign org_stack_offset stack_offset
lea r6, [ipred_z1_avx2_table]
tzcnt wd, wm
movifnidn angled, anglem
@@ -1415,7 +1405,6 @@ ALIGN function_align
pmovmskb r5d, m1
ret
.w4_no_upsample:
- %assign stack_offset org_stack_offset
ALLOC_STACK -16, 11
mov maxbased, 7
test angled, 0x400 ; !enable_intra_edge_filter
@@ -1522,7 +1511,6 @@ ALIGN function_align
mov r3b, hb
cmp r3d, 8
ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
- %assign stack_offset org_stack_offset
ALLOC_STACK -32, 8
movu xm2, [z_filter_s+6]
mova xm0, [tlq-1]
@@ -1592,7 +1580,6 @@ ALIGN function_align
or maxbased, 8 ; imin(h+7, 15)
jmp .w8_main
.w8_no_upsample:
- %assign stack_offset org_stack_offset
ALLOC_STACK -32, 10
lea maxbased, [hq+7]
test angled, 0x400
@@ -1696,7 +1683,6 @@ ALIGN function_align
jmp .w16_main
ALIGN function_align
.w16:
- %assign stack_offset org_stack_offset
ALLOC_STACK -64, 12
lea maxbased, [hq+15]
test angled, 0x400
@@ -1816,7 +1802,6 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
- %assign stack_offset org_stack_offset
ALLOC_STACK -96, 15
lea r3d, [hq+31]
mov maxbased, 63
@@ -1960,7 +1945,6 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
- %assign stack_offset org_stack_offset
ALLOC_STACK -128, 16
lea maxbased, [hq+63]
test angled, 0x400 ; !enable_intra_edge_filter
@@ -3001,7 +2985,6 @@ ALIGN function_align
jmp .w32_filter_above
cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
- %assign org_stack_offset stack_offset
lea r6, [ipred_z3_avx2_table]
tzcnt hd, hm
movifnidn angled, anglem
@@ -3102,7 +3085,6 @@ ALIGN function_align
pmovmskb r5d, m1
ret
.h4_no_upsample:
- %assign stack_offset org_stack_offset
ALLOC_STACK -16, 12
mov maxbased, 7
test angled, 0x400 ; !enable_intra_edge_filter
@@ -3215,7 +3197,6 @@ ALIGN function_align
mov r4b, wb
cmp r4d, 8
ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
- %assign stack_offset org_stack_offset
ALLOC_STACK -32, 8
and r4d, 4
mova xm0, [tlq-15]
@@ -3297,7 +3278,6 @@ ALIGN function_align
or maxbased, 8 ; imin(w+7, 15)
jmp .h8_main
.h8_no_upsample:
- %assign stack_offset org_stack_offset
ALLOC_STACK -32, 10
lea maxbased, [wq+7]
test angled, 0x400
@@ -3455,7 +3435,6 @@ ALIGN function_align
jmp .h16_main
ALIGN function_align
.h16:
- %assign stack_offset org_stack_offset
ALLOC_STACK -64, 12
lea maxbased, [wq+15]
test angled, 0x400
@@ -3661,7 +3640,6 @@ ALIGN function_align
RET
ALIGN function_align
.h32:
- %assign stack_offset org_stack_offset
ALLOC_STACK -96, 15
lea maxbased, [wq+31]
and maxbased, 31
@@ -3890,7 +3868,6 @@ ALIGN function_align
RET
ALIGN function_align
.h64:
- %assign stack_offset org_stack_offset
ALLOC_STACK -128, 16
lea maxbased, [wq+63]
test angled, 0x400 ; !enable_intra_edge_filter
@@ -4221,6 +4198,7 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
movzx filterd, byte filterm
%endif
shl filterd, 6
+ WIN64_SPILL_XMM 9, 15
add filterq, r6
lea r6, [ipred_filter_avx2_table]
movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4
@@ -4234,7 +4212,6 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
mov hd, hm
jmp wq
.w4:
- WIN64_SPILL_XMM 9
mova xm8, [base+filter_shuf2]
sub tlq, 3
sub tlq, hq
@@ -4251,8 +4228,7 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
RET
ALIGN function_align
.w8:
- %assign stack_offset stack_offset - stack_size_padded
- WIN64_SPILL_XMM 10
+ WIN64_PUSH_XMM 10
mova m8, [base+filter_shuf1]
FILTER_XMM 7, 0, 6, [base+filter_shuf2]
vpbroadcastd m0, [tlq+4]
@@ -4278,26 +4254,18 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
-%if WIN64
- %assign stack_offset stack_offset - stack_size_padded
- %assign xmm_regs_used 15
- %assign stack_size_padded 0x98
- SUB rsp, stack_size_padded
-%endif
sub hd, 2
- TAIL_CALL .w16_main, 0
-.w16_main:
+ call .w16_main
%if WIN64
- movaps [rsp+0xa8], xmm6
- movaps [rsp+0xb8], xmm7
- movaps [rsp+0x28], xmm8
- movaps [rsp+0x38], xmm9
- movaps [rsp+0x48], xmm10
- movaps [rsp+0x58], xmm11
- movaps [rsp+0x68], xmm12
- movaps [rsp+0x78], xmm13
- movaps [rsp+0x88], xmm14
+ jmp .end
+%else
+ RET
%endif
+.w16_main:
+ ; The spills are into the callers stack frame
+ %assign stack_size stack_size + gprsize
+ WIN64_PUSH_XMM 15, 9
+ %assign stack_size stack_size - gprsize
FILTER_XMM 12, 0, 7, [base+filter_shuf2]
vpbroadcastd m0, [tlq+5]
vpblendd m0, [tlq-12], 0x14
@@ -4350,7 +4318,6 @@ ALIGN function_align
ret
ALIGN function_align
.w32:
- sub rsp, stack_size_padded
sub hd, 2
lea r3, [dstq+16]
lea r5d, [hq-2]
@@ -4415,6 +4382,7 @@ ALIGN function_align
shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
mova [dstq+strideq*0], xm0
mova [dstq+strideq*1], xm6
+.end:
RET
ALIGN function_align
.main:
diff --git a/third_party/dav1d/src/x86/ipred_sse.asm b/third_party/dav1d/src/x86/ipred_sse.asm
index 976f33a24b..f6b0cad001 100644
--- a/third_party/dav1d/src/x86/ipred_sse.asm
+++ b/third_party/dav1d/src/x86/ipred_sse.asm
@@ -670,10 +670,7 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
-%if WIN64
- movaps [rsp+24], xmm7
- %define xmm_regs_used 8
-%endif
+ WIN64_PUSH_XMM 8, 7
mova m7, m5
.w32_loop_init:
mov r3d, 2
@@ -705,10 +702,7 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
-%if WIN64
- movaps [rsp+24], xmm7
- %define xmm_regs_used 8
-%endif
+ WIN64_PUSH_XMM 8, 7
mova m7, m5
.w64_loop_init:
mov r3d, 4
diff --git a/third_party/dav1d/src/x86/looprestoration_sse.asm b/third_party/dav1d/src/x86/looprestoration_sse.asm
index 01eb6fa348..b5c73a51d4 100644
--- a/third_party/dav1d/src/x86/looprestoration_sse.asm
+++ b/third_party/dav1d/src/x86/looprestoration_sse.asm
@@ -42,7 +42,6 @@ pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pb_right_ext_mask: times 24 db 0xff
times 8 db 0
pb_1: times 16 db 1
-pb_3: times 16 db 3
pw_256: times 8 dw 256
pw_2056: times 8 dw 2056
pw_m16380: times 8 dw -16380
@@ -290,7 +289,7 @@ cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstrid
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
jmp .v1
.extend_right:
- movd m2, [lpfq-4]
+ movd m2, [lpfq-1]
%if ARCH_X86_64
push r0
lea r0, [pb_right_ext_mask+21]
@@ -302,10 +301,11 @@ cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstrid
movu m1, [r6+xq+8]
%endif
%if cpuflag(ssse3)
- pshufb m2, [base+pb_3]
+ pxor m3, m3
+ pshufb m2, m3
%else
punpcklbw m2, m2
- pshuflw m2, m2, q3333
+ pshuflw m2, m2, q0000
punpcklqdq m2, m2
%endif
pand m4, m0
diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm
index 61eeaa1007..42e2a5525e 100644
--- a/third_party/dav1d/src/x86/mc16_avx2.asm
+++ b/third_party/dav1d/src/x86/mc16_avx2.asm
@@ -1337,7 +1337,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
cmp wd, 4
je .h_w4
jl .h_w2
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 13
shr mxd, 16
sub srcq, 6
@@ -1415,7 +1414,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
cmp hd, 4
cmovle myd, mxd
vpbroadcastq m0, [base+subpel_filters+myq*8]
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 15
vpbroadcastd m6, [pd_32]
vpbroadcastw m7, r8m
@@ -1590,7 +1588,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
jg .v_w8_loop0
RET
.hv:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 16
vpbroadcastw m15, r8m
cmp wd, 4
@@ -2046,7 +2043,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
shr mxd, 16
sub srcq, 6
vpbroadcastq m0, [base+subpel_filters+mxq*8]
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 12
vbroadcasti128 m6, [subpel_h_shufA]
vbroadcasti128 m7, [subpel_h_shufB]
@@ -2125,7 +2121,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
cmp hd, 4
cmovle myd, mxd
vpbroadcastq m0, [base+subpel_filters+myq*8]
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 15
vpbroadcastd m7, [prep_8tap_1d_rnd]
lea r6, [strideq*3]
@@ -2264,7 +2259,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
%endif
RET
.hv:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 16
vpbroadcastd m15, [prep_8tap_2d_rnd]
cmp wd, 4
diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm
index 585ba53e08..e5de7ecd96 100644
--- a/third_party/dav1d/src/x86/mc16_avx512.asm
+++ b/third_party/dav1d/src/x86/mc16_avx512.asm
@@ -2377,7 +2377,6 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my
jg .hv_w16_loop
RET
.hv_w32:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 32
vbroadcasti32x4 m20, [spel_h_shufA]
vbroadcasti32x4 m21, [spel_h_shufB]
@@ -3175,7 +3174,6 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3
jg .hv_w8_loop
RET
.hv_w16:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 27
vbroadcasti32x8 m5, [srcq+strideq*0+ 8]
vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0
@@ -3313,7 +3311,6 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3
RET
.hv_w32:
%if WIN64
- %assign stack_offset stack_offset - stack_size_padded
PUSH r8
%assign regs_used regs_used + 1
WIN64_SPILL_XMM 32
diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm
index fde8e372a3..b0c42597f7 100644
--- a/third_party/dav1d/src/x86/mc16_sse.asm
+++ b/third_party/dav1d/src/x86/mc16_sse.asm
@@ -1302,10 +1302,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
jg .h_w4_loop
RET
.h_w8:
-%if WIN64
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 12
-%endif
shr mxd, 16
movq m3, [base+subpel_filters+mxq*8]
movifnidn dstq, dstmp
@@ -1383,14 +1380,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
cmp hd, 6
cmovb myd, mxd
movq m3, [base+subpel_filters+myq*8]
-%if STACK_ALIGNMENT < 16
- %xdefine rstk rsp
-%else
- %assign stack_offset stack_offset - stack_size_padded
-%endif
-%if WIN64
WIN64_SPILL_XMM 15
-%endif
movd m7, r8m
movifnidn dstq, dstmp
movifnidn dsq, dsmp
@@ -1604,11 +1594,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
jg .v_w4_loop0
RET
.hv:
-%if STACK_ALIGNMENT < 16
- %xdefine rstk rsp
-%else
- %assign stack_offset stack_offset - stack_size_padded
-%endif
+ RESET_STACK_STATE
%if ARCH_X86_32
movd m4, r8m
mova m6, [base+pd_512]
@@ -1750,11 +1736,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
cmovb myd, mxd
movq m3, [base+subpel_filters+myq*8]
%if ARCH_X86_32
-%if STACK_ALIGNMENT < 16
- %xdefine rstk rsp
-%else
- %assign stack_offset stack_offset - stack_size_padded
-%endif
+ RESET_STACK_STATE
mov dstq, dstmp
mov dsq, dsmp
mova m0, [base+spel_h_shufA]
@@ -2182,11 +2164,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
cmp hd, 4
cmove myd, mxd
movq m3, [base+subpel_filters+myq*8]
-%if STACK_ALIGNMENT < 16
- %xdefine rstk rsp
-%else
- %assign stack_offset stack_offset - stack_size_padded
-%endif
WIN64_SPILL_XMM 15
movddup m7, [base+prep_8tap_1d_rnd]
movifnidn ssq, r2mp
@@ -2339,11 +2316,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
jg .v_loop0
RET
.hv:
-%if STACK_ALIGNMENT < 16
- %xdefine rstk rsp
-%else
- %assign stack_offset stack_offset - stack_size_padded
-%endif
+ RESET_STACK_STATE
movzx t3d, mxb
shr mxd, 16
cmp wd, 4
diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm
index 3b208033bd..58e3cb5af1 100644
--- a/third_party/dav1d/src/x86/mc_avx2.asm
+++ b/third_party/dav1d/src/x86/mc_avx2.asm
@@ -1259,7 +1259,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 7
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
shl mxyd, 11
@@ -1620,7 +1619,6 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .h_loop
RET
.v:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 16
movzx mxd, myb
shr myd, 16
@@ -1834,7 +1832,6 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .v_w16_loop0
RET
.hv:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 16
cmp wd, 4
jg .hv_w8
@@ -2247,7 +2244,6 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
jg .h_loop
RET
.v:
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 16
movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
shr myd, 16 ; Note that the code is 8-tap only, having
@@ -2430,8 +2426,6 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
jg .v_w16_loop0
RET
.hv:
- %assign stack_offset stack_offset - stack_size_padded
- %assign stack_size_padded 0
WIN64_SPILL_XMM 16
cmp wd, 4
je .hv_w4
@@ -4108,10 +4102,9 @@ cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
beta, filter, tmp1, delta, my, gamma
%if WIN64
- sub rsp, 0xa0
%assign xmm_regs_used 16
%assign stack_size_padded 0xa0
- %assign stack_offset stack_offset+stack_size_padded
+ SUB rsp, stack_size_padded
%endif
call .main
jmp .start
@@ -4134,21 +4127,13 @@ cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha,
RET
ALIGN function_align
.main:
- ; Stack args offset by one (r4m -> r5m etc.) due to call
-%if WIN64
- mov abcdq, r5m
- mov mxd, r6m
- movaps [rsp+stack_offset+0x10], xmm6
- movaps [rsp+stack_offset+0x20], xmm7
- movaps [rsp+0x28], xmm8
- movaps [rsp+0x38], xmm9
- movaps [rsp+0x48], xmm10
- movaps [rsp+0x58], xmm11
- movaps [rsp+0x68], xmm12
- movaps [rsp+0x78], xmm13
- movaps [rsp+0x88], xmm14
- movaps [rsp+0x98], xmm15
-%endif
+ ; Stack is offset due to call
+ %assign stack_offset stack_offset + gprsize
+ %assign stack_size stack_size + gprsize
+ %assign stack_size_padded stack_size_padded + gprsize
+ movifnidn abcdq, abcdmp
+ movifnidn mxd, mxm
+ WIN64_PUSH_XMM
movsx alphad, word [abcdq+2*0]
movsx betad, word [abcdq+2*1]
mova m12, [warp_8x8_shufA]
@@ -4162,7 +4147,7 @@ ALIGN function_align
lea tmp2d, [alphaq*3]
sub srcq, tmp1q ; src -= src_stride*3 + 3
sub betad, tmp2d ; beta -= alpha*3
- mov myd, r7m
+ mov myd, r6m
call .h
psrld m1, m0, 16
call .h
diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm
index 7897f1decc..f9043f1ad3 100644
--- a/third_party/dav1d/src/x86/mc_avx512.asm
+++ b/third_party/dav1d/src/x86/mc_avx512.asm
@@ -1276,7 +1276,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 7
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
shl mxyd, 11
@@ -2853,8 +2852,6 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
jg .v_loop0
RET
.hv:
- %assign stack_offset stack_offset - stack_size_padded
- %assign stack_size_padded 0
WIN64_SPILL_XMM 16
cmp wd, 4
je .hv_w4
diff --git a/third_party/dav1d/src/x86/mc_sse.asm b/third_party/dav1d/src/x86/mc_sse.asm
index 54939c647a..a447a80161 100644
--- a/third_party/dav1d/src/x86/mc_sse.asm
+++ b/third_party/dav1d/src/x86/mc_sse.asm
@@ -1199,7 +1199,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
RET
.v:
%if notcpuflag(ssse3)
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
%endif
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
@@ -1375,7 +1374,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
-%assign stack_offset stack_offset - stack_size_padded
%if cpuflag(ssse3)
imul mxyd, 0x08000800
WIN64_SPILL_XMM 8
@@ -1592,7 +1590,6 @@ FN put_8tap, regular, REGULAR, REGULAR
%endif
cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
-%assign org_stack_offset stack_offset
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
%if ARCH_X86_64
@@ -1618,7 +1615,6 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
movzx wd, word [base_reg+wq*2+table_offset(put,)]
add wq, base_reg
; put_bilin mangling jump
-%assign stack_offset org_stack_offset
movifnidn dsq, dsmp
movifnidn ssq, ssmp
%if WIN64
@@ -1792,7 +1788,6 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
cmovs ssd, mxd
movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
- %assign stack_offset org_stack_offset
WIN64_SPILL_XMM 16
movzx mxd, myb
shr myd, 16
@@ -2048,7 +2043,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%undef subpel2
%undef subpel3
.hv:
- %assign stack_offset org_stack_offset
+ RESET_STACK_STATE
cmp wd, 4
jg .hv_w8
%if ARCH_X86_32
@@ -2369,7 +2364,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%undef subpelv2
%undef subpelv3
.hv_w8:
- %assign stack_offset org_stack_offset
+ RESET_STACK_STATE
%define hv8_line_1 0
%define hv8_line_2 1
%define hv8_line_3 2
@@ -2843,7 +2838,6 @@ FN prep_8tap, regular, REGULAR, REGULAR
%define base 0
%endif
cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
-%assign org_stack_offset stack_offset
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
@@ -2862,7 +2856,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
add wq, base_reg
movifnidn strided, stridem
lea r6, [strideq*3]
- %assign stack_offset org_stack_offset
%if WIN64
pop r8
pop r7
@@ -3095,7 +3088,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
mov mxd, myd
and mxd, 0x7f
%else
- %assign stack_offset org_stack_offset
WIN64_SPILL_XMM 16
movzx mxd, myb
%endif
@@ -3359,7 +3351,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%undef subpel2
%undef subpel3
.hv:
- %assign stack_offset org_stack_offset
+ RESET_STACK_STATE
cmp wd, 4
jg .hv_w8
and mxd, 0x7f
@@ -3659,7 +3651,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%undef subpelv2
%undef subpelv3
.hv_w8:
- %assign stack_offset org_stack_offset
+ RESET_STACK_STATE
%define hv8_line_1 0
%define hv8_line_2 1
%define hv8_line_3 2
diff --git a/third_party/dav1d/src/x86/msac.asm b/third_party/dav1d/src/x86/msac.asm
index 9f05c921a6..4156efe914 100644
--- a/third_party/dav1d/src/x86/msac.asm
+++ b/third_party/dav1d/src/x86/msac.asm
@@ -143,10 +143,9 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
mov esp, [esp]
%endif
%endif
- not t4
sub t2d, t1d ; rng
shl t1, gprsize*8-16
- add t4, t1 ; ~dif
+ sub t4, t1 ; dif - v
.renorm3:
mov t1d, [t0+msac.cnt]
movifnidn t7, t0
@@ -157,33 +156,31 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
shl t2d, cl
shl t4, cl
mov [t7+msac.rng], t2d
- not t4
sub t1d, ecx
jae .end ; no refill required
; refill:
- mov t2, [t7+msac.buf]
- mov rcx, [t7+msac.end]
%if ARCH_X86_64 == 0
push t5
%endif
- lea t5, [t2+gprsize]
- cmp t5, rcx
+ mov t2, [t7+msac.buf]
+ mov t5, [t7+msac.end]
+ lea rcx, [t2+gprsize]
+ sub rcx, t5
ja .refill_eob
- mov t2, [t2]
- lea ecx, [t1+23]
- add t1d, 16
- shr ecx, 3 ; shift_bytes
- bswap t2
- sub t5, rcx
- shl ecx, 3 ; shift_bits
- shr t2, cl
- sub ecx, t1d ; shift_bits - 16 - cnt
- mov t1d, gprsize*8-16
- shl t2, cl
- mov [t7+msac.buf], t5
- sub t1d, ecx ; cnt + gprsize*8 - shift_bits
- xor t4, t2
+ mov t5, [t2]
+ lea ecx, [t1+16-gprsize*8]
+ not t5
+ bswap t5
+ shr t5, cl
+ neg ecx
+ shr ecx, 3 ; num_bytes_read
+ or t4, t5
+.refill_end:
+ add t2, rcx
+ lea t1d, [t1+rcx*8] ; cnt += num_bits_read
+ mov [t7+msac.buf], t2
+.refill_end2:
%if ARCH_X86_64 == 0
pop t5
%endif
@@ -191,29 +188,35 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
mov [t7+msac.cnt], t1d
mov [t7+msac.dif], t4
RET
+.pad_with_ones:
+ lea ecx, [t1-16]
+%if ARCH_X86_64
+ ror rcx, cl
+%else
+ shr ecx, cl
+%endif
+ or t4, rcx
+ jmp .refill_end2
.refill_eob: ; avoid overreading the input buffer
- mov t5, rcx
- mov ecx, gprsize*8-24
- sub ecx, t1d ; c
-.refill_eob_loop:
cmp t2, t5
- jae .refill_eob_end ; eob reached
- movzx t1d, byte [t2]
- inc t2
- shl t1, cl
- xor t4, t1
- sub ecx, 8
- jge .refill_eob_loop
-.refill_eob_end:
- mov t1d, gprsize*8-24
-%if ARCH_X86_64 == 0
- pop t5
-%endif
- sub t1d, ecx
- mov [t7+msac.buf], t2
- mov [t7+msac.dif], t4
- mov [t7+msac.cnt], t1d
- RET
+ jae .pad_with_ones ; eob reached
+ ; We can safely do a register-sized load of the last bytes of the buffer
+ ; as this code is only reached if the msac buffer size is >= gprsize.
+ mov t5, [t5-gprsize]
+ shl ecx, 3
+ shr t5, cl
+ lea ecx, [t1+16-gprsize*8]
+ not t5
+ bswap t5
+ shr t5, cl
+ neg ecx
+ or t4, t5
+ mov t5d, [t7+msac.end]
+ shr ecx, 3
+ sub t5d, t2d ; num_bytes_left
+ cmp ecx, t5d
+ cmovae ecx, t5d ; num_bytes_read
+ jmp .refill_end
cglobal msac_decode_symbol_adapt8, 0, 6, 6
DECODE_SYMBOL_ADAPT_INIT
@@ -366,7 +369,6 @@ cglobal msac_decode_bool_adapt, 0, 6, 0
%if ARCH_X86_64 == 0
movzx eax, al
%endif
- not t4
test t3d, t3d
jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
%if UNIX64 == 0
@@ -420,7 +422,6 @@ cglobal msac_decode_bool_equi, 0, 6, 0
mov ecx, 0xbfff
setb al ; the upper 32 bits contains garbage but that's OK
sub ecx, t2d
- not t4
; In this case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
; i.e. (0 <= d <= 2) and v < (3 << 14)
shr ecx, 14 ; d
@@ -447,7 +448,6 @@ cglobal msac_decode_bool, 0, 6, 0
cmovb t2d, t1d
cmovb t4, t3
setb al
- not t4
%if ARCH_X86_64 == 0
movzx eax, al
%endif
@@ -497,48 +497,45 @@ cglobal msac_decode_bool, 0, 6, 0
tzcnt eax, eax
movzx ecx, word [buf+rax+16]
movzx t2d, word [buf+rax+14]
- not t4
%if ARCH_X86_64
add t6d, 5
%endif
sub eax, 5 ; setup for merging the tok_br and tok branches
sub t2d, ecx
shl rcx, gprsize*8-16
- add t4, rcx
+ sub t4, rcx
bsr ecx, t2d
xor ecx, 15
shl t2d, cl
shl t4, cl
movd m2, t2d
mov [t7+msac.rng], t2d
- not t4
sub t5d, ecx
jae %%end
- mov t2, [t7+msac.buf]
- mov rcx, [t7+msac.end]
%if UNIX64 == 0
push t8
%endif
- lea t8, [t2+gprsize]
- cmp t8, rcx
+ mov t2, [t7+msac.buf]
+ mov t8, [t7+msac.end]
+ lea rcx, [t2+gprsize]
+ sub rcx, t8
ja %%refill_eob
- mov t2, [t2]
- lea ecx, [t5+23]
- add t5d, 16
+ mov t8, [t2]
+ lea ecx, [t5+16-gprsize*8]
+ not t8
+ bswap t8
+ shr t8, cl
+ neg ecx
shr ecx, 3
- bswap t2
- sub t8, rcx
- shl ecx, 3
- shr t2, cl
- sub ecx, t5d
- mov t5d, gprsize*8-16
- shl t2, cl
- mov [t7+msac.buf], t8
+ or t4, t8
+%%refill_end:
+ add t2, rcx
+ lea t5d, [t5+rcx*8]
+ mov [t7+msac.buf], t2
+%%refill_end2:
%if UNIX64 == 0
pop t8
%endif
- sub t5d, ecx
- xor t4, t2
%%end:
movp m3, t4
%if ARCH_X86_64
@@ -559,27 +556,34 @@ cglobal msac_decode_bool, 0, 6, 0
shr eax, 1
mov [t7+msac.cnt], t5d
RET
+%%pad_with_ones:
+ ; ensure that dif is padded with at least 15 bits of ones at the end
+ lea ecx, [t5-16]
+%if ARCH_X86_64
+ ror rcx, cl
+%else
+ shr ecx, cl
+%endif
+ or t4, rcx
+ jmp %%refill_end2
%%refill_eob:
- mov t8, rcx
- mov ecx, gprsize*8-24
- sub ecx, t5d
-%%refill_eob_loop:
cmp t2, t8
- jae %%refill_eob_end
- movzx t5d, byte [t2]
- inc t2
- shl t5, cl
- xor t4, t5
- sub ecx, 8
- jge %%refill_eob_loop
-%%refill_eob_end:
-%if UNIX64 == 0
- pop t8
-%endif
- mov t5d, gprsize*8-24
- mov [t7+msac.buf], t2
- sub t5d, ecx
- jmp %%end
+ jae %%pad_with_ones
+ mov t8, [t8-gprsize]
+ shl ecx, 3
+ shr t8, cl
+ lea ecx, [t5+16-gprsize*8]
+ not t8
+ bswap t8
+ shr t8, cl
+ neg ecx
+ or t4, t8
+ mov t8d, [t7+msac.end]
+ shr ecx, 3
+ sub t8d, t2d
+ cmp ecx, t8d
+ cmovae ecx, t8d
+ jmp %%refill_end
%endmacro
cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6