diff options
Diffstat (limited to 'third_party/dav1d/src/x86')
-rw-r--r-- | third_party/dav1d/src/x86/cdef_avx2.asm | 7 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/filmgrain16_avx2.asm | 23 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/filmgrain16_sse.asm | 8 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/filmgrain_avx2.asm | 19 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/filmgrain_sse.asm | 14 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/ipred16_avx2.asm | 18 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/ipred_avx2.asm | 106 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/ipred_sse.asm | 10 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/looprestoration_sse.asm | 8 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/mc16_avx2.asm | 6 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/mc16_avx512.asm | 3 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/mc16_sse.asm | 33 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/mc_avx2.asm | 33 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/mc_avx512.asm | 3 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/mc_sse.asm | 16 | ||||
-rw-r--r-- | third_party/dav1d/src/x86/msac.asm | 172 |
16 files changed, 157 insertions, 322 deletions
diff --git a/third_party/dav1d/src/x86/cdef_avx2.asm b/third_party/dav1d/src/x86/cdef_avx2.asm index 1f30f8a3b7..95d35fc1c8 100644 --- a/third_party/dav1d/src/x86/cdef_avx2.asm +++ b/third_party/dav1d/src/x86/cdef_avx2.asm @@ -398,7 +398,6 @@ SECTION .text INIT_YMM avx2 cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge -%assign stack_offset_entry stack_offset mov edged, edgem cmp edged, 0xf jne .border_block @@ -1195,9 +1194,9 @@ cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \ .border_block: DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge -%define rstk rsp -%assign stack_offset stack_offset_entry -%assign regs_used 11 + RESET_STACK_STATE + %assign stack_offset stack_offset - (regs_used - 11) * gprsize + %assign regs_used 11 ALLOC_STACK 2*16+(%2+4)*32, 16 %define px rsp+2*16+2*32 diff --git a/third_party/dav1d/src/x86/filmgrain16_avx2.asm b/third_party/dav1d/src/x86/filmgrain16_avx2.asm index a1d4c41f27..eda6035923 100644 --- a/third_party/dav1d/src/x86/filmgrain16_avx2.asm +++ b/third_party/dav1d/src/x86/filmgrain16_avx2.asm @@ -646,18 +646,9 @@ INIT_XMM avx2 INIT_YMM avx2 .ar2: %if WIN64 - ; xmm6 and xmm7 already saved - %assign xmm_regs_used 13 + %2 %assign stack_size_padded 136 SUB rsp, stack_size_padded - movaps [rsp+16*2], xmm8 - movaps [rsp+16*3], xmm9 - movaps [rsp+16*4], xmm10 - movaps [rsp+16*5], xmm11 - movaps [rsp+16*6], xmm12 -%if %2 - movaps [rsp+16*7], xmm13 -%endif + WIN64_PUSH_XMM 13 + %2, 8 %endif DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] @@ -747,20 +738,10 @@ INIT_YMM avx2 .ar3: %if WIN64 - ; xmm6 and xmm7 already saved %assign stack_offset 32 - %assign xmm_regs_used 14 + %2 %assign stack_size_padded 152 SUB rsp, stack_size_padded - movaps [rsp+16*2], xmm8 - movaps [rsp+16*3], xmm9 - movaps [rsp+16*4], xmm10 - movaps [rsp+16*5], xmm11 - movaps [rsp+16*6], xmm12 - movaps [rsp+16*7], xmm13 -%if %2 - movaps [rsp+16*8], xmm14 -%endif + WIN64_PUSH_XMM 14 + %2, 8 %endif DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] diff --git a/third_party/dav1d/src/x86/filmgrain16_sse.asm b/third_party/dav1d/src/x86/filmgrain16_sse.asm index 6b0daaac0b..25d01caa19 100644 --- a/third_party/dav1d/src/x86/filmgrain16_sse.asm +++ b/third_party/dav1d/src/x86/filmgrain16_sse.asm @@ -275,7 +275,6 @@ cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax .ar2: %if ARCH_X86_32 -%assign stack_offset_old stack_offset ALLOC_STACK -16*8 %endif DEFINE_ARGS buf, fg_data, bdmax, shift @@ -428,7 +427,6 @@ cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax %elif ARCH_X86_64 %define tmp rsp+stack_offset-72 %else -%assign stack_offset stack_offset_old ALLOC_STACK -16*12 %define tmp rsp mov bdmaxd, bdmaxm @@ -715,7 +713,6 @@ cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift %else DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift -%assign stack_offset_old stack_offset ALLOC_STACK -16*2 mov bufyq, r1m mov uvd, r3m @@ -831,9 +828,7 @@ cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x %else -%assign stack_offset stack_offset_old -%xdefine rstk rsp -%assign stack_size_padded 0 + RESET_STACK_STATE DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 mov bufyq, r1m mov uvd, r3m @@ -1159,7 +1154,6 @@ cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %endif %else DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift -%assign stack_offset stack_offset_old ALLOC_STACK -16*14 mov bufyq, r1m mov uvd, r3m diff --git a/third_party/dav1d/src/x86/filmgrain_avx2.asm b/third_party/dav1d/src/x86/filmgrain_avx2.asm index 55445cf593..91d8ca5c14 100644 --- a/third_party/dav1d/src/x86/filmgrain_avx2.asm +++ b/third_party/dav1d/src/x86/filmgrain_avx2.asm @@ -204,18 +204,9 @@ cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data .ar2: %if WIN64 - ; xmm6 and xmm7 already saved - %assign xmm_regs_used 16 %assign stack_size_padded 168 SUB rsp, stack_size_padded - movaps [rsp+16*2], xmm8 - movaps [rsp+16*3], xmm9 - movaps [rsp+16*4], xmm10 - movaps [rsp+16*5], xmm11 - movaps [rsp+16*6], xmm12 - movaps [rsp+16*7], xmm13 - movaps [rsp+16*8], xmm14 - movaps [rsp+16*9], xmm15 + WIN64_PUSH_XMM 16, 8 %endif DEFINE_ARGS buf, fg_data, h, x mov r6d, [fg_dataq+FGData.ar_coeff_shift] @@ -287,15 +278,9 @@ cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data INIT_YMM avx2 .ar3: %if WIN64 - ; xmm6 and xmm7 already saved - %assign stack_offset 16 ALLOC_STACK 16*14 %assign stack_size stack_size - 16*4 - %assign xmm_regs_used 12 - movaps [rsp+16*12], xmm8 - movaps [rsp+16*13], xmm9 - movaps [rsp+16*14], xmm10 - movaps [rsp+16*15], xmm11 + WIN64_PUSH_XMM 12, 8 %else ALLOC_STACK 16*12 %endif diff --git a/third_party/dav1d/src/x86/filmgrain_sse.asm b/third_party/dav1d/src/x86/filmgrain_sse.asm index 0172f98760..d06e349a8c 100644 --- a/third_party/dav1d/src/x86/filmgrain_sse.asm +++ b/third_party/dav1d/src/x86/filmgrain_sse.asm @@ -232,7 +232,6 @@ cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data .ar2: %if ARCH_X86_32 -%assign stack_offset_old stack_offset ALLOC_STACK -16*8 %endif DEFINE_ARGS buf, fg_data, shift @@ -333,7 +332,6 @@ cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data .ar3: DEFINE_ARGS buf, fg_data, shift %if ARCH_X86_32 -%assign stack_offset stack_offset_old ALLOC_STACK -16*14 %elif WIN64 SUB rsp, 16*6 @@ -601,7 +599,6 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift movifnidn bufyq, bufymp %if ARCH_X86_32 -%assign stack_offset_old stack_offset ALLOC_STACK -2*16 %endif imul uvd, 28 @@ -738,9 +735,7 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat .ar1: %if ARCH_X86_32 -%assign stack_offset stack_offset_old -%assign stack_size_padded 0 -%xdefine rstk rsp + RESET_STACK_STATE %endif DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x imul uvd, 28 @@ -881,9 +876,6 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat .ar2: %if ARCH_X86_32 -%assign stack_offset stack_offset_old -%assign stack_size_padded 0 -%xdefine rstk rsp ALLOC_STACK -8*16 %endif DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift @@ -1014,9 +1006,7 @@ cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_dat .ar3: %if ARCH_X86_32 -%assign stack_offset stack_offset_old -%assign stack_size_padded 0 -%xdefine rstk rsp + RESET_STACK_STATE %endif DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift movifnidn bufyq, bufymp diff --git a/third_party/dav1d/src/x86/ipred16_avx2.asm b/third_party/dav1d/src/x86/ipred16_avx2.asm index f4931e977b..7b52abaa10 100644 --- a/third_party/dav1d/src/x86/ipred16_avx2.asm +++ b/third_party/dav1d/src/x86/ipred16_avx2.asm @@ -946,7 +946,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights jg .w4_loop RET .w8: -%assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 vpbroadcastw m0, [tlq] ; bottom vbroadcasti128 m7, [tlq+hq*2+2] @@ -974,7 +973,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights jg .w8_loop RET .w16: -%assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 11 vpbroadcastw m0, [tlq] ; bottom movu m7, [tlq+hq*2+2] @@ -1005,7 +1003,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights jg .w16_loop RET .w32: -%assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastw m0, [tlq] ; bottom movu m7, [tlq+hq*2+ 2] @@ -1047,7 +1044,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights jg .w32_loop RET .w64: -%assign stack_offset stack_offset - stack_size_padded PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base mov dst_baseq, dstq mov tl_baseq, tlq @@ -1104,7 +1100,6 @@ cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights RET cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase - %assign org_stack_offset stack_offset lea r6, [ipred_z1_16bpc_avx2_table] tzcnt wd, wm movifnidn angled, anglem @@ -1312,7 +1307,6 @@ ALIGN function_align .w4_end: RET .w8: - %assign stack_offset org_stack_offset ALLOC_STACK -64, 7 lea r3d, [angleq+216] mov r3b, hb @@ -1476,7 +1470,6 @@ ALIGN function_align or maxbased, 16 ; imin(h+15, 31) jmp .w16_main .w16: - %assign stack_offset org_stack_offset ALLOC_STACK -96, 7 lea maxbased, [hq+15] test angled, 0x400 @@ -1622,7 +1615,6 @@ ALIGN function_align .w16_end: RET .w32: - %assign stack_offset org_stack_offset ALLOC_STACK -160, 8 lea maxbased, [hq+31] mov r3d, 63 @@ -1737,7 +1729,6 @@ ALIGN function_align .w32_end: RET .w64: - %assign stack_offset org_stack_offset ALLOC_STACK -256, 10 lea maxbased, [hq+63] test angled, 0x400 @@ -2691,7 +2682,6 @@ ALIGN function_align jmp .w32_filter_above cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase - %assign org_stack_offset stack_offset lea r6, [ipred_z3_16bpc_avx2_table] tzcnt hd, hm movifnidn angled, anglem @@ -2907,7 +2897,6 @@ ALIGN function_align RET .h8: lea r4d, [angleq+216] - %assign stack_offset org_stack_offset ALLOC_STACK -64, 8 mov r4b, wb lea r7, [strideq*3] @@ -3155,7 +3144,6 @@ ALIGN function_align jmp .h16_main ALIGN function_align .h16: - %assign stack_offset org_stack_offset ALLOC_STACK -96, 10 lea maxbased, [wq+15] lea r7, [strideq*3] @@ -3372,7 +3360,6 @@ ALIGN function_align .h16_end: RET .h32: - %assign stack_offset org_stack_offset ALLOC_STACK -160, 9 lea maxbased, [wq+31] and maxbased, 31 @@ -3557,7 +3544,6 @@ ALIGN function_align .h32_end: RET .h64: - %assign stack_offset org_stack_offset ALLOC_STACK -256, 10 lea maxbased, [wq+63] test angled, 0x400 @@ -3804,7 +3790,6 @@ ALIGN function_align ; 5 8 8 i cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter -%assign org_stack_offset stack_offset %define base r6-ipred_filter_16bpc_avx2_table lea r6, [filter_intra_taps] tzcnt wd, wm @@ -3846,7 +3831,6 @@ cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter RET ALIGN function_align .w8: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vbroadcasti128 m14, [base+filter_shuf3] vpbroadcastw m15, r8m ; bitdepth_max @@ -3883,7 +3867,6 @@ ALIGN function_align RET ALIGN function_align .w16: - %assign stack_offset stack_offset - stack_size_padded ALLOC_STACK 32, 16 vpbroadcastw m15, r8m ; bitdepth_max sub hd, 2 @@ -3977,7 +3960,6 @@ ALIGN function_align ret ALIGN function_align .w32: - %assign stack_offset org_stack_offset ALLOC_STACK 64, 16 vpbroadcastw m15, r8m ; bitdepth_max sub hd, 2 diff --git a/third_party/dav1d/src/x86/ipred_avx2.asm b/third_party/dav1d/src/x86/ipred_avx2.asm index 58e40935ac..35738e7c0b 100644 --- a/third_party/dav1d/src/x86/ipred_avx2.asm +++ b/third_party/dav1d/src/x86/ipred_avx2.asm @@ -772,7 +772,6 @@ ALIGN function_align RET ALIGN function_align .w32: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 6 movu m3, [tlq+1] punpcklbw m2, m3, m5 @@ -823,29 +822,17 @@ ALIGN function_align jl .w64_loop RET -%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used - %assign stack_offset 0 - %assign stack_size_padded 0 - %assign regs_used %2 - %xdefine rstk rsp - SETUP_STACK_POINTER %1 - %if regs_used != %2 && WIN64 - PUSH r%2 - %endif - ALLOC_STACK %1, %3 -%endmacro - cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h -%define base r6-ipred_smooth_h_avx2_table - lea r6, [ipred_smooth_h_avx2_table] +%define base r5-ipred_smooth_h_avx2_table + lea r5, [ipred_smooth_h_avx2_table] mov wd, wm vpbroadcastb m3, [tlq+wq] ; right tzcnt wd, wd mov hd, hm - movsxd wq, [r6+wq*4] + movsxd wq, [r5+wq*4] vpbroadcastd m4, [base+pb_127_m127] vpbroadcastd m5, [base+pw_128] - add wq, r6 + add wq, r5 jmp wq .w4: WIN64_SPILL_XMM 8 @@ -891,7 +878,6 @@ cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h RET ALIGN function_align .w8: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 8 vbroadcasti128 m6, [base+smooth_weights+8*2] mova m7, [base+ipred_h_shuf] @@ -927,7 +913,7 @@ ALIGN function_align RET ALIGN function_align .w16: - SETUP_STACK_FRAME 32*4, 7, 8 + ALLOC_STACK 32*4, 8 lea r3, [rsp+64*2-4] call .prep ; only worthwhile for for w16 and above sub tlq, 2 @@ -951,7 +937,7 @@ ALIGN function_align RET ALIGN function_align .w32: - SETUP_STACK_FRAME 32*4, 7, 6 + ALLOC_STACK 32*4 lea r3, [rsp+64*2-2] call .prep dec tlq @@ -971,19 +957,19 @@ ALIGN function_align RET ALIGN function_align .w64: - SETUP_STACK_FRAME 32*4, 7, 9 + ALLOC_STACK 32*4, 9 lea r3, [rsp+64*2-2] call .prep - add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table + add r5, smooth_weights+16*15-ipred_smooth_h_avx2_table dec tlq - mova xm5, [r6-16*7] - vinserti128 m5, [r6-16*5], 1 - mova xm6, [r6-16*6] - vinserti128 m6, [r6-16*4], 1 - mova xm7, [r6-16*3] - vinserti128 m7, [r6-16*1], 1 - mova xm8, [r6-16*2] - vinserti128 m8, [r6-16*0], 1 + mova xm5, [r5-16*7] + vinserti128 m5, [r5-16*5], 1 + mova xm6, [r5-16*6] + vinserti128 m6, [r5-16*4], 1 + mova xm7, [r5-16*3] + vinserti128 m7, [r5-16*1], 1 + mova xm8, [r5-16*2] + vinserti128 m8, [r5-16*0], 1 .w64_loop: vpbroadcastb m2, [tlq+hq] punpcklbw m2, m3 @@ -1113,7 +1099,6 @@ cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights RET ALIGN function_align .w8: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 mova m10, [base+ipred_h_shuf] vbroadcasti128 m11, [base+smooth_weights+8*2] @@ -1157,7 +1142,9 @@ ALIGN function_align RET ALIGN function_align .w16: - SETUP_STACK_FRAME 32*4, 7, 14 + %assign regs_used 4 + ALLOC_STACK -32*4, 14 + %assign regs_used 7 vbroadcasti128 m11, [tlq+1] lea r3, [rsp+64*2-4] punpcklbw m10, m11, m0 ; top, bottom @@ -1197,7 +1184,9 @@ ALIGN function_align RET ALIGN function_align .w32: - SETUP_STACK_FRAME 32*4, 7, 11 + %assign regs_used 4 + ALLOC_STACK -32*4, 11 + %assign regs_used 7 movu m8, [tlq+1] lea r3, [rsp+64*2-2] punpcklbw m7, m8, m0 @@ -1232,7 +1221,9 @@ ALIGN function_align RET ALIGN function_align .w64: - SETUP_STACK_FRAME 32*8, 7, 16 + %assign regs_used 4 + ALLOC_STACK -32*8, 16 + %assign regs_used 7 movu m13, [tlq+1 ] movu m15, [tlq+33] add r6, smooth_weights+16*15-ipred_smooth_avx2_table @@ -1316,7 +1307,6 @@ ALIGN function_align ret cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase - %assign org_stack_offset stack_offset lea r6, [ipred_z1_avx2_table] tzcnt wd, wm movifnidn angled, anglem @@ -1415,7 +1405,6 @@ ALIGN function_align pmovmskb r5d, m1 ret .w4_no_upsample: - %assign stack_offset org_stack_offset ALLOC_STACK -16, 11 mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter @@ -1522,7 +1511,6 @@ ALIGN function_align mov r3b, hb cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 - %assign stack_offset org_stack_offset ALLOC_STACK -32, 8 movu xm2, [z_filter_s+6] mova xm0, [tlq-1] @@ -1592,7 +1580,6 @@ ALIGN function_align or maxbased, 8 ; imin(h+7, 15) jmp .w8_main .w8_no_upsample: - %assign stack_offset org_stack_offset ALLOC_STACK -32, 10 lea maxbased, [hq+7] test angled, 0x400 @@ -1696,7 +1683,6 @@ ALIGN function_align jmp .w16_main ALIGN function_align .w16: - %assign stack_offset org_stack_offset ALLOC_STACK -64, 12 lea maxbased, [hq+15] test angled, 0x400 @@ -1816,7 +1802,6 @@ ALIGN function_align RET ALIGN function_align .w32: - %assign stack_offset org_stack_offset ALLOC_STACK -96, 15 lea r3d, [hq+31] mov maxbased, 63 @@ -1960,7 +1945,6 @@ ALIGN function_align RET ALIGN function_align .w64: - %assign stack_offset org_stack_offset ALLOC_STACK -128, 16 lea maxbased, [hq+63] test angled, 0x400 ; !enable_intra_edge_filter @@ -3001,7 +2985,6 @@ ALIGN function_align jmp .w32_filter_above cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase - %assign org_stack_offset stack_offset lea r6, [ipred_z3_avx2_table] tzcnt hd, hm movifnidn angled, anglem @@ -3102,7 +3085,6 @@ ALIGN function_align pmovmskb r5d, m1 ret .h4_no_upsample: - %assign stack_offset org_stack_offset ALLOC_STACK -16, 12 mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter @@ -3215,7 +3197,6 @@ ALIGN function_align mov r4b, wb cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 - %assign stack_offset org_stack_offset ALLOC_STACK -32, 8 and r4d, 4 mova xm0, [tlq-15] @@ -3297,7 +3278,6 @@ ALIGN function_align or maxbased, 8 ; imin(w+7, 15) jmp .h8_main .h8_no_upsample: - %assign stack_offset org_stack_offset ALLOC_STACK -32, 10 lea maxbased, [wq+7] test angled, 0x400 @@ -3455,7 +3435,6 @@ ALIGN function_align jmp .h16_main ALIGN function_align .h16: - %assign stack_offset org_stack_offset ALLOC_STACK -64, 12 lea maxbased, [wq+15] test angled, 0x400 @@ -3661,7 +3640,6 @@ ALIGN function_align RET ALIGN function_align .h32: - %assign stack_offset org_stack_offset ALLOC_STACK -96, 15 lea maxbased, [wq+31] and maxbased, 31 @@ -3890,7 +3868,6 @@ ALIGN function_align RET ALIGN function_align .h64: - %assign stack_offset org_stack_offset ALLOC_STACK -128, 16 lea maxbased, [wq+63] test angled, 0x400 ; !enable_intra_edge_filter @@ -4221,6 +4198,7 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter movzx filterd, byte filterm %endif shl filterd, 6 + WIN64_SPILL_XMM 9, 15 add filterq, r6 lea r6, [ipred_filter_avx2_table] movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4 @@ -4234,7 +4212,6 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter mov hd, hm jmp wq .w4: - WIN64_SPILL_XMM 9 mova xm8, [base+filter_shuf2] sub tlq, 3 sub tlq, hq @@ -4251,8 +4228,7 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter RET ALIGN function_align .w8: - %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 10 + WIN64_PUSH_XMM 10 mova m8, [base+filter_shuf1] FILTER_XMM 7, 0, 6, [base+filter_shuf2] vpbroadcastd m0, [tlq+4] @@ -4278,26 +4254,18 @@ ALIGN function_align RET ALIGN function_align .w16: -%if WIN64 - %assign stack_offset stack_offset - stack_size_padded - %assign xmm_regs_used 15 - %assign stack_size_padded 0x98 - SUB rsp, stack_size_padded -%endif sub hd, 2 - TAIL_CALL .w16_main, 0 -.w16_main: + call .w16_main %if WIN64 - movaps [rsp+0xa8], xmm6 - movaps [rsp+0xb8], xmm7 - movaps [rsp+0x28], xmm8 - movaps [rsp+0x38], xmm9 - movaps [rsp+0x48], xmm10 - movaps [rsp+0x58], xmm11 - movaps [rsp+0x68], xmm12 - movaps [rsp+0x78], xmm13 - movaps [rsp+0x88], xmm14 + jmp .end +%else + RET %endif +.w16_main: + ; The spills are into the callers stack frame + %assign stack_size stack_size + gprsize + WIN64_PUSH_XMM 15, 9 + %assign stack_size stack_size - gprsize FILTER_XMM 12, 0, 7, [base+filter_shuf2] vpbroadcastd m0, [tlq+5] vpblendd m0, [tlq-12], 0x14 @@ -4350,7 +4318,6 @@ ALIGN function_align ret ALIGN function_align .w32: - sub rsp, stack_size_padded sub hd, 2 lea r3, [dstq+16] lea r5d, [hq-2] @@ -4415,6 +4382,7 @@ ALIGN function_align shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm6 +.end: RET ALIGN function_align .main: diff --git a/third_party/dav1d/src/x86/ipred_sse.asm b/third_party/dav1d/src/x86/ipred_sse.asm index 976f33a24b..f6b0cad001 100644 --- a/third_party/dav1d/src/x86/ipred_sse.asm +++ b/third_party/dav1d/src/x86/ipred_sse.asm @@ -670,10 +670,7 @@ ALIGN function_align RET ALIGN function_align .w32: -%if WIN64 - movaps [rsp+24], xmm7 - %define xmm_regs_used 8 -%endif + WIN64_PUSH_XMM 8, 7 mova m7, m5 .w32_loop_init: mov r3d, 2 @@ -705,10 +702,7 @@ ALIGN function_align RET ALIGN function_align .w64: -%if WIN64 - movaps [rsp+24], xmm7 - %define xmm_regs_used 8 -%endif + WIN64_PUSH_XMM 8, 7 mova m7, m5 .w64_loop_init: mov r3d, 4 diff --git a/third_party/dav1d/src/x86/looprestoration_sse.asm b/third_party/dav1d/src/x86/looprestoration_sse.asm index 01eb6fa348..b5c73a51d4 100644 --- a/third_party/dav1d/src/x86/looprestoration_sse.asm +++ b/third_party/dav1d/src/x86/looprestoration_sse.asm @@ -42,7 +42,6 @@ pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_right_ext_mask: times 24 db 0xff times 8 db 0 pb_1: times 16 db 1 -pb_3: times 16 db 3 pw_256: times 8 dw 256 pw_2056: times 8 dw 2056 pw_m16380: times 8 dw -16380 @@ -290,7 +289,7 @@ cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstrid call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v jmp .v1 .extend_right: - movd m2, [lpfq-4] + movd m2, [lpfq-1] %if ARCH_X86_64 push r0 lea r0, [pb_right_ext_mask+21] @@ -302,10 +301,11 @@ cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstrid movu m1, [r6+xq+8] %endif %if cpuflag(ssse3) - pshufb m2, [base+pb_3] + pxor m3, m3 + pshufb m2, m3 %else punpcklbw m2, m2 - pshuflw m2, m2, q3333 + pshuflw m2, m2, q0000 punpcklqdq m2, m2 %endif pand m4, m0 diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm index 61eeaa1007..42e2a5525e 100644 --- a/third_party/dav1d/src/x86/mc16_avx2.asm +++ b/third_party/dav1d/src/x86/mc16_avx2.asm @@ -1337,7 +1337,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my cmp wd, 4 je .h_w4 jl .h_w2 - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 13 shr mxd, 16 sub srcq, 6 @@ -1415,7 +1414,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my cmp hd, 4 cmovle myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastd m6, [pd_32] vpbroadcastw m7, r8m @@ -1590,7 +1588,6 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my jg .v_w8_loop0 RET .hv: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vpbroadcastw m15, r8m cmp wd, 4 @@ -2046,7 +2043,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my shr mxd, 16 sub srcq, 6 vpbroadcastq m0, [base+subpel_filters+mxq*8] - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 vbroadcasti128 m6, [subpel_h_shufA] vbroadcasti128 m7, [subpel_h_shufB] @@ -2125,7 +2121,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my cmp hd, 4 cmovle myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastd m7, [prep_8tap_1d_rnd] lea r6, [strideq*3] @@ -2264,7 +2259,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %endif RET .hv: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vpbroadcastd m15, [prep_8tap_2d_rnd] cmp wd, 4 diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm index 585ba53e08..e5de7ecd96 100644 --- a/third_party/dav1d/src/x86/mc16_avx512.asm +++ b/third_party/dav1d/src/x86/mc16_avx512.asm @@ -2377,7 +2377,6 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my jg .hv_w16_loop RET .hv_w32: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 32 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] @@ -3175,7 +3174,6 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 jg .hv_w8_loop RET .hv_w16: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 27 vbroadcasti32x8 m5, [srcq+strideq*0+ 8] vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 @@ -3313,7 +3311,6 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 RET .hv_w32: %if WIN64 - %assign stack_offset stack_offset - stack_size_padded PUSH r8 %assign regs_used regs_used + 1 WIN64_SPILL_XMM 32 diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm index fde8e372a3..b0c42597f7 100644 --- a/third_party/dav1d/src/x86/mc16_sse.asm +++ b/third_party/dav1d/src/x86/mc16_sse.asm @@ -1302,10 +1302,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my jg .h_w4_loop RET .h_w8: -%if WIN64 - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 -%endif shr mxd, 16 movq m3, [base+subpel_filters+mxq*8] movifnidn dstq, dstmp @@ -1383,14 +1380,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my cmp hd, 6 cmovb myd, mxd movq m3, [base+subpel_filters+myq*8] -%if STACK_ALIGNMENT < 16 - %xdefine rstk rsp -%else - %assign stack_offset stack_offset - stack_size_padded -%endif -%if WIN64 WIN64_SPILL_XMM 15 -%endif movd m7, r8m movifnidn dstq, dstmp movifnidn dsq, dsmp @@ -1604,11 +1594,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my jg .v_w4_loop0 RET .hv: -%if STACK_ALIGNMENT < 16 - %xdefine rstk rsp -%else - %assign stack_offset stack_offset - stack_size_padded -%endif + RESET_STACK_STATE %if ARCH_X86_32 movd m4, r8m mova m6, [base+pd_512] @@ -1750,11 +1736,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my cmovb myd, mxd movq m3, [base+subpel_filters+myq*8] %if ARCH_X86_32 -%if STACK_ALIGNMENT < 16 - %xdefine rstk rsp -%else - %assign stack_offset stack_offset - stack_size_padded -%endif + RESET_STACK_STATE mov dstq, dstmp mov dsq, dsmp mova m0, [base+spel_h_shufA] @@ -2182,11 +2164,6 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my cmp hd, 4 cmove myd, mxd movq m3, [base+subpel_filters+myq*8] -%if STACK_ALIGNMENT < 16 - %xdefine rstk rsp -%else - %assign stack_offset stack_offset - stack_size_padded -%endif WIN64_SPILL_XMM 15 movddup m7, [base+prep_8tap_1d_rnd] movifnidn ssq, r2mp @@ -2339,11 +2316,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my jg .v_loop0 RET .hv: -%if STACK_ALIGNMENT < 16 - %xdefine rstk rsp -%else - %assign stack_offset stack_offset - stack_size_padded -%endif + RESET_STACK_STATE movzx t3d, mxb shr mxd, 16 cmp wd, 4 diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm index 3b208033bd..58e3cb5af1 100644 --- a/third_party/dav1d/src/x86/mc_avx2.asm +++ b/third_party/dav1d/src/x86/mc_avx2.asm @@ -1259,7 +1259,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 7 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 @@ -1620,7 +1619,6 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .h_loop RET .v: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 movzx mxd, myb shr myd, 16 @@ -1834,7 +1832,6 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .v_w16_loop0 RET .hv: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 cmp wd, 4 jg .hv_w8 @@ -2247,7 +2244,6 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .h_loop RET .v: - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. shr myd, 16 ; Note that the code is 8-tap only, having @@ -2430,8 +2426,6 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .v_w16_loop0 RET .hv: - %assign stack_offset stack_offset - stack_size_padded - %assign stack_size_padded 0 WIN64_SPILL_XMM 16 cmp wd, 4 je .hv_w4 @@ -4108,10 +4102,9 @@ cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \ beta, filter, tmp1, delta, my, gamma %if WIN64 - sub rsp, 0xa0 %assign xmm_regs_used 16 %assign stack_size_padded 0xa0 - %assign stack_offset stack_offset+stack_size_padded + SUB rsp, stack_size_padded %endif call .main jmp .start @@ -4134,21 +4127,13 @@ cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, RET ALIGN function_align .main: - ; Stack args offset by one (r4m -> r5m etc.) due to call -%if WIN64 - mov abcdq, r5m - mov mxd, r6m - movaps [rsp+stack_offset+0x10], xmm6 - movaps [rsp+stack_offset+0x20], xmm7 - movaps [rsp+0x28], xmm8 - movaps [rsp+0x38], xmm9 - movaps [rsp+0x48], xmm10 - movaps [rsp+0x58], xmm11 - movaps [rsp+0x68], xmm12 - movaps [rsp+0x78], xmm13 - movaps [rsp+0x88], xmm14 - movaps [rsp+0x98], xmm15 -%endif + ; Stack is offset due to call + %assign stack_offset stack_offset + gprsize + %assign stack_size stack_size + gprsize + %assign stack_size_padded stack_size_padded + gprsize + movifnidn abcdq, abcdmp + movifnidn mxd, mxm + WIN64_PUSH_XMM movsx alphad, word [abcdq+2*0] movsx betad, word [abcdq+2*1] mova m12, [warp_8x8_shufA] @@ -4162,7 +4147,7 @@ ALIGN function_align lea tmp2d, [alphaq*3] sub srcq, tmp1q ; src -= src_stride*3 + 3 sub betad, tmp2d ; beta -= alpha*3 - mov myd, r7m + mov myd, r6m call .h psrld m1, m0, 16 call .h diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm index 7897f1decc..f9043f1ad3 100644 --- a/third_party/dav1d/src/x86/mc_avx512.asm +++ b/third_party/dav1d/src/x86/mc_avx512.asm @@ -1276,7 +1276,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 7 movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 @@ -2853,8 +2852,6 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 jg .v_loop0 RET .hv: - %assign stack_offset stack_offset - stack_size_padded - %assign stack_size_padded 0 WIN64_SPILL_XMM 16 cmp wd, 4 je .hv_w4 diff --git a/third_party/dav1d/src/x86/mc_sse.asm b/third_party/dav1d/src/x86/mc_sse.asm index 54939c647a..a447a80161 100644 --- a/third_party/dav1d/src/x86/mc_sse.asm +++ b/third_party/dav1d/src/x86/mc_sse.asm @@ -1199,7 +1199,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 RET .v: %if notcpuflag(ssse3) - %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 8 %endif movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] @@ -1375,7 +1374,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] -%assign stack_offset stack_offset - stack_size_padded %if cpuflag(ssse3) imul mxyd, 0x08000800 WIN64_SPILL_XMM 8 @@ -1592,7 +1590,6 @@ FN put_8tap, regular, REGULAR, REGULAR %endif cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 -%assign org_stack_offset stack_offset imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h %if ARCH_X86_64 @@ -1618,7 +1615,6 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 movzx wd, word [base_reg+wq*2+table_offset(put,)] add wq, base_reg ; put_bilin mangling jump -%assign stack_offset org_stack_offset movifnidn dsq, dsmp movifnidn ssq, ssmp %if WIN64 @@ -1792,7 +1788,6 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 cmovs ssd, mxd movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] %else - %assign stack_offset org_stack_offset WIN64_SPILL_XMM 16 movzx mxd, myb shr myd, 16 @@ -2048,7 +2043,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %undef subpel2 %undef subpel3 .hv: - %assign stack_offset org_stack_offset + RESET_STACK_STATE cmp wd, 4 jg .hv_w8 %if ARCH_X86_32 @@ -2369,7 +2364,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %undef subpelv2 %undef subpelv3 .hv_w8: - %assign stack_offset org_stack_offset + RESET_STACK_STATE %define hv8_line_1 0 %define hv8_line_2 1 %define hv8_line_3 2 @@ -2843,7 +2838,6 @@ FN prep_8tap, regular, REGULAR, REGULAR %define base 0 %endif cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 -%assign org_stack_offset stack_offset imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 @@ -2862,7 +2856,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 add wq, base_reg movifnidn strided, stridem lea r6, [strideq*3] - %assign stack_offset org_stack_offset %if WIN64 pop r8 pop r7 @@ -3095,7 +3088,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mov mxd, myd and mxd, 0x7f %else - %assign stack_offset org_stack_offset WIN64_SPILL_XMM 16 movzx mxd, myb %endif @@ -3359,7 +3351,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %undef subpel2 %undef subpel3 .hv: - %assign stack_offset org_stack_offset + RESET_STACK_STATE cmp wd, 4 jg .hv_w8 and mxd, 0x7f @@ -3659,7 +3651,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %undef subpelv2 %undef subpelv3 .hv_w8: - %assign stack_offset org_stack_offset + RESET_STACK_STATE %define hv8_line_1 0 %define hv8_line_2 1 %define hv8_line_3 2 diff --git a/third_party/dav1d/src/x86/msac.asm b/third_party/dav1d/src/x86/msac.asm index 9f05c921a6..4156efe914 100644 --- a/third_party/dav1d/src/x86/msac.asm +++ b/third_party/dav1d/src/x86/msac.asm @@ -143,10 +143,9 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6 mov esp, [esp] %endif %endif - not t4 sub t2d, t1d ; rng shl t1, gprsize*8-16 - add t4, t1 ; ~dif + sub t4, t1 ; dif - v .renorm3: mov t1d, [t0+msac.cnt] movifnidn t7, t0 @@ -157,33 +156,31 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6 shl t2d, cl shl t4, cl mov [t7+msac.rng], t2d - not t4 sub t1d, ecx jae .end ; no refill required ; refill: - mov t2, [t7+msac.buf] - mov rcx, [t7+msac.end] %if ARCH_X86_64 == 0 push t5 %endif - lea t5, [t2+gprsize] - cmp t5, rcx + mov t2, [t7+msac.buf] + mov t5, [t7+msac.end] + lea rcx, [t2+gprsize] + sub rcx, t5 ja .refill_eob - mov t2, [t2] - lea ecx, [t1+23] - add t1d, 16 - shr ecx, 3 ; shift_bytes - bswap t2 - sub t5, rcx - shl ecx, 3 ; shift_bits - shr t2, cl - sub ecx, t1d ; shift_bits - 16 - cnt - mov t1d, gprsize*8-16 - shl t2, cl - mov [t7+msac.buf], t5 - sub t1d, ecx ; cnt + gprsize*8 - shift_bits - xor t4, t2 + mov t5, [t2] + lea ecx, [t1+16-gprsize*8] + not t5 + bswap t5 + shr t5, cl + neg ecx + shr ecx, 3 ; num_bytes_read + or t4, t5 +.refill_end: + add t2, rcx + lea t1d, [t1+rcx*8] ; cnt += num_bits_read + mov [t7+msac.buf], t2 +.refill_end2: %if ARCH_X86_64 == 0 pop t5 %endif @@ -191,29 +188,35 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6 mov [t7+msac.cnt], t1d mov [t7+msac.dif], t4 RET +.pad_with_ones: + lea ecx, [t1-16] +%if ARCH_X86_64 + ror rcx, cl +%else + shr ecx, cl +%endif + or t4, rcx + jmp .refill_end2 .refill_eob: ; avoid overreading the input buffer - mov t5, rcx - mov ecx, gprsize*8-24 - sub ecx, t1d ; c -.refill_eob_loop: cmp t2, t5 - jae .refill_eob_end ; eob reached - movzx t1d, byte [t2] - inc t2 - shl t1, cl - xor t4, t1 - sub ecx, 8 - jge .refill_eob_loop -.refill_eob_end: - mov t1d, gprsize*8-24 -%if ARCH_X86_64 == 0 - pop t5 -%endif - sub t1d, ecx - mov [t7+msac.buf], t2 - mov [t7+msac.dif], t4 - mov [t7+msac.cnt], t1d - RET + jae .pad_with_ones ; eob reached + ; We can safely do a register-sized load of the last bytes of the buffer + ; as this code is only reached if the msac buffer size is >= gprsize. + mov t5, [t5-gprsize] + shl ecx, 3 + shr t5, cl + lea ecx, [t1+16-gprsize*8] + not t5 + bswap t5 + shr t5, cl + neg ecx + or t4, t5 + mov t5d, [t7+msac.end] + shr ecx, 3 + sub t5d, t2d ; num_bytes_left + cmp ecx, t5d + cmovae ecx, t5d ; num_bytes_read + jmp .refill_end cglobal msac_decode_symbol_adapt8, 0, 6, 6 DECODE_SYMBOL_ADAPT_INIT @@ -366,7 +369,6 @@ cglobal msac_decode_bool_adapt, 0, 6, 0 %if ARCH_X86_64 == 0 movzx eax, al %endif - not t4 test t3d, t3d jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3 %if UNIX64 == 0 @@ -420,7 +422,6 @@ cglobal msac_decode_bool_equi, 0, 6, 0 mov ecx, 0xbfff setb al ; the upper 32 bits contains garbage but that's OK sub ecx, t2d - not t4 ; In this case of this function, (d =) 16 - clz(v) = 2 - (v >> 14) ; i.e. (0 <= d <= 2) and v < (3 << 14) shr ecx, 14 ; d @@ -447,7 +448,6 @@ cglobal msac_decode_bool, 0, 6, 0 cmovb t2d, t1d cmovb t4, t3 setb al - not t4 %if ARCH_X86_64 == 0 movzx eax, al %endif @@ -497,48 +497,45 @@ cglobal msac_decode_bool, 0, 6, 0 tzcnt eax, eax movzx ecx, word [buf+rax+16] movzx t2d, word [buf+rax+14] - not t4 %if ARCH_X86_64 add t6d, 5 %endif sub eax, 5 ; setup for merging the tok_br and tok branches sub t2d, ecx shl rcx, gprsize*8-16 - add t4, rcx + sub t4, rcx bsr ecx, t2d xor ecx, 15 shl t2d, cl shl t4, cl movd m2, t2d mov [t7+msac.rng], t2d - not t4 sub t5d, ecx jae %%end - mov t2, [t7+msac.buf] - mov rcx, [t7+msac.end] %if UNIX64 == 0 push t8 %endif - lea t8, [t2+gprsize] - cmp t8, rcx + mov t2, [t7+msac.buf] + mov t8, [t7+msac.end] + lea rcx, [t2+gprsize] + sub rcx, t8 ja %%refill_eob - mov t2, [t2] - lea ecx, [t5+23] - add t5d, 16 + mov t8, [t2] + lea ecx, [t5+16-gprsize*8] + not t8 + bswap t8 + shr t8, cl + neg ecx shr ecx, 3 - bswap t2 - sub t8, rcx - shl ecx, 3 - shr t2, cl - sub ecx, t5d - mov t5d, gprsize*8-16 - shl t2, cl - mov [t7+msac.buf], t8 + or t4, t8 +%%refill_end: + add t2, rcx + lea t5d, [t5+rcx*8] + mov [t7+msac.buf], t2 +%%refill_end2: %if UNIX64 == 0 pop t8 %endif - sub t5d, ecx - xor t4, t2 %%end: movp m3, t4 %if ARCH_X86_64 @@ -559,27 +556,34 @@ cglobal msac_decode_bool, 0, 6, 0 shr eax, 1 mov [t7+msac.cnt], t5d RET +%%pad_with_ones: + ; ensure that dif is padded with at least 15 bits of ones at the end + lea ecx, [t5-16] +%if ARCH_X86_64 + ror rcx, cl +%else + shr ecx, cl +%endif + or t4, rcx + jmp %%refill_end2 %%refill_eob: - mov t8, rcx - mov ecx, gprsize*8-24 - sub ecx, t5d -%%refill_eob_loop: cmp t2, t8 - jae %%refill_eob_end - movzx t5d, byte [t2] - inc t2 - shl t5, cl - xor t4, t5 - sub ecx, 8 - jge %%refill_eob_loop -%%refill_eob_end: -%if UNIX64 == 0 - pop t8 -%endif - mov t5d, gprsize*8-24 - mov [t7+msac.buf], t2 - sub t5d, ecx - jmp %%end + jae %%pad_with_ones + mov t8, [t8-gprsize] + shl ecx, 3 + shr t8, cl + lea ecx, [t5+16-gprsize*8] + not t8 + bswap t8 + shr t8, cl + neg ecx + or t4, t8 + mov t8d, [t7+msac.end] + shr ecx, 3 + sub t8d, t2d + cmp ecx, t8d + cmovae ecx, t8d + jmp %%refill_end %endmacro cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6 |