Diffstat (limited to 'third_party/dav1d/src/x86/filmgrain_sse.asm')
-rw-r--r-- | third_party/dav1d/src/x86/filmgrain_sse.asm | 3233
1 file changed, 3233 insertions, 0 deletions
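For orientation before the diff body: generate_grain_y_8bpc below vectorizes the AV1 film-grain PRNG, running four 16-bit LFSR states in one XMM register and using each 11-bit output as an index into the Gaussian sequence table. An illustrative scalar C sketch of the same computation (function names here are illustrative, following dav1d's C reference implementation; round2 is the usual rounding right-shift):

    #include <stdint.h>

    static inline int round2(const int x, const int shift) {
        return (x + ((1 << shift) >> 1)) >> shift;
    }

    /* 16-bit LFSR from the AV1 spec, taps 0/1/3/12; the
     * rnd_next_upperbit_mask constants in the RODATA section are these
     * taps, pre-shifted for the four SIMD lanes. */
    static inline int get_random_number(const int bits, unsigned *const state) {
        const unsigned r = *state;
        const unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
        *state = (r >> 1) | (bit << 15);
        return (*state >> (16 - bits)) & ((1 << bits) - 1);
    }

    /* Fill the 82x73 luma grain template (8 bpc). The pmulhrsw with the
     * round[] constant in the asm is this round2 by 4+grain_scale_shift. */
    void generate_grain_y_ref(int8_t buf[73][82],
                              const int16_t gaussian_sequence[2048],
                              unsigned seed, const int grain_scale_shift) {
        const int shift = 4 + grain_scale_shift;
        for (int y = 0; y < 73; y++)
            for (int x = 0; x < 82; x++)
                buf[y][x] = (int8_t)round2(
                    gaussian_sequence[get_random_number(11, &seed)], shift);
    }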
diff --git a/third_party/dav1d/src/x86/filmgrain_sse.asm b/third_party/dav1d/src/x86/filmgrain_sse.asm new file mode 100644 index 0000000000..0172f98760 --- /dev/null +++ b/third_party/dav1d/src/x86/filmgrain_sse.asm @@ -0,0 +1,3233 @@ +; Copyright © 2019-2021, VideoLAN and dav1d authors +; Copyright © 2019, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +SECTION_RODATA + +pw_1024: times 8 dw 1024 +pb_27_17_17_27: db 27, 17, 17, 27 + times 6 db 0, 32 +pb_23_22_h: db 23, 22 + times 7 db 0, 32 +pb_27_17: times 8 db 27, 17 +pb_17_27: times 8 db 17, 27 +pb_23_22: times 8 db 23, 22 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512 +max: dw 255, 240, 235 +min: dw 0, 16 +pw_1: dw 1 + +%macro JMP_TABLE 2-* + %xdefine %1_8bpc_%2_table %%table + %xdefine %%base %1_8bpc_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .ar%3 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3 + +SECTION .text + +%if ARCH_X86_32 +%define PIC_ptr(a) base+a +%else +%define PIC_ptr(a) a +%endif + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +INIT_XMM ssse3 +cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data + LEA r4, $$ +%define base r4-$$ + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r2d, [fg_dataq+FGData.grain_scale_shift] + movd m2, [base+round+r2*2] + movd m0, [fg_dataq+FGData.seed] + mova m5, [base+pb_mask] + pshuflw m2, m2, q0000 + pshuflw m0, m0, q0000 + mov r2, -73*82 + sub bufq, r2 + lea r3, 
[base+gaussian_sequence] +.loop: + pand m6, m0, m1 + psrlw m3, m6, 10 + por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m6, m4 ; bits 0x0f00 are set + pshufb m3, m5, m6 ; set 15th bit for next 4 seeds + psllq m6, m3, 30 + por m3, m6 + psllq m6, m3, 15 + por m3, m6 ; aggregate each bit into next seed's high bit + pmulhuw m6, m0, m7 + por m3, m6 ; 4 next output seeds + pshuflw m0, m3, q3333 + psrlw m3, 5 +%if ARCH_X86_64 + movq r6, m3 + mov r8, r6 + movzx r5d, r6w + shr r6d, 16 + shr r8, 32 + movzx r7, r8w + shr r8, 16 + + movd m6, [r3+r5*2] + pinsrw m6, [r3+r6*2], 1 + pinsrw m6, [r3+r7*2], 2 + pinsrw m6, [r3+r8*2], 3 +%else + movd r6, m3 + pshuflw m3, m3, q3232 + movzx r5, r6w + shr r6, 16 + + movd m6, [r3+r5*2] + pinsrw m6, [r3+r6*2], 1 + + movd r6, m3 + movzx r5, r6w + shr r6, 16 + + pinsrw m6, [r3+r5*2], 2 + pinsrw m6, [r3+r6*2], 3 +%endif + pmulhrsw m6, m2 + packsswb m6, m6 + movd [bufq+r2], m6 + add r2, 4 + jl .loop + + ; auto-regression code + movsxd r2, [fg_dataq+FGData.ar_coeff_lag] + movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4] + lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table] + jmp r2 + +.ar1: +%if ARCH_X86_32 + DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max +%elif WIN64 + DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0 + mov bufq, r0 +%else + DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 +%endif + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd m4, [fg_dataq+FGData.ar_coeffs_y] + mov ecx, [fg_dataq+FGData.ar_coeff_shift] +%if ARCH_X86_32 + mov r1m, cf3d + DEFINE_ARGS buf, shift, val3, min, max, x, val0 +%define hd r0mp +%define cf3d r1mp +%elif WIN64 + DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0 +%else + DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 +%endif + pxor m6, m6 + pcmpgtb m7, m6, m4 + punpcklbw m4, m7 + pinsrw m4, [base+pw_1], 3 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + movd m3, [base+round_vals+shiftq*2-12] ; rnd + pshuflw m3, m3, q0000 + sub bufq, 82*73-(82*3+79) + mov hd, 70 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -76 + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + movq m0, [bufq+xq-82-1] ; top/left + pcmpgtb m7, m6, m0 + punpcklbw m0, m7 + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 + punpcklwd m1, m3 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -16*8 +%endif + DEFINE_ARGS buf, fg_data, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m6, [base+round_vals-12+shiftq*2] + movd m7, [base+byte_blend+1] + SCRATCH 7, 15, 7 + movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 + movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 + pxor m7, m7 + pshuflw m6, m6, q0000 + punpcklwd m6, m7 + pcmpgtb m4, m7, m0 + pcmpgtb m5, m7, m1 + punpcklbw m0, m4 + punpcklbw m1, m5 + DEFINE_ARGS buf, fg_data, h, x + pshufd m4, m1, q0000 + pshufd m5, m1, q1111 + pshufd m3, m0, q3333 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + 
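A scalar view of the .ar1 path above, as an illustrative sketch (round2 as in the sketch near the top; the function name is illustrative): the three top taps are what the pmaddwd pairs compute, and cf3 stays in a general-purpose register because each output pixel feeds the next pixel's "left" tap.

    static inline int iclip(const int v, const int lo, const int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* AV1 lag-1 auto-regression over the luma grain template:
     * 70 rows x 76 columns inside a 3-pixel border.
     * cf = FGData.ar_coeffs_y[0..3]; iclip bounds are the 8 bpc
     * grain range. */
    void apply_ar1_y(int8_t buf[73][82], const int8_t cf[4],
                     const int ar_coeff_shift) {
        for (int y = 3; y < 73; y++)
            for (int x = 3; x < 3 + 76; x++) {
                const int sum = cf[0] * buf[y - 1][x - 1]  /* top-left  */
                              + cf[1] * buf[y - 1][x    ]  /* top       */
                              + cf[2] * buf[y - 1][x + 1]  /* top-right */
                              + cf[3] * buf[y    ][x - 1]; /* left      */
                buf[y][x] = (int8_t)iclip(buf[y][x] +
                                          round2(sum, ar_coeff_shift),
                                          -128, 127);
            }
    }

The .ar2 and .ar3 paths that follow are the same recurrence with lag-2 (12-tap) and lag-3 (24-tap) neighbourhoods; only the current-row left taps are serialized in the inner loop, everything above is accumulated with pmaddwd.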
SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + SCRATCH 6, 14, 6 + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pcmpgtb m2, m7, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + psrldq m5, m0, 2 ; y=-2,x=[-1,+5] + psrldq m3, m1, 2 ; y=-1,x=[-1,+5] + psrldq m4, m1, 4 ; y=-1,x=[+0,+5] + punpcklwd m2, m0, m5 + punpcklwd m3, m4 + pmaddwd m2, m8 + pmaddwd m3, m11 + paddd m2, m3 + + psrldq m4, m0, 4 ; y=-2,x=[+0,+5] + psrldq m5, m0, 6 ; y=-2,x=[+1,+5] + psrldq m6, m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m4, m5 + punpcklwd m6, m1 + psrldq m5, m1, 6 ; y=-1,x=[+1,+5] + psrldq m1, m1, 8 ; y=-1,x=[+2,+5] + punpcklwd m5, m1 + pmaddwd m4, m9 + pmaddwd m6, m10 + pmaddwd m5, m12 + paddd m4, m6 + paddd m2, m5 + paddd m2, m4 + paddd m2, m14 + + movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pcmpgtb m4, m7, m0 + punpcklbw m1, m0, m4 + pmaddwd m3, m1, m13 + paddd m3, m2 + psrldq m1, 4 ; y=0,x=0 + psrldq m2, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + paddw m3, m1 + packsswb m3, m3 + pslldq m3, 2 + pand m3, m15 + pandn m1, m15, m0 + por m0, m1, m3 + psrldq m0, 1 + ; overwrite 2 pixels, but that's ok + movd [bufq+xq-1], m0 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, shift +%if ARCH_X86_32 +%assign stack_offset stack_offset_old + ALLOC_STACK -16*14 +%elif WIN64 + SUB rsp, 16*6 +%assign stack_size_padded (stack_size_padded+16*6) +%assign stack_size (stack_size+16*6) +%else + ALLOC_STACK -16*6 +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m6, [base+round_vals-12+shiftq*2] + movd m7, [base+byte_blend] + movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 + movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pxor m3, m3 + pcmpgtb m4, m3, m0 + pcmpgtb m3, m2 + pshuflw m6, m6, q0000 + SCRATCH 6, 14, 12 + SCRATCH 7, 15, 13 + punpckhbw m1, m0, m4 + punpcklbw m0, m4 + punpcklbw m2, m3 + pshufd m3, m0, q1111 + pshufd m4, m0, q2222 + pshufd m5, m0, q3333 + pshufd m0, m0, q0000 + mova [rsp+ 0*16], m0 + mova [rsp+ 1*16], m3 + mova [rsp+ 2*16], m4 + mova [rsp+ 3*16], m5 + pshufd m6, m1, q1111 + pshufd m7, m1, q2222 + pshufd m5, m1, q3333 + pshufd m1, m1, q0000 + pshufd m3, m2, q1111 + psrldq m0, m2, 10 + pinsrw m2, [base+pw_1], 5 + pshufd m4, m2, q2222 + pshufd m2, m2, q0000 + pinsrw m0, [base+round_vals+shiftq*2-10], 3 + mova [rsp+ 4*16], m1 + mova [rsp+ 5*16], m6 + SCRATCH 7, 8, 6 + SCRATCH 5, 9, 7 + SCRATCH 2, 10, 8 + SCRATCH 3, 11, 9 + SCRATCH 4, 12, 10 + SCRATCH 0, 13, 11 + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + pxor m3, m3 + pcmpgtb m3, m0 + punpckhbw m2, m0, m3 + punpcklbw m0, m3 + + psrldq m5, m0, 2 + psrldq m6, m0, 4 + psrldq m7, m0, 6 + punpcklwd m4, m0, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 0*16] + pmaddwd m6, [rsp+ 1*16] + paddd m4, m6 + + movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + pxor m5, m5 + pcmpgtb m5, m1 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + palignr m6, m2, m0, 10 + palignr m7, m2, m0, 12 + psrldq m0, 8 + punpcklwd m0, m6 + punpcklwd m7, m1 + pmaddwd m0, [rsp+ 2*16] + pmaddwd m7, [rsp+ 3*16] + paddd m0, m7 + paddd m0, m4 + + psrldq m4, m1, 2 + psrldq m5, m1, 4 + psrldq 
m6, m1, 6 + psrldq m7, m1, 8 + punpcklwd m4, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 4*16] + pmaddwd m6, [rsp+ 5*16] + paddd m4, m6 + paddd m0, m4 + + movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor m7, m7 + pcmpgtb m7, m2 + punpckhbw m5, m2, m7 + punpcklbw m2, m7 + palignr m7, m3, m1, 10 + palignr m3, m1, 12 + psrldq m1, m2, 2 + punpcklwd m7, m3 + punpcklwd m3, m2, m1 + pmaddwd m7, m8 + pmaddwd m3, m9 + paddd m7, m3 + paddd m0, m7 + + psrldq m6, m2, 4 + psrldq m1, m2, 6 + psrldq m3, m2, 8 + palignr m4, m5, m2, 10 + palignr m5, m5, m2, 12 + + punpcklwd m6, m1 + punpcklwd m3, m4 + punpcklwd m5, m14 + pmaddwd m6, m10 + pmaddwd m3, m11 + pmaddwd m5, m12 + paddd m0, m6 + paddd m3, m5 + paddd m0, m3 + + movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pxor m5, m5 + pcmpgtb m5, m1 + punpcklbw m2, m1, m5 + pmaddwd m2, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + packsswb m2, m2 + pslldq m2, 3 + pand m2, m15 + pandn m3, m15, m1 + por m1, m2, m3 + movd [bufq+xq-3], m1 + psrldq m1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + dec hd + jg .y_loop_ar3 + RET + +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM ssse3 +cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv + movifnidn r2, r2mp + movifnidn r3, r3mp + LEA r4, $$ +%define base r4-$$ + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + movd m6, [base+round+r5*2] + mova m5, [base+pb_mask] + movd m0, [fg_dataq+FGData.seed] + movd m2, [base+pw_seed_xor+uvq*4] + pxor m0, m2 + pshuflw m6, m6, q0000 + pshuflw m0, m0, q0000 + lea r6, [base+gaussian_sequence] +%if %2 +%if ARCH_X86_64 + mov r7d, 73-35*%3 +%else + mov r3mp, 73-35*%3 +%endif + add bufq, 44 +.loop_y: + mov r5, -44 +.loop_x: +%else + mov r5, -82*73 + sub bufq, r5 +.loop: +%endif + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m3, m2 + psllq m2, m3, 15 + por m3, m2 ; aggregate each bit into next seed's high bit + pmulhuw m2, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + movd r9d, m2 + pshuflw m2, m2, q3232 + movzx r8, r9w + shr r9, 16 + + movd m3, [r6+r8*2] + pinsrw m3, [r6+r9*2], 1 + + movd r9d, m2 + movzx r8, r9w + shr r9, 16 + + pinsrw m3, [r6+r8*2], 2 + pinsrw m3, [r6+r9*2], 3 +%else + movd r2, m2 + pshuflw m2, m2, q3232 + movzx r1, r2w + shr r2, 16 + + movd m3, [r6+r1*2] + pinsrw m3, [r6+r2*2], 1 + + movd r2, m2 + movzx r1, r2w + shr r2, 16 + + pinsrw m3, [r6+r1*2], 2 + pinsrw m3, [r6+r2*2], 3 +%endif + pmulhrsw m3, m6 + packsswb m3, m3 + movd [bufq+r5], m3 + add r5, 4 +%if %2 + jl .loop_x + add bufq, 82 +%if ARCH_X86_64 + dec r7d +%else + dec r3mp +%endif + jg .loop_y +%else + jl .loop +%endif + +%if ARCH_X86_32 + mov r2, r2mp +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table] + jmp r5 + +.ar0: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -2*16 +%endif + imul 
uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] + movd m4, [base+hmul_bits+shiftq*2] + DEFINE_ARGS buf, bufy, h, x + pxor m0, m0 + pcmpgtb m0, m5 + punpcklbw m5, m0 + movd m7, [base+pb_1] +%if %2 + movd m6, [base+hmul_bits+2+%3*2] +%endif + pshuflw m5, m5, q0000 + pshuflw m4, m4, q0000 + pshufd m7, m7, q0000 +%if %2 + pshuflw m6, m6, q0000 +%endif + punpcklqdq m5, m5 + punpcklqdq m4, m4 +%if %2 + punpcklqdq m6, m6 +%endif + pcmpeqw m1, m1 + pslldq m1, 12>>%2 + SCRATCH 1, 8, 0 + SCRATCH 4, 9, 1 +%if %2 + sub bufq, 82*(73-35*%3)+82-(82*3+41) +%else + sub bufq, 82*70-3 +%endif + add bufyq, 3+82*3 + mov hd, 70-35*%3 +.y_loop_ar0: + xor xd, xd +.x_loop_ar0: + ; first 32 pixels +%if %2 + movu m1, [bufyq+xq*2] +%if %3 + movu m2, [bufyq+xq*2+82] +%endif + movu m3, [bufyq+xq*2+16] +%if %3 + movu m4, [bufyq+xq*2+82+16] +%endif + pmaddubsw m0, m7, m1 +%if %3 + pmaddubsw m1, m7, m2 +%endif + pmaddubsw m2, m7, m3 +%if %3 + pmaddubsw m3, m7, m4 + paddw m0, m1 + paddw m2, m3 +%endif + pmulhrsw m0, m6 + pmulhrsw m2, m6 +%else + movu m0, [bufyq+xq] + pxor m6, m6 + pcmpgtb m6, m0 + punpckhbw m2, m0, m6 + punpcklbw m0, m6 +%endif + pmullw m0, m5 + pmullw m2, m5 + pmulhrsw m0, m9 + pmulhrsw m2, m9 + movu m1, [bufq+xq] + pxor m4, m4 + pcmpgtb m4, m1 + punpckhbw m3, m1, m4 +%if %2 + punpcklbw m1, m4 + paddw m2, m3 + paddw m0, m1 +%else + punpcklbw m6, m1, m4 + paddw m2, m3 + paddw m0, m6 +%endif + packsswb m0, m2 +%if %2 + movu [bufq+xq], m0 + add xd, 16 + cmp xd, 32 + jl .x_loop_ar0 + + ; last 6/12 pixels + movu m1, [bufyq+xq*(1+%2)] +%if %3 + movu m2, [bufyq+xq*2+82] +%endif + pmaddubsw m0, m7, m1 +%if %3 + pmaddubsw m1, m7, m2 + paddw m0, m1 +%endif + pmulhrsw m0, m6 + pmullw m0, m5 + pmulhrsw m0, m9 + movq m1, [bufq+xq] + pxor m4, m4 + pcmpgtb m4, m1 + punpcklbw m2, m1, m4 + paddw m0, m2 + packsswb m0, m0 + pandn m2, m8, m0 + pand m1, m8 + por m2, m1 + movq [bufq+xq], m2 +%else + add xd, 16 + cmp xd, 80 + je .y_loop_final_ar0 + movu [bufq+xq-16], m0 + jmp .x_loop_ar0 +.y_loop_final_ar0: + pandn m2, m8, m0 + pand m1, m8 + por m2, m1 + movu [bufq+xq-16], m2 +%endif + + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar0 + RET + +.ar1: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x + imul uvd, 28 + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] + pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 +%if ARCH_X86_32 + mov r3mp, cf3d + DEFINE_ARGS buf, shift, fg_data, val3, min, max, x +%elif WIN64 + DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x + mov bufq, r0 +%else + DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m3, [base+round_vals+shiftq*2-12] ; rnd +%if %2 + movd m7, [base+pb_1] + movd m6, [base+hmul_bits+2+%3*2] +%endif + psrldq m4, 1 +%if ARCH_X86_32 + DEFINE_ARGS buf, shift, val0, val3, min, max, x +%elif WIN64 + DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0 +%else + DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0 +%endif + pxor m5, m5 + punpcklwd m3, m5 +%if %2 + punpcklwd m6, m6 +%endif + pcmpgtb m5, m4 + punpcklbw m4, m5 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + pshufd m3, m3, q0000 +%if %2 + pshufd m7, m7, q0000 + pshufd m6, m6, q0000 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif +%if ARCH_X86_32 + add r1mp, 
79+82*3 + mov r0mp, 70-35*%3 +%else + add bufyq, 79+82*3 + mov hd, 70-35*%3 +%endif + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: +%if %2 +%if ARCH_X86_32 + mov r2, r1mp + movq m0, [r2+xq*2] +%if %3 + movq m1, [r2+xq*2+82] +%endif +%else + movq m0, [bufyq+xq*2] +%if %3 + movq m1, [bufyq+xq*2+82] +%endif +%endif + pmaddubsw m2, m7, m0 +%if %3 + pmaddubsw m0, m7, m1 + paddw m2, m0 +%endif + pmulhrsw m2, m6 +%else +%if ARCH_X86_32 + mov r2, r1mp + movd m2, [r2+xq] +%else + movd m2, [bufyq+xq] +%endif + pxor m0, m0 + pcmpgtb m0, m2 + punpcklbw m2, m0 +%endif + + movq m0, [bufq+xq-82-1] ; top/left + pxor m1, m1 + pcmpgtb m1, m0 + punpcklbw m0, m1 + psrldq m1, m0, 4 ; top/right + punpcklwd m1, m2 + psrldq m2, m0, 2 ; top + punpcklwd m0, m2 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 + paddd m0, m3 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 +%if ARCH_X86_32 + imul val3d, r3mp +%else + imul val3d, cf3d +%endif + add val3d, val0d + sar val3d, shiftb + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 +%if ARCH_X86_32 + add r1mp, 82<<%3 + dec r0mp +%else + add bufyq, 82<<%3 + dec hd +%endif + jg .y_loop_ar1 + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp + ALLOC_STACK -8*16 +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + movd m7, [base+round_vals-12+shiftq*2] + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 + pxor m2, m2 + pcmpgtb m2, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + pinsrw m1, [base+pw_1], 5 + punpcklwd m7, m7 + pshufd m7, m7, q0000 + DEFINE_ARGS buf, bufy, fg_data, h, unused, x + pshufd m4, m1, q0000 + pshufd m5, m1, q1111 + pshufd m6, m1, q2222 + pshufd m3, m0, q3333 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + SCRATCH 6, 14, 6 + SCRATCH 7, 15, 7 +%if %2 + movd m7, [base+hmul_bits+2+%3*2] + movd m6, [base+pb_1] + punpcklwd m7, m7 + pshufd m6, m6, q0000 + pshufd m7, m7, q0000 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + pxor m2, m2 + movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pcmpgtb m2, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + psrldq m5, m0, 2 ; y=-2,x=[-1,+5] + psrldq m3, m1, 2 ; y=-1,x=[-1,+5] + psrldq m4, m1, 4 ; y=-1,x=[+0,+5] + punpcklwd m2, m0, m5 + punpcklwd m3, m4 + pmaddwd m2, m8 + pmaddwd m3, m11 + paddd m2, m3 + + psrldq m4, m0, 4 ; y=-2,x=[+0,+5] + psrldq m5, m0, 6 ; y=-2,x=[+1,+5] + psrldq m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m4, m5 + punpcklwd m0, m1 + psrldq m3, m1, 6 ; y=-1,x=[+1,+5] + psrldq m1, m1, 8 ; y=-1,x=[+2,+5] + punpcklwd m3, m1 + pmaddwd m4, m9 + pmaddwd m0, m10 + pmaddwd m3, m12 + paddd m4, m0 + paddd m2, m3 + paddd m2, m4 + +%if %2 + movq m1, [bufyq+xq*2] +%if %3 + movq m3, [bufyq+xq*2+82] +%endif + pmaddubsw m0, m6, m1 +%if %3 + pmaddubsw m1, m6, m3 + paddw m0, m1 +%endif + pmulhrsw m0, m7 +%else + movd m0, [bufyq+xq] + 
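How luma drives the chroma grain, as an illustrative scalar sketch using the round2/iclip helpers from the sketches above (the function name is illustrative): in every chroma AR path, the collocated luma grain is first reduced to one value per chroma sample; for 4:2:0 that is the rounded 2x2 average computed by the pmaddubsw pb_1 / pmulhrsw sequence. In the AR(0) case this scaled luma tap is the entire filter:

    /* Chroma AR(0) for 4:2:0 (44x38 template inside the 82-byte stride):
     * the single coefficient FGData.ar_coeffs_uv[uv][0] scales the averaged
     * collocated luma grain; packsswb in the asm gives the int8 saturation. */
    void apply_ar0_uv_420(int8_t buf_uv[38][82], const int8_t buf_y[73][82],
                          const int cf_luma, const int ar_coeff_shift) {
        for (int y = 3; y < 38; y++)
            for (int x = 3; x < 3 + 38; x++) {
                const int ly = 3 + 2 * (y - 3), lx = 3 + 2 * (x - 3);
                const int luma = round2(buf_y[ly    ][lx] + buf_y[ly    ][lx + 1] +
                                        buf_y[ly + 1][lx] + buf_y[ly + 1][lx + 1], 2);
                buf_uv[y][x] = (int8_t)iclip(buf_uv[y][x] +
                                             round2(cf_luma * luma, ar_coeff_shift),
                                             -128, 127);
            }
    }

For the AR(1)..AR(3) chroma paths, the same averaged luma value simply becomes one extra tap, weighted by the last ar_coeffs_uv entry, on top of the spatial recurrence shown for luma above.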
pxor m1, m1 + pcmpgtb m1, m0 + punpcklbw m0, m1 +%endif + punpcklwd m0, m15 + pmaddwd m0, m14 + paddd m2, m0 + + movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] + pxor m4, m4 + movd m5, [base+byte_blend+1] + punpcklbw m5, m5 +.x_loop_ar2_inner: + pcmpgtb m1, m4, m0 + punpcklbw m0, m1 + pmaddwd m3, m0, m13 + paddd m3, m2 + psrldq m2, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + pslldq m3, 4 + pand m3, m5 + paddw m0, m3 + packsswb m0, m0 + movd [bufq+xq-2], m0 + psrldq m0, 1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar2 + RET + +.ar3: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp +%if ARCH_X86_32 + ALLOC_STACK -15*16 +%else + SUB rsp, 16*7 +%assign stack_size_padded (stack_size_padded+16*7) +%assign stack_size (stack_size+16*7) +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 + pxor m3, m3 + pcmpgtb m3, m0 + punpckhbw m1, m0, m3 + punpcklbw m0, m3 + pshufd m2, m0, q1111 + pshufd m3, m0, q2222 + pshufd m4, m0, q3333 + pshufd m0, m0, q0000 + pshufd m5, m1, q1111 + pshufd m6, m1, q2222 + pshufd m7, m1, q3333 + pshufd m1, m1, q0000 + mova [rsp+ 0*16], m0 + mova [rsp+ 1*16], m2 + mova [rsp+ 2*16], m3 + mova [rsp+ 3*16], m4 + mova [rsp+ 4*16], m1 + mova [rsp+ 5*16], m5 + mova [rsp+ 6*16], m6 + SCRATCH 7, 8, 7 + + movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma] + pxor m4, m4 + pcmpgtb m4, m2 + punpckhbw m5, m2, m4 + punpcklbw m2, m4 + pshufd m4, m2, q3232 + punpcklwd m3, m4, m5 + pshuflw m5, m4, q3321 + pshufd m4, m3, q0000 + pshufd m3, m2, q1111 + pshufd m2, m2, q0000 + pinsrw m5, [base+round_vals+shiftq*2-10], 3 + SCRATCH 2, 9, 8 + SCRATCH 3, 10, 9 + SCRATCH 4, 11, 10 + SCRATCH 5, 12, 11 + + movd m2, [base+round_vals-12+shiftq*2] +%if %2 + movd m1, [base+pb_1] + movd m3, [base+hmul_bits+2+%3*2] +%endif + pxor m0, m0 + punpcklwd m2, m0 +%if %2 + punpcklwd m3, m3 +%endif + pshufd m2, m2, q0000 +%if %2 + pshufd m1, m1, q0000 + pshufd m3, m3, q0000 + SCRATCH 1, 13, 12 +%endif + SCRATCH 2, 14, 13 +%if %2 + SCRATCH 3, 15, 14 +%endif + + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + pxor m4, m4 + pcmpgtb m4, m0 + punpckhbw m3, m0, m4 + punpcklbw m0, m4 + + psrldq m5, m0, 2 + psrldq m6, m0, 4 + psrldq m7, m0, 6 + punpcklwd m4, m0, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 0*16] + pmaddwd m6, [rsp+ 1*16] + paddd m4, m6 + + palignr m2, m3, m0, 10 + palignr m3, m0, 12 + psrldq m0, 8 + + movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + pxor m6, m6 + pcmpgtb m6, m1 + punpckhbw m5, m1, m6 + punpcklbw m1, m6 + + punpcklwd m0, m2 + punpcklwd m3, m1 + pmaddwd m0, [rsp+ 2*16] + pmaddwd m3, [rsp+ 3*16] + paddd m0, m3 + paddd m0, m4 + + movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor m7, m7 + pcmpgtb m7, m2 + punpckhbw m6, m2, m7 + punpcklbw m2, m7 + + palignr m3, m5, m1, 10 + palignr m5, m1, 12 + psrldq m4, m2, 2 + + punpcklwd m3, m5 + punpcklwd m5, m2, m4 + pmaddwd m3, [rsp+ 6*16] + pmaddwd m5, m8 + paddd m3, m5 + paddd m0, m3 + + psrldq m3, m1, 2 + psrldq m4, m1, 4 + psrldq m5, m1, 6 + psrldq m1, 8 + + punpcklwd m3, m4 + 
punpcklwd m5, m1 + pmaddwd m3, [rsp+ 4*16] + pmaddwd m5, [rsp+ 5*16] + paddd m3, m5 + paddd m0, m3 + +%if %2 + movq m1, [bufyq+xq*2] +%if %3 + movq m3, [bufyq+xq*2+82] +%endif + pmaddubsw m7, m13, m1 +%if %3 + pmaddubsw m5, m13, m3 + paddw m7, m5 +%endif + pmulhrsw m7, m15 +%else + movd m7, [bufyq+xq] + pxor m1, m1 + pcmpgtb m1, m7 + punpcklbw m7, m1 +%endif + + psrldq m1, m2, 4 + psrldq m3, m2, 6 + palignr m4, m6, m2, 10 + palignr m6, m2, 12 + psrldq m2, 8 + + punpcklwd m1, m3 + punpcklwd m2, m4 + punpcklwd m6, m7 + pmaddwd m1, m9 + pmaddwd m2, m10 + pmaddwd m6, m11 + paddd m1, m2 + paddd m0, m6 + paddd m0, m1 + paddd m0, m14 + + movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] + pxor m4, m4 + movd m5, [base+byte_blend] +.x_loop_ar3_inner: + pcmpgtb m2, m4, m1 + punpcklbw m3, m1, m2 + pmaddwd m2, m3, m12 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw, we only care about one value + packsswb m2, m2 + pandn m3, m5, m1 + pslld m2, 24 + pand m2, m5 + por m1, m2, m3 + movd [bufq+xq-3], m1 + psrldq m1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg +%assign %%idx 0 +%define %%tmp %2 +%if %0 == 6 +%define %%tmp %6 +%endif +%rep 4 +%if %%idx == 0 + movd %5 %+ d, %2 + pshuflw %%tmp, %2, q3232 +%else + movd %5 %+ d, %%tmp +%if %%idx == 2 + punpckhqdq %%tmp, %%tmp +%elif %%idx == 4 + psrlq %%tmp, 32 +%endif +%endif + movzx %4 %+ d, %5 %+ w + shr %5 %+ d, 16 + +%if %%idx == 0 + movd %1, [%3+%4] +%else + pinsrw %1, [%3+%4], %%idx + 0 +%endif + pinsrw %1, [%3+%5], %%idx + 1 +%assign %%idx %%idx+2 +%endrep +%endmacro + +INIT_XMM ssse3 +; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby) +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \ + dst, src, scaling, unused1, fg_data, picptr, unused2 + ; copy stack arguments to new position post-alignment, so that we + ; don't have to keep the old stack location in a separate register + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov r5, r8m + + mov [rsp+5*mmsize+ 4*gprsize], r0 + mov [rsp+5*mmsize+ 6*gprsize], r1 + mov [rsp+5*mmsize+ 8*gprsize], r2 + mov [rsp+5*mmsize+10*gprsize], r3 + mov [rsp+5*mmsize+11*gprsize], r4 + mov [rsp+5*mmsize+12*gprsize], r5 +%else +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \ + dst, src, scaling, unused1, fg_data, picptr, unused2 +%endif + mov srcq, srcm + mov fg_dataq, r3m + mov scalingq, r5m +%if STACK_ALIGNMENT < mmsize +%define r0m [rsp+5*mmsize+ 4*gprsize] +%define r1m [rsp+5*mmsize+ 5*gprsize] +%define r2m [rsp+5*mmsize+ 6*gprsize] +%define r3m [rsp+5*mmsize+ 7*gprsize] +%define r4m [rsp+5*mmsize+ 8*gprsize] +%define r5m [rsp+5*mmsize+ 9*gprsize] +%define r6m [rsp+5*mmsize+10*gprsize] +%define r7m [rsp+5*mmsize+11*gprsize] +%define r8m [rsp+5*mmsize+12*gprsize] +%endif + LEA r5, pb_mask +%define base r5-pb_mask + mov r5m, picptrq +%else +cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut + lea r7, [pb_mask] +%define base r7-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + movd m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + 
movd m4, [base+max+r6*4] + movd m5, [base+min+r6*2] + punpcklwd m3, m3 + punpcklwd m4, m4 + punpcklwd m5, m5 + pshufd m3, m3, q0000 + pshufd m4, m4, q0000 + pshufd m5, m5, q0000 + SCRATCH 3, 11, 0 + SCRATCH 4, 12, 1 + SCRATCH 5, 13, 2 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap +%endif + + mov sbyd, r8m + mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 + test overlapd, overlapd + jz .no_vertical_overlap + mova m6, [base+pw_1024] + mova m7, [base+pb_27_17_17_27] + SCRATCH 6, 14, 3 + SCRATCH 7, 15, 4 + test sbyd, sbyd + jnz .vertical_overlap + ; fall-through + +.no_vertical_overlap: + mov r8m, overlapd +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + unused1, unused2, see, unused3 +%endif + + lea src_bakq, [srcq+wq] + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r4m, wq + DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 +%endif + +.loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, unused + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, + ; r6m=grain_lut, r7m=h, r8m=overlap_v|h + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, unused +%endif + +.loop_x_odd: + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 +%endif + REPX {psrlw x, 8}, m4, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m4 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + add srcq, r2mp + add grain_lutq, 82 + dec hd + jg .loop_y + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r1mp + add srcq, r4mp +%else + lea srcq, [src_bakq+wq] +%endif + btc dword r8m, 2 + jc .next_blk + + add offxyd, 16 + test dword r8m, 2 ; r8m & 2 = have_top_overlap + jz .loop_x_odd + +%if ARCH_X86_32 + add dword 
[rsp+5*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxyd +%endif + jnz .loop_x_odd_v_overlap + +.next_blk: + test dword r8m, 1 + jz .loop_x + + test dword r8m, 2 + jnz .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: +%if ARCH_X86_32 + ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, + ; r6m=grain_lut, r7m=h, r8m=overlap_v|h + DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 + + add offxyd, 16 ; left_offxyd + mov [rsp+5*mmsize+0*gprsize], offxyd + + DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 + + mov seed, r3m +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 +%endif + REPX {psrlw x, 8}, m4, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+0*gprsize] + movd m7, [grain_lutq+r5] +%else + movd m7, [grain_lutq+left_offxyq] +%endif + punpcklbw m7, m3 + pmaddubsw m6, m15, m7 + pmulhrsw m6, m14 + packsswb m6, m6 + shufps m6, m3, q3210 + pcmpgtb m2, m6 + punpcklbw m7, m6, m2 + punpckhbw m6, m2 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m7, m4 + pmullw m6, m5 + pmulhrsw m7, m11 + pmulhrsw m6, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m7 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + add srcq, r2mp + add grain_lutq, 82 + dec hd + jg .loop_y_h_overlap + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r1m + add srcq, r4m +%else + lea srcq, [src_bakq+wq] +%endif + xor dword r8m, 4 + add offxyd, 16 + + ; since this half-block had left-overlap, the next does not + test dword r8m, 2 ; have_top_overlap + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxyd +%endif + jmp .loop_x_odd_v_overlap + +.end: + RET + +.vertical_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap +%endif + + or overlapd, 2 ; top_overlap: overlap & 2 + mov r8m, overlapd + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul tmpd, sbyd, 173 * 0x00010001 
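The per-pixel arithmetic in .loop_y above, and in every overlap variant around it, matches its three inline comments. As an illustrative scalar sketch, with round2/iclip as before: the emulated gather loads the byte scaling[src] as the high byte of a word (the load is from scalingq-1), which is what the psrlw by 8 recovers, and the pmulhrsw with the mul_bits constant is the round2 by scaling_shift.

    /* One luma pixel of fgy_32x32xn (8 bpc); function name illustrative.
     * mn/mx are 0/255, or 16/235 when clip_to_restricted_range is set. */
    static inline uint8_t fgy_pixel(const uint8_t src, const uint8_t scaling[256],
                                    const int8_t grain, const int scaling_shift,
                                    const int mn, const int mx) {
        const int noise = round2(scaling[src] * grain, scaling_shift);
        return (uint8_t)iclip(src + noise, mn, mx);
    }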
+ imul sbyd, 37 * 0x01000100 + add tmpd, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and tmpd, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, tmpd +%if ARCH_X86_32 + xor sbyd, seed ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + tmp, unused2, see, unused3 +%endif + + lea src_bakq, [srcq+wq] + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r4m, wq + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%endif + +.loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m +%endif + ; we assume from the block above that bits 8-15 of tmpd are zero'ed, + ; because of the 'and tmpd, 0x00ff00ff' above + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, unused, top_offxy + + mov offyd, seed + mov offxd, seed +%endif + + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, unused, top_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+5*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +.loop_x_odd_v_overlap: +%if ARCH_X86_32 + mov r5, r5m + lea r5, [base+pb_27_17] + mov [rsp+5*mmsize+12], r5 +%else + mova m8, [pb_27_17] +%endif + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 +%endif + REPX {psrlw x, 8}, m4, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+1*gprsize] + movu m7, [grain_lutq+r5] +%else + movu m7, [grain_lutq+top_offxyq] +%endif + punpckhbw m6, m7, m3 + punpcklbw m7, m3 +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+12] + pmaddubsw m3, [r5], m6 + pmaddubsw m6, [r5], m7 +%else + pmaddubsw m3, m8, m6 + pmaddubsw m6, m8, m7 +%endif + pmulhrsw m3, m14 + pmulhrsw m6, m14 + packsswb m6, m3 + pcmpgtb m7, m2, m6 + punpcklbw m2, m6, m7 + punpckhbw m6, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m4 + pmullw m6, m5 + pmulhrsw m2, m11 + pmulhrsw m6, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add dword [rsp+5*mmsize+12], mmsize +%else + mova m8, [pb_17_27] +%endif + add srcq, r2mp + add grain_lutq, 82 + dec hw + jz .end_y_v_overlap + ; 2 lines 
get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r1mp + add srcq, r4mp +%else + lea srcq, [src_bakq+wq] +%endif + btc dword r8m, 2 + jc .loop_x_hv_overlap + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + jmp .loop_x_odd_v_overlap + +.loop_x_hv_overlap: +%if ARCH_X86_32 + mov r5, r5m + lea r5, [base+pb_27_17] + mov [rsp+5*mmsize+12], r5 + + DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak + + mov r5, [rsp+5*mmsize+1*gprsize] + mov r4, offxyd + add r5, 16 + add r4, 16 + mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy + mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy + + DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak + + xor tmpd, tmpd + mov seed, r3m +%else + mova m8, [pb_27_17] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + tmp, unused2, see, unused3 + + ; we assume from the block above that bits 8-15 of tmpd are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut + + movzx r5, offxyw ; top_offxy + mov [rsp+5*mmsize+1*gprsize], r5 +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy + + movzx top_offxyd, offxyw +%endif + shr offxyd, 16 + + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy + mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy + movu m6, [grain_lutq+r5] + mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy + movd m4, [grain_lutq+r0] + movd m7, [grain_lutq+r5] +%else + movu m6, [grain_lutq+top_offxyq] + movd m4, [grain_lutq+left_offxyq] + movd m7, [grain_lutq+topleft_offxyq] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m4, m3 + punpcklbw m7, m6 + pmaddubsw m2, m15, m4 + pmaddubsw m4, m15, m7 + pmulhrsw m2, m14 + pmulhrsw m4, m14 + packsswb m2, m2 + packsswb m4, m4 + shufps m2, m3, q3210 + shufps m4, m6, q3210 + ; followed by v interpolation (top | cur -> cur) + punpcklbw m3, m4, m2 + punpckhbw m4, m2 +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+12] + pmaddubsw m7, [r5], m4 + pmaddubsw m4, [r5], m3 +%else + pmaddubsw m7, m8, m4 + pmaddubsw m4, m8, m3 +%endif + pmulhrsw m7, m14 + pmulhrsw m4, m14 + packsswb m4, m7 + pxor m2, m2 + pcmpgtb m7, m2, m4 + punpcklbw m3, m4, m7 + punpckhbw m4, m7 + + ; src + mova m0, [srcq] + punpckhbw m1, m0, m2 + punpcklbw 
m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m5, m0, scalingq-1, r0, r5, m7 + vpgatherdw m6, m1, scalingq-1, r0, r5, m7 +%else + vpgatherdw m5, m0, scalingq-1, r13, r14, m7 + vpgatherdw m6, m1, scalingq-1, r13, r14, m7 +%endif + REPX {psrlw x, 8}, m5, m6 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m3, m5 + pmullw m4, m6 + pmulhrsw m3, m11 + pmulhrsw m4, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add dword [rsp+5*mmsize+12], mmsize +%else + mova m8, [pb_17_27] +%endif + add srcq, r2mp + add grain_lutq, 82 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r1m + add srcq, r4m +%else + lea srcq, [src_bakq+wq] +%endif + xor dword r8m, 4 + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + jmp .loop_x_odd_v_overlap + +.end_hv: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +INIT_XMM ssse3 +%if ARCH_X86_32 +; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, +; sby, luma, lstride, uv_pl, is_id) +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \ + tmp, src, scaling, h, fg_data, picptr, unused + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov [rsp+7*mmsize+3*gprsize], r0 + mov [rsp+7*mmsize+5*gprsize], r1 + mov [rsp+7*mmsize+7*gprsize], r2 + mov [rsp+7*mmsize+9*gprsize], r3 + mov [rsp+7*mmsize+10*gprsize], r4 + + mov r0, r8m + mov r1, r9m + mov r2, r10m + mov r4, r11m + mov r3, r12m + mov [rsp+7*mmsize+11*gprsize], r0 + mov [rsp+7*mmsize+12*gprsize], r1 + mov [rsp+7*mmsize+13*gprsize], r2 + mov [rsp+7*mmsize+14*gprsize], r4 +%else +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \ + tmp, src, scaling, h, fg_data, picptr, unused +%endif + mov srcq, srcm + mov fg_dataq, r3m + mov scalingq, r5m +%if STACK_ALIGNMENT < mmsize +%define r0m [rsp+7*mmsize+ 3*gprsize] +%define r1m [rsp+7*mmsize+ 4*gprsize] +%define r2m [rsp+7*mmsize+ 5*gprsize] +%define r3m [rsp+7*mmsize+ 6*gprsize] +%define r4m [rsp+7*mmsize+ 7*gprsize] +%define r5m [rsp+7*mmsize+ 8*gprsize] +%define r6m [rsp+7*mmsize+ 9*gprsize] +%define r7m [rsp+7*mmsize+10*gprsize] +%define r8m [rsp+7*mmsize+11*gprsize] +%define r9m [rsp+7*mmsize+12*gprsize] +%define r10m [rsp+7*mmsize+13*gprsize] +%define r11m [rsp+7*mmsize+14*gprsize] +%define r12m [rsp+7*mmsize+15*gprsize] +%endif + LEA r5, pb_mask +%define base r5-pb_mask + mov r5m, r5 +%else +cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, tmp, sby, luma, lstride, uv_pl, is_id + lea r8, [pb_mask] +%define base r8-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + movd m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + lea tmpd, [r6d*2] +%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize + test r3, r3 +%else + cmp dword r12m, 0 ; is_idm +%endif + movd m5, [base+min+r6*2] + cmovne r6d, tmpd + movd m4, [base+max+r6*2] + punpcklwd m3, m3 + punpcklwd m5, m5 + punpcklwd m4, m4 + pshufd m3, m3, 
q0000 + pshufd m5, m5, q0000 + pshufd m4, m4, q0000 + SCRATCH 3, 11, 0 + SCRATCH 4, 12, 1 + SCRATCH 5, 13, 2 + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap +%endif + +%if %1 + mov r6d, dword r11m + movd m0, [fg_dataq+FGData.uv_mult+r6*4] + movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] + punpcklbw m6, m1, m0 + movd m7, [fg_dataq+FGData.uv_offset+r6*4] + punpcklwd m6, m6 + punpcklwd m7, m7 + pshufd m6, m6, q0000 + pshufd m7, m7, q0000 + SCRATCH 6, 14, 3 + SCRATCH 7, 15, 4 +%endif + + mov sbyd, r8m + mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 + test overlapd, overlapd + jz %%no_vertical_overlap +%if ARCH_X86_32 +%if %2 + mova m1, [base+pb_23_22_h] +%else + mova m1, [base+pb_27_17_17_27] +%endif + mova m0, [base+pw_1024] +%else +%if %2 + mova m1, [pb_23_22_h] +%else + mova m1, [pb_27_17_17_27] +%endif + mova m0, [pw_1024] +%endif + SCRATCH 0, 8, 5 + SCRATCH 1, 9, 6 + test sbyd, sbyd + jnz %%vertical_overlap + ; fall-through + +%%no_vertical_overlap: + mov r8m, overlapd +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak +%define luma_bakq lumaq + + mov wq, r4m +%if %3 + shl r10mp, 1 +%endif +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak + + mov lstrideq, r10mp +%endif + + mov lumaq, r9mp + lea src_bakq, [srcq+wq] + lea luma_bakq, [lumaq+wq*(1+%2)] + neg wq + sub r0mp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r11m, luma_bakq + mov r4m, wq + + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%else + mov r11mp, src_bakq + mov r12mp, strideq +%endif + +%%loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, unused1, unused2, lstride + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, unused1, unused2, lstride, luma_bak +%endif + +%%loop_x_odd: + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y: + ; src +%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if 
%1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq+ 0] + pcmpgtb m6, m2, m3 + punpcklbw m2, m3, m6 + punpckhbw m3, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; we already incremented lumaq above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 + dec hw + jg %%loop_y + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 == 0 + ; adjust top_offxy +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + add offxyd, 16 + btc dword r8m, 2 + jc %%loop_x_even + test dword r8m, 2 + jz %%loop_x_odd + jmp %%loop_x_odd_v_overlap +%%loop_x_even: +%endif + test dword r8m, 1 + jz %%loop_x + + ; r8m = sbym + test dword r8m, 2 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: +%if ARCH_X86_32 +%if %2 + lea r6, [offxyd+16] + mov [rsp+7*mmsize+0*gprsize], r6 +%else + mov [rsp+7*mmsize+0*gprsize], offxyd +%endif + + DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + +%if %2 + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx +%else + mov left_offxyd, offyd +%endif +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + ; src 
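In the %1 blocks above, i.e. the variant used when chroma_scaling_from_luma is disabled, the index into the scaling LUT is not the chroma sample itself but a blend of the averaged luma and the chroma value. An illustrative sketch of what the pmaddubsw/psraw/paddw/packuswb sequence computes per pixel (iclip as in the earlier sketches; uv_luma_mult, uv_mult and uv_offset come from FGData):

    /* Scaling-LUT index for one chroma pixel, 8 bpc, csfl disabled;
     * function name illustrative. */
    static inline int fguv_scaling_index(const int luma_avg, const int chroma,
                                         const int uv_luma_mult, const int uv_mult,
                                         const int uv_offset) {
        const int combined = luma_avg * uv_luma_mult + chroma * uv_mult;
        return iclip((combined >> 6) + uv_offset, 0, 255); /* packuswb clamp */
    }

The noise step is then identical to luma: noise = round2(scaling[index] * grain, scaling_shift), added to the chroma sample and clipped. When csfl is enabled, the index is just the averaged luma value.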
+%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq+ 0] +%if ARCH_X86_32 + mov r0, [rsp+7*mmsize+0*gprsize] + movd m2, [grain_lutq+r0+ 0] +%else + movd m2, [grain_lutq+left_offxyq+ 0] +%endif + punpcklbw m2, m4 + pmaddubsw m3, m9, m2 + pmulhrsw m3, m8 + packsswb m3, m3 + shufps m3, m4, q3210 + pxor m4, m4 + pcmpgtb m4, m3 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has already been incremented above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 + dec hw + jg %%loop_y_h_overlap + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 == 0 + xor dword r8m, 4 + ; adjust top_offxyd +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + add offxyd, 16 +%endif + + ; r8m = sbym + test dword r8m, 2 +%if %2 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap +%else + jne %%loop_x_odd_v_overlap + jmp %%loop_x_odd +%endif + +%%end: + RET + +%%vertical_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap +%endif + + or overlapd, 2 ; top_overlap: overlap & 2 + mov r8m, overlapd + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul tmpd, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add tmpd, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and tmpd, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, tmpd +%if 
+%if ARCH_X86_32
+    xor        sbyd, seed             ; (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
+
+    mov        r3m, seed
+    mov        wq, r4m
+%if %3
+    shl        r10mp, 1
+%endif
+%else
+    xor        seed, sbyd             ; (cur_seed << 16) | top_seed
+
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak
+
+    mov        lstrideq, r10mp
+%endif
+
+    mov        lumaq, r9mp
+    lea        src_bakq, [srcq+wq]
+    lea        luma_bakq, [lumaq+wq*(1+%2)]
+    neg        wq
+    sub        r0mp, srcq
+%if ARCH_X86_32
+    mov        r1m, src_bakq
+    mov        r11m, luma_bakq
+    mov        r4m, wq
+
+    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%else
+    mov        r11mp, src_bakq
+    mov        r12mp, strideq
+%endif
+
+%%loop_x_v_overlap:
+%if ARCH_X86_32
+    mov        seed, r3m
+    xor        tmpd, tmpd
+%endif
+    ; we assume from the block above that bits 8-15 of tmpd are zeroed
+    mov        r6d, seed
+    or         seed, 0xeff4eff4
+    test       seeb, seeh
+    setp       tmpb                   ; parity of top_seed
+    shr        seed, 16
+    shl        tmpd, 16
+    test       seeb, seeh
+    setp       tmpb                   ; parity of cur_seed
+    or         r6d, 0x00010001
+    xor        tmpd, r6d
+    mov        seed, tmpd
+    ror        seed, 1                ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+    mov        r3m, seed
+
+    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
+
+    mov        offxd, offyd
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                offx, offy, see, overlap, top_offxy, unused, lstride
+
+    mov        offxd, seed
+    mov        offyd, seed
+%endif
+    ror        offyd, 8
+    ror        offxd, 12
+    and        offyd, 0xf000f
+    and        offxd, 0xf000f
+    imul       offyd, 164>>%3
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea        offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+    DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
+%endif
+
+    movzx      top_offxyd, offxyw
+    shr        offxyd, 16
+%if ARCH_X86_32
+    mov        [rsp+7*mmsize+1*gprsize], top_offxyd
+
+    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+%%loop_x_odd_v_overlap:
+    mov        hd, r7m
+    mov        grain_lutq, grain_lutmp
+%if ARCH_X86_32
+    mov        r5, r5m
+%endif
+%if %3
+    mova       m1, [PIC_ptr(pb_23_22)]
+%else
+    mova       m1, [PIC_ptr(pb_27_17)]
+%endif
+%%loop_y_v_overlap:
+%if ARCH_X86_32
+    mov        lumaq, r9mp
+%endif
+%if %2
+    mova       m4, [lumaq+ 0]
+    mova       m6, [lumaq+16]
+    mova       m0, [srcq]
+%if ARCH_X86_32
+    add        lumaq, r10mp
+    mov        r9mp, lumaq
+    mov        r5, r5m
+    movd       m7, [base+pb_1]
+%else
+    movd       m7, [pb_1]
+%endif
+    pshufd     m7, m7, q0000
+    pxor       m2, m2
+    pmaddubsw  m4, m7
+    pmaddubsw  m6, m7
+    pavgw      m4, m2
+    pavgw      m6, m2
+%else
+    mova       m4, [lumaq]
+    mova       m0, [srcq]
+%if ARCH_X86_32
+    add        lumaq, r10mp
+    mov        r9mp, lumaq
+%endif
+    pxor       m2, m2
+%endif
+
+%if %1
+%if %2
+    packuswb   m4, m6                 ; luma
+%endif
+    punpckhbw  m6, m4, m0
+    punpcklbw  m4, m0                 ; { luma, chroma }
+    pmaddubsw  m6, m14
+    pmaddubsw  m4, m14
+    psraw      m6, 6
+    psraw      m4, 6
+    paddw      m6, m15
+    paddw      m4, m15
+    packuswb   m4, m6                 ; pack+unpack = clip
+    punpckhbw  m6, m4, m2
+    punpcklbw  m4, m2
+%elif %2 == 0
+    punpckhbw  m6, m4, m2
+    punpcklbw  m4, m2
+%endif
+
+    ; scaling[luma_src]
+%if ARCH_X86_32
+    vpgatherdw m7, m4, scalingq-1, r0, r5
+    vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+    vpgatherdw m7, m4, scalingq-1, r12, r2
+    vpgatherdw m5, m6, scalingq-1, r12, r2
+%endif
+    REPX {psrlw x, 8}, m7, m5
+
+    ; grain = grain_lut[offy+y][offx+x]
+    movu       m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+    mov        r0, [rsp+7*mmsize+1*gprsize]
+    movu       m4, [grain_lutq+r0]
+%else
+    movu       m4, [grain_lutq+top_offxyq]
+%endif
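+    ; Vertical overlap: m3 holds this block's grain row and m4 the matching
+    ; row from the block above. Interleaving them lets one pmaddubsw per
+    ; half compute w_top*top + w_cur*cur using the byte-pair weights in m1
+    ; (pb_27_17 or pb_23_22 from the tables at the top of the file), and
+    ; pmulhrsw against m8 (assumed pw_1024) is the spec's round2(x, 5).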
+    punpckhbw  m6, m4, m3
+    punpcklbw  m4, m3
+    pmaddubsw  m2, m1, m6
+    pmaddubsw  m3, m1, m4
+    pmulhrsw   m2, m8
+    pmulhrsw   m3, m8
+    packsswb   m3, m2
+    pxor       m6, m6
+    pcmpgtb    m6, m3
+    punpcklbw  m2, m3, m6
+    punpckhbw  m3, m6
+
+    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+    pmullw     m2, m7
+    pmullw     m3, m5
+    pmulhrsw   m2, m11
+    pmulhrsw   m3, m11
+
+    ; unpack chroma_source
+    pxor       m4, m4
+    punpckhbw  m6, m0, m4
+    punpcklbw  m0, m4                 ; m0-1: src as word
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+    ; dst = clip_pixel(src, noise)
+    paddw      m0, m2
+    paddw      m6, m3
+    pmaxsw     m0, m13
+    pmaxsw     m6, m13
+    pminsw     m0, m12
+    pminsw     m6, m12
+    packuswb   m0, m6
+    movifnidn  dstq, dstmp
+    mova       [dstq+srcq], m0
+
+    dec        hw
+    je         %%end_y_v_overlap
+%if ARCH_X86_32
+    add        srcq, r2mp
+    ; lumaq has already been incremented above
+%else
+    add        srcq, r12mp
+%if %3
+    lea        lumaq, [lumaq+lstrideq*2]
+%else
+    add        lumaq, lstrideq
+%endif
+%endif
+    add        grain_lutq, 82
+%if %3 == 0
+    btc        hd, 16
+%if ARCH_X86_32
+    mov        r5, r5m
+%endif
+    mova       m1, [PIC_ptr(pb_17_27)]
+    jnc        %%loop_y_v_overlap
+%endif
+    jmp        %%loop_y
+
+%%end_y_v_overlap:
+%if ARCH_X86_32
+    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+    mov        wq, r4m
+%endif
+    add        wq, 16
+    jge        %%end_hv
+%if ARCH_X86_32
+    mov        srcq, r1mp
+    mov        lumaq, r11mp
+%else
+    mov        srcq, r11mp
+%endif
+    lea        lumaq, [luma_bakq+wq*(1+%2)]
+    add        srcq, wq
+%if ARCH_X86_32
+    mov        r4m, wq
+    mov        r9m, lumaq
+%endif
+
+%if %2
+    ; since fg_dataq.overlap is guaranteed to be set, we never jump
+    ; back to .loop_x_v_overlap, and instead always fall-through to
+    ; h+v overlap
+%else
+%if ARCH_X86_32
+    add        dword [rsp+7*mmsize+1*gprsize], 16
+%else
+    add        top_offxyd, 16
+%endif
+    add        offxyd, 16
+    btc        dword r8m, 2
+    jnc        %%loop_x_odd_v_overlap
+%endif
+
+%%loop_x_hv_overlap:
+%if ARCH_X86_32
+    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
+
+    mov        r6, [rsp+7*mmsize+1*gprsize]
+%if %2
+    lea        r0, [r3d+16]
+    add        r6, 16
+    mov        [rsp+7*mmsize+0*gprsize], r0  ; left_offxy
+%else
+    mov        [rsp+7*mmsize+0*gprsize], r3  ; left_offxy
+%endif
+    mov        [rsp+7*mmsize+2*gprsize], r6  ; topleft_offxy
+
+    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
+
+    mov        seed, r3m
+    xor        tmpd, tmpd
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+%if %2
+    lea        topleft_offxyq, [top_offxyq+16]
+    lea        left_offxyq, [offxyq+16]
+%else
+    mov        topleft_offxyq, top_offxyq
+    mov        left_offxyq, offxyq
+%endif
+
+    ; we assume from the block above that bits 8-15 of tmpd are zeroed
+%endif
+    mov        r6d, seed
+    or         seed, 0xeff4eff4
+    test       seeb, seeh
+    setp       tmpb                   ; parity of top_seed
+    shr        seed, 16
+    shl        tmpd, 16
+    test       seeb, seeh
+    setp       tmpb                   ; parity of cur_seed
+    or         r6d, 0x00010001
+    xor        tmpd, r6d
+    mov        seed, tmpd
+    ror        seed, 1                ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+    mov        r3m, seed
+
+    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx
+
+    mov        offxd, offyd
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+    mov        offxd, seed
+    mov        offyd, seed
+%endif
+    ror        offyd, 8
+    ror        offxd, 12
+    and        offyd, 0xf000f
+    and        offxd, 0xf000f
+    imul       offyd, 164>>%3
+    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+    lea        offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
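+    ; As in %%loop_x_v_overlap above, offxyd packs two grain_lut offsets:
+    ; the high word is this block's offy*82+offx (stride 82, including the
+    ; 3/6-pixel borders) and the low word, biased by the extra (32>>%3)*82
+    ; rows, is top_offxy, pointing at the bottom rows of the block above
+    ; for the vertical blend.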
+%if ARCH_X86_32
+    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
+%endif
+
+    movzx      top_offxyd, offxyw
+    shr        offxyd, 16
+%if ARCH_X86_32
+    mov        [rsp+7*mmsize+1*gprsize], top_offxyd
+%endif
+
+    mov        hd, r7m
+    mov        grain_lutq, grain_lutmp
+%if ARCH_X86_32
+    mov        r5, r5m
+%endif
+%if %3
+    mova       m3, [PIC_ptr(pb_23_22)]
+%else
+    mova       m3, [PIC_ptr(pb_27_17)]
+%endif
+%%loop_y_hv_overlap:
+    ; grain = grain_lut[offy+y][offx+x]
+%if ARCH_X86_32
+    mov        r0, [rsp+7*mmsize+2*gprsize]  ; topleft_offxy
+    mov        r5, [rsp+7*mmsize+1*gprsize]  ; top_offxy
+    movd       m1, [grain_lutq+r0]
+    mov        r0, [rsp+7*mmsize+0*gprsize]  ; left_offxy
+%else
+    movd       m1, [grain_lutq+topleft_offxyq]
+%endif
+    movu       m2, [grain_lutq+offxyq]
+%if ARCH_X86_32
+    movu       m6, [grain_lutq+r5]
+    movd       m4, [grain_lutq+r0]
+%else
+    movu       m6, [grain_lutq+top_offxyq]
+    movd       m4, [grain_lutq+left_offxyq]
+%endif
+    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+    punpcklbw  m1, m6
+    punpcklbw  m4, m2
+    pmaddubsw  m0, m9, m1
+    pmaddubsw  m1, m9, m4
+    REPX {pmulhrsw x, m8}, m0, m1
+    packsswb   m0, m1
+    shufps     m4, m0, m2, q3232
+    shufps     m0, m6, q3210
+    ; followed by v interpolation (top | cur -> cur)
+    punpcklbw  m2, m0, m4
+    punpckhbw  m0, m4
+    pmaddubsw  m4, m3, m0
+    pmaddubsw  m1, m3, m2
+    pmulhrsw   m4, m8
+    pmulhrsw   m1, m8
+    packsswb   m1, m4
+
+    ; src
+%if ARCH_X86_32
+    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+    mov        lumaq, r9mp
+%endif
+%if %2
+    mova       m4, [lumaq+ 0]
+    mova       m6, [lumaq+16]
+    mova       m0, [srcq]
+%if ARCH_X86_32
+    add        lumaq, r10mp
+    mov        r9mp, lumaq
+    mov        r5, r5m
+    movd       m7, [base+pb_1]
+%else
+    movd       m7, [pb_1]
+%endif
+    pshufd     m7, m7, q0000
+    pxor       m2, m2
+    pmaddubsw  m4, m7
+    pmaddubsw  m6, m7
+    pavgw      m4, m2
+    pavgw      m6, m2
+%else
+    mova       m4, [lumaq]
+    mova       m0, [srcq]
+%if ARCH_X86_32
+    add        lumaq, r10mp
+    mov        r9mp, lumaq
+%endif
+    pxor       m2, m2
+%endif
+
+%if %1
+%if %2
+    packuswb   m4, m6                 ; luma
+%endif
+    punpckhbw  m6, m4, m0
+    punpcklbw  m4, m0                 ; { luma, chroma }
+    pmaddubsw  m6, m14
+    pmaddubsw  m4, m14
+    psraw      m6, 6
+    psraw      m4, 6
+    paddw      m6, m15
+    paddw      m4, m15
+    packuswb   m4, m6                 ; pack+unpack = clip
+    punpckhbw  m6, m4, m2
+    punpcklbw  m4, m2
+%elif %2 == 0
+    punpckhbw  m6, m4, m2
+    punpcklbw  m4, m2
+%endif
+
+    ; scaling[src]
+%if ARCH_X86_32
+    vpgatherdw m7, m4, scalingq-1, r0, r5
+    vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+%if %3
+    vpgatherdw m7, m4, scalingq-1, r2, r12
+    vpgatherdw m5, m6, scalingq-1, r2, r12
+%else
+    vpgatherdw m7, m4, scalingq-1, r2, r13
+    vpgatherdw m5, m6, scalingq-1, r2, r13
+%endif
+%endif
+    REPX {psrlw x, 8}, m7, m5
+
+    ; unpack grain
+    pxor       m4, m4
+    pcmpgtb    m4, m1
+    punpcklbw  m2, m1, m4
+    punpckhbw  m1, m4
+
+    ; noise = round2(scaling[src] * grain, scaling_shift)
+    pmullw     m2, m7
+    pmullw     m1, m5
+    pmulhrsw   m2, m11
+    pmulhrsw   m1, m11
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+    ; unpack chroma source
+    pxor       m4, m4
+    punpckhbw  m5, m0, m4
+    punpcklbw  m0, m4                 ; m0-1: src as word
+
+    ; dst = clip_pixel(src, noise)
+    paddw      m0, m2
+    paddw      m5, m1
+    pmaxsw     m0, m13
+    pmaxsw     m5, m13
+    pminsw     m0, m12
+    pminsw     m5, m12
+    packuswb   m0, m5
+    movifnidn  dstq, dstmp
+    mova       [dstq+srcq], m0
+
+%if ARCH_X86_32
+    add        srcq, r2mp
+    ; lumaq has been adjusted above already
+%else
+    add        srcq, r12mp
+%if %3
+    lea        lumaq, [lumaq+lstrideq*(1+%2)]
+%else
+    add        lumaq, r10mp
+%endif
+%endif
+    add        grain_lutq, 82
+    dec        hw
+%if %3
+    jg         %%loop_y_h_overlap
+%else
+    jle        %%end_y_hv_overlap
+%if ARCH_X86_32
+    mov        r5, r5m
+%endif
+    mova       m3, [PIC_ptr(pb_17_27)]
+    btc        hd, 16
+    jnc        %%loop_y_hv_overlap
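+    ; With %3 == 0 there are two vertically-overlapped rows: m3 switches
+    ; from the 27/17 to the 17/27 weights and bit 16 of hd (btc/jnc) tracks
+    ; which row of the pair this is; once both are done we fall through and
+    ; the remaining rows of the block take the plain h-overlap path.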
+%if ARCH_X86_64
+    mov        lstrideq, r10mp
+%endif
+    jmp        %%loop_y_h_overlap
+%%end_y_hv_overlap:
+%if ARCH_X86_64
+    mov        lstrideq, r10mp
+%endif
+%endif
+
+%if ARCH_X86_32
+    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+    mov        wq, r4m
+%endif
+    add        wq, 16
+    jge        %%end_hv
+%if ARCH_X86_32
+    mov        srcq, r1mp
+    mov        lumaq, r11mp
+%else
+    mov        srcq, r11mp
+%endif
+    lea        lumaq, [luma_bakq+wq*(1+%2)]
+    add        srcq, wq
+%if ARCH_X86_32
+    mov        r4m, wq
+    mov        r9m, lumaq
+%endif
+%if %2
+    jmp        %%loop_x_hv_overlap
+%else
+%if ARCH_X86_32
+    add        dword [rsp+7*mmsize+1*gprsize], 16
+%else
+    add        top_offxyd, 16
+%endif
+    add        offxyd, 16
+    xor        dword r8m, 4
+    jmp        %%loop_x_odd_v_overlap
+%endif
+
+%%end_hv:
+    RET
+%endmacro
+
+    %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+    %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
+
+FGUV_FN 420, 1, 1
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+
+FGUV_FN 422, 1, 0
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+
+FGUV_FN 444, 0, 0