Diffstat (limited to 'third_party/dav1d/src/x86/film_grain.asm')
-rw-r--r-- | third_party/dav1d/src/x86/film_grain.asm | 2405
1 file changed, 2405 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/film_grain.asm b/third_party/dav1d/src/x86/film_grain.asm new file mode 100644 index 0000000000..72a1e3c009 --- /dev/null +++ b/third_party/dav1d/src/x86/film_grain.asm @@ -0,0 +1,2405 @@ +; Copyright © 2019, VideoLAN and dav1d authors +; Copyright © 2019, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +pb_8x_27_17_8x_17_27: times 8 db 27, 17 + times 8 db 17, 27 +pw_1024: times 16 dw 1024 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pd_m65536: dd ~0xffff +pb_23_22: times 2 db 23, 22 +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512 +max: dw 255, 240, 235 +min: dw 0, 16 +pb_27_17_17_27: db 27, 17, 17, 27 +pw_1: dw 1 + +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +ALIGN 4 +JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3 + +struc FGData + .seed: resd 1 + .num_y_points: resd 1 + .y_points: resb 14 * 2 + .chroma_scaling_from_luma: resd 1 + .num_uv_points: resd 2 + .uv_points: resb 2 * 10 * 2 + .scaling_shift: resd 1 + .ar_coeff_lag: resd 1 + .ar_coeffs_y: resb 24 + .ar_coeffs_uv: resb 2 * 28 ; includes padding + .ar_coeff_shift: resq 1 + .grain_scale_shift: resd 1 + .uv_mult: resd 2 + .uv_luma_mult: resd 2 + .uv_offset: resd 2 + .overlap_flag: resd 1 + .clip_to_restricted_range: resd 1 +endstruc + +cextern gaussian_sequence + +SECTION .text + +INIT_XMM avx2 +cglobal generate_grain_y, 2, 9, 16, buf, fg_data + lea r4, [pb_mask] +%define base r4-pb_mask + movq xm1, [base+rnd_next_upperbit_mask] + movq xm4, [base+mul_bits] + movq xm7, [base+hmul_bits] + mov r2d, 
[fg_dataq+FGData.grain_scale_shift] + vpbroadcastw xm8, [base+round+r2*2] + mova xm5, [base+pb_mask] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + vpbroadcastd xm9, [base+pd_m65536] + mov r2, -73*82 + sub bufq, r2 + lea r3, [gaussian_sequence] +.loop: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds + psllq xm6, xm2, 30 + por xm2, xm6 + psllq xm6, xm2, 15 + por xm2, xm6 ; aggregate each bit into next seed's high bit + pmulhuw xm3, xm0, xm7 + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + pmovzxwd xm3, xm2 + mova xm6, xm9 + vpgatherdd xm2, [r3+xm3*2], xm6 + pandn xm2, xm9, xm2 + packusdw xm2, xm2 + pmulhrsw xm2, xm8 + packsswb xm2, xm2 + movd [bufq+r2], xm2 + add r2, 4 + jl .loop + + ; auto-regression code + movsxd r2, [fg_dataq+FGData.ar_coeff_lag] + movsxd r2, [base+generate_grain_y_avx2_table+r2*4] + lea r2, [r2+base+generate_grain_y_avx2_table] + jmp r2 + +.ar1: + DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_y] + DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 + pinsrb xm4, [pb_1], 3 + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd + sub bufq, 82*73-(82*3+79) + mov hd, 70 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -76 + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + pmovsxbw xm0, [bufq+xq-82-1] ; top/left + pmovsxbw xm2, [bufq+xq-82+0] ; top + pmovsxbw xm1, [bufq+xq-82+1] ; top/right + punpcklwd xm0, xm2 + punpcklwd xm1, xm3 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d +%if WIN64 + sarx val3d, val3d, shiftd +%else + sar val3d, shiftb +%endif + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: + DEFINE_ARGS buf, fg_data, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + movq xm15, [base+byte_blend+1] + pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 + movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 + pmovsxbw xm9, xm9 + DEFINE_ARGS buf, fg_data, h, x + pshufd xm12, xm9, q0000 + pshufd xm13, xm9, q1111 + pshufd xm11, xm8, q3333 + pshufd xm10, xm8, q2222 + pshufd xm9, xm8, q1111 + pshufd xm8, xm8, q0000 + pmovzxwd xm14, xm14 + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] + psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] + psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] + punpcklwd xm2, xm0, xm2 + punpcklwd xm3, xm4 + pmaddwd xm2, xm8 + pmaddwd xm3, xm11 + paddd xm2, xm3 + + psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] + psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] + psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5] + punpcklwd xm4, xm5 + punpcklwd xm6, xm1 + psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5] + psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] + punpcklwd xm7, xm1 + pmaddwd xm4, xm9 + 
pmaddwd xm6, xm10 + pmaddwd xm7, xm12 + paddd xm4, xm6 + paddd xm2, xm7 + paddd xm2, xm4 + paddd xm2, xm14 + + movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pmovsxbw xm1, xm0 + pmaddwd xm3, xm1, xm13 + paddd xm3, xm2 + psrldq xm1, 4 ; y=0,x=0 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + paddw xm3, xm1 + packsswb xm3, xm3 + pextrb [bufq+xq], xm3, 0 + pslldq xm3, 2 + pand xm3, xm15 + pandn xm0, xm15, xm0 + por xm0, xm3 + psrldq xm0, 1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, shift +%if WIN64 + SUB rsp, 16*12 +%assign stack_size_padded (stack_size_padded+16*12) +%assign stack_size (stack_size+16*12) +%else + ALLOC_STACK 16*12 +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + movq xm15, [base+byte_blend] + pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-7 + pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_y+ 8] ; cf8-15 + pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pshufd xm9, xm0, q1111 + pshufd xm10, xm0, q2222 + pshufd xm11, xm0, q3333 + pshufd xm0, xm0, q0000 + pshufd xm6, xm1, q1111 + pshufd xm7, xm1, q2222 + pshufd xm8, xm1, q3333 + pshufd xm1, xm1, q0000 + pshufd xm3, xm2, q1111 + psrldq xm13, xm2, 10 + pinsrw xm2, [pw_1], 5 + pshufd xm4, xm2, q2222 + pshufd xm2, xm2, q0000 + pinsrw xm13, [base+round_vals+shiftq*2-10], 3 + mova [rsp+ 0*16], xm0 + mova [rsp+ 1*16], xm9 + mova [rsp+ 2*16], xm10 + mova [rsp+ 3*16], xm11 + mova [rsp+ 4*16], xm1 + mova [rsp+ 5*16], xm6 + mova [rsp+ 6*16], xm7 + mova [rsp+ 7*16], xm8 + mova [rsp+ 8*16], xm2 + mova [rsp+ 9*16], xm3 + mova [rsp+10*16], xm4 + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor xm3, xm3 + pcmpgtb xm6, xm3, xm2 + pcmpgtb xm5, xm3, xm1 + pcmpgtb xm4, xm3, xm0 + punpckhbw xm3, xm0, xm4 + punpcklbw xm0, xm4 + punpckhbw xm4, xm1, xm5 + punpcklbw xm1, xm5 + punpckhbw xm5, xm2, xm6 + punpcklbw xm2, xm6 + + psrldq xm6, xm0, 2 + psrldq xm7, xm0, 4 + psrldq xm8, xm0, 6 + psrldq xm9, xm0, 8 + palignr xm10, xm3, xm0, 10 + palignr xm11, xm3, xm0, 12 + + punpcklwd xm0, xm6 + punpcklwd xm7, xm8 + punpcklwd xm9, xm10 + punpcklwd xm11, xm1 + pmaddwd xm0, [rsp+ 0*16] + pmaddwd xm7, [rsp+ 1*16] + pmaddwd xm9, [rsp+ 2*16] + pmaddwd xm11, [rsp+ 3*16] + paddd xm0, xm7 + paddd xm9, xm11 + paddd xm0, xm9 + + psrldq xm6, xm1, 2 + psrldq xm7, xm1, 4 + psrldq xm8, xm1, 6 + psrldq xm9, xm1, 8 + palignr xm10, xm4, xm1, 10 + palignr xm11, xm4, xm1, 12 + psrldq xm12, xm2, 2 + + punpcklwd xm6, xm7 + punpcklwd xm8, xm9 + punpcklwd xm10, xm11 + punpcklwd xm12, xm2, xm12 + pmaddwd xm6, [rsp+ 4*16] + pmaddwd xm8, [rsp+ 5*16] + pmaddwd xm10, [rsp+ 6*16] + pmaddwd xm12, [rsp+ 7*16] + paddd xm6, xm8 + paddd xm10, xm12 + paddd xm6, xm10 + paddd xm0, xm6 + + psrldq xm6, xm2, 4 + psrldq xm7, xm2, 6 + psrldq xm8, xm2, 8 + palignr xm9, xm5, xm2, 10 + palignr xm5, xm5, xm2, 12 + + punpcklwd xm6, xm7 + punpcklwd xm8, xm9 + punpcklwd xm5, xm14 + pmaddwd xm6, [rsp+ 8*16] + pmaddwd xm8, [rsp+ 9*16] + pmaddwd xm5, [rsp+10*16] + paddd xm0, xm6 + paddd xm8, xm5 + paddd xm0, xm8 + + movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + 
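    ; the AR(3) filter is causal: each freshly written pixel is the left
    ; neighbour of the next one, so the inner loop emits a single pixel per
    ; iteration and folds it back into xm1, while the precomputed top-row
    ; sums in xm0 are shifted along (psrldq); only every 4th pixel re-enters
    ; the outer loop to rebuild those sums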
pmovsxbw xm2, xm1 + pmaddwd xm2, xm13 + pshufd xm3, xm2, q1111 + paddd xm2, xm3 ; left+cur + paddd xm2, xm0 ; add top + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + packsswb xm2, xm2 + pextrb [bufq+xq], xm2, 0 + pslldq xm2, 3 + pand xm2, xm15 + pandn xm1, xm15, xm1 + por xm1, xm2 + psrldq xm1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + dec hd + jg .y_loop_ar3 + RET + +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM avx2 +cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv + lea r4, [pb_mask] +%define base r4-pb_mask + movq xm1, [base+rnd_next_upperbit_mask] + movq xm4, [base+mul_bits] + movq xm7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + vpbroadcastw xm8, [base+round+r5*2] + mova xm5, [base+pb_mask] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + vpbroadcastw xm9, [base+pw_seed_xor+uvq*4] + pxor xm0, xm9 + vpbroadcastd xm9, [base+pd_m65536] + lea r6, [gaussian_sequence] +%if %2 + mov r7d, 73-35*%3 + add bufq, 44 +.loop_y: + mov r5, -44 +.loop_x: +%else + mov r5, -73*82 + sub bufq, r5 +.loop: +%endif + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds + psllq xm6, xm2, 30 + por xm2, xm6 + psllq xm6, xm2, 15 + por xm2, xm6 ; aggregate each bit into next seed's high bit + pmulhuw xm3, xm0, xm7 + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + pmovzxwd xm3, xm2 + mova xm6, xm9 + vpgatherdd xm2, [r6+xm3*2], xm6 + pandn xm2, xm9, xm2 + packusdw xm2, xm2 + pmulhrsw xm2, xm8 + packsswb xm2, xm2 + movd [bufq+r5], xm2 + add r5, 4 +%if %2 + jl .loop_x + add bufq, 82 + dec r7d + jg .loop_y +%else + jl .loop +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_avx2_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_avx2_table] + jmp r5 + +.ar0: + INIT_YMM avx2 + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + movd xm3, [base+hmul_bits+shiftq*2] + DEFINE_ARGS buf, bufy, h + pmovsxbw xm4, xm4 +%if %2 + vpbroadcastd m7, [pb_1] + vpbroadcastw m6, [hmul_bits+2+%3*2] +%endif + vpbroadcastw m4, xm4 + vpbroadcastw m3, xm3 + pxor m12, m12 +%if %2 + sub bufq, 82*(73-35*%3)+82-(82*3+41) +%else + sub bufq, 82*70-3 +%endif + add bufyq, 3+82*3 + mov hd, 70-35*%3 +.y_loop_ar0: +%if %2 + ; first 32 pixels + movu xm8, [bufyq] +%if %3 + movu xm9, [bufyq+82] +%endif + movu xm10, [bufyq+16] +%if %3 + movu xm11, [bufyq+82+16] +%endif + vinserti128 m8, [bufyq+32], 1 +%if %3 + vinserti128 m9, [bufyq+82+32], 1 +%endif + vinserti128 m10, [bufyq+48], 1 +%if %3 + vinserti128 m11, [bufyq+82+48], 1 +%endif + pmaddubsw m8, m7, m8 +%if %3 + pmaddubsw m9, m7, m9 +%endif + pmaddubsw m10, m7, m10 +%if %3 + pmaddubsw m11, m7, m11 + paddw m8, m9 + paddw m10, m11 +%endif + pmulhrsw m8, m6 + pmulhrsw m10, m6 +%else + xor r3d, r3d + ; first 32x2 pixels +.x_loop_ar0: + movu m8, [bufyq+r3] + pcmpgtb m9, m12, m8 + punpckhbw m10, m8, m9 + punpcklbw m8, m9 +%endif + pmullw m8, m4 + pmullw m10, m4 + pmulhrsw m8, m3 + pmulhrsw m10, m3 +%if %2 + movu m0, [bufq] +%else + movu m0, [bufq+r3] +%endif + pcmpgtb m1, m12, m0 + punpckhbw m9, m0, m1 + punpcklbw m0, m1 + paddw m0, m8 + paddw m9, m10 + packsswb m0, m9 
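    ; ar0 (chroma): the random grain already in bufq is augmented with the
    ; co-located luma grain (box-averaged 2x1 or 2x2 when subsampled),
    ; scaled by the single AR coefficient and rounded by ar_coeff_shift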
+%if %2 + movu [bufq], m0 +%else + movu [bufq+r3], m0 + add r3d, 32 + cmp r3d, 64 + jl .x_loop_ar0 +%endif + + ; last 6/12 pixels + movu xm8, [bufyq+32*2] +%if %2 +%if %3 + movu xm9, [bufyq+32*2+82] +%endif + pmaddubsw xm8, xm7, xm8 +%if %3 + pmaddubsw xm9, xm7, xm9 + paddw xm8, xm9 +%endif + pmulhrsw xm8, xm6 + pmullw xm8, xm4 + pmulhrsw xm8, xm3 + movq xm0, [bufq+32] + pcmpgtb xm9, xm12, xm0 + punpcklbw xm9, xm0, xm9 + paddw xm8, xm9 + packsswb xm8, xm8 + vpblendw xm0, xm8, xm0, 1000b + movq [bufq+32], xm0 +%else + pcmpgtb xm9, xm12, xm8 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + pmullw xm10, xm4 + pmullw xm8, xm4 + pmulhrsw xm10, xm3 + pmulhrsw xm8, xm3 + movu xm0, [bufq+64] + pcmpgtb xm9, xm12, xm0 + punpcklbw xm1, xm0, xm9 + punpckhbw xm9, xm0, xm9 + paddw xm1, xm8 + paddw xm9, xm10 + packsswb xm1, xm9 + vpblendw xm0, xm1, xm0, 11000000b + movu [bufq+64], xm0 +%endif + + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar0 + RET + +.ar1: + INIT_XMM avx2 + DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 + DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd +%if %2 + vpbroadcastd xm7, [pb_1] + vpbroadcastw xm6, [hmul_bits+2+%3*2] +%endif + vpbroadcastd xm3, xm3 +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + pmovsxbw xm0, [bufq+xq-82-1] ; top/left +%if %2 + movq xm8, [bufyq+xq*2] +%if %3 + movq xm9, [bufyq+xq*2+82] +%endif +%endif + psrldq xm2, xm0, 2 ; top + psrldq xm1, xm0, 4 ; top/right +%if %2 + pmaddubsw xm8, xm7, xm8 +%if %3 + pmaddubsw xm9, xm7, xm9 + paddw xm8, xm9 +%endif + pmulhrsw xm8, xm6 +%else + pmovsxbw xm8, [bufyq+xq] +%endif + punpcklwd xm0, xm2 + punpcklwd xm1, xm8 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 + paddd xm0, xm3 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar1 + RET + +.ar2: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm15, [base+round_vals-12+shiftq*2] + pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 + pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 + pinsrw xm9, [base+pw_1], 5 +%if %2 + vpbroadcastw xm7, [base+hmul_bits+2+%3*2] + vpbroadcastd xm6, [base+pb_1] +%endif + DEFINE_ARGS buf, bufy, fg_data, h, unused, x + pshufd xm12, xm9, q0000 + pshufd xm13, xm9, q1111 + pshufd xm14, xm9, q2222 + pshufd xm11, xm8, q3333 + pshufd xm10, xm8, q2222 + pshufd xm9, xm8, q1111 + pshufd xm8, xm8, q0000 +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar2: + mov 
xq, -(76>>%2) + +.x_loop_ar2: + pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] + psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] + psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] + punpcklwd xm2, xm0, xm2 + punpcklwd xm3, xm4 + pmaddwd xm2, xm8 + pmaddwd xm3, xm11 + paddd xm2, xm3 + + psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] + psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] + psrldq xm0, 8 ; y=-2,x=[+2,+5] + punpcklwd xm4, xm5 + punpcklwd xm0, xm1 + psrldq xm3, xm1, 6 ; y=-1,x=[+1,+5] + psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] + punpcklwd xm3, xm1 + pmaddwd xm4, xm9 + pmaddwd xm0, xm10 + pmaddwd xm3, xm12 + paddd xm4, xm0 + paddd xm2, xm3 + paddd xm2, xm4 + +%if %2 + movq xm0, [bufyq+xq*2] +%if %3 + movq xm3, [bufyq+xq*2+82] +%endif + pmaddubsw xm0, xm6, xm0 +%if %3 + pmaddubsw xm3, xm6, xm3 + paddw xm0, xm3 +%endif + pmulhrsw xm0, xm7 +%else + pmovsxbw xm0, [bufyq+xq] +%endif + punpcklwd xm0, xm15 + pmaddwd xm0, xm14 + paddd xm2, xm0 + + movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pmovsxbw xm0, xm0 + pmaddwd xm3, xm0, xm13 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + pslldq xm3, 2 + psrldq xm0, 2 + paddw xm3, xm0 + vpblendw xm0, xm3, 00000010b + packsswb xm0, xm0 + pextrb [bufq+xq], xm0, 1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + SUB rsp, 16*12 +%assign stack_size_padded (stack_size_padded+16*12) +%assign stack_size (stack_size+16*12) + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7 + pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15 + pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 + pmovsxbw xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] + pshufd xm9, xm0, q1111 + pshufd xm10, xm0, q2222 + pshufd xm11, xm0, q3333 + pshufd xm0, xm0, q0000 + pshufd xm6, xm1, q1111 + pshufd xm7, xm1, q2222 + pshufd xm8, xm1, q3333 + pshufd xm1, xm1, q0000 + pshufd xm3, xm2, q1111 + pshufd xm4, xm2, q2222 + vpbroadcastw xm5, xm5 + vpblendw xm4, xm5, 10101010b ; interleave luma cf + psrldq xm5, xm2, 10 + pshufd xm2, xm2, q0000 + pinsrw xm5, [base+round_vals+shiftq*2-10], 3 + pmovzxwd xm14, xm14 + mova [rsp+ 0*16], xm0 + mova [rsp+ 1*16], xm9 + mova [rsp+ 2*16], xm10 + mova [rsp+ 3*16], xm11 + mova [rsp+ 4*16], xm1 + mova [rsp+ 5*16], xm6 + mova [rsp+ 6*16], xm7 + mova [rsp+ 7*16], xm8 + mova [rsp+ 8*16], xm2 + mova [rsp+ 9*16], xm3 + mova [rsp+10*16], xm4 + mova [rsp+11*16], xm5 +%if %2 + vpbroadcastd xm13, [base+pb_1] + vpbroadcastw xm15, [base+hmul_bits+2+%3*2] +%endif + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor xm3, xm3 + pcmpgtb xm6, xm3, xm2 + pcmpgtb xm5, xm3, xm1 + pcmpgtb xm4, xm3, xm0 + punpckhbw xm3, xm0, xm4 + punpcklbw xm0, xm4 + punpckhbw xm4, xm1, xm5 + punpcklbw xm1, xm5 + punpckhbw xm5, xm2, xm6 + punpcklbw xm2, xm6 + + psrldq xm6, xm0, 2 + psrldq xm7, xm0, 4 + psrldq xm8, xm0, 6 + psrldq xm9, xm0, 8 + 
palignr xm10, xm3, xm0, 10 + palignr xm11, xm3, xm0, 12 + + punpcklwd xm0, xm6 + punpcklwd xm7, xm8 + punpcklwd xm9, xm10 + punpcklwd xm11, xm1 + pmaddwd xm0, [rsp+ 0*16] + pmaddwd xm7, [rsp+ 1*16] + pmaddwd xm9, [rsp+ 2*16] + pmaddwd xm11, [rsp+ 3*16] + paddd xm0, xm7 + paddd xm9, xm11 + paddd xm0, xm9 + + psrldq xm6, xm1, 2 + psrldq xm7, xm1, 4 + psrldq xm8, xm1, 6 + psrldq xm9, xm1, 8 + palignr xm10, xm4, xm1, 10 + palignr xm11, xm4, xm1, 12 + psrldq xm12, xm2, 2 + + punpcklwd xm6, xm7 + punpcklwd xm8, xm9 + punpcklwd xm10, xm11 + punpcklwd xm12, xm2, xm12 + pmaddwd xm6, [rsp+ 4*16] + pmaddwd xm8, [rsp+ 5*16] + pmaddwd xm10, [rsp+ 6*16] + pmaddwd xm12, [rsp+ 7*16] + paddd xm6, xm8 + paddd xm10, xm12 + paddd xm6, xm10 + paddd xm0, xm6 + + psrldq xm6, xm2, 4 + psrldq xm7, xm2, 6 + psrldq xm8, xm2, 8 + palignr xm9, xm5, xm2, 10 + palignr xm5, xm5, xm2, 12 + +%if %2 + movq xm1, [bufyq+xq*2] +%if %3 + movq xm2, [bufyq+xq*2+82] +%endif + pmaddubsw xm1, xm13, xm1 +%if %3 + pmaddubsw xm2, xm13, xm2 + paddw xm1, xm2 +%endif + pmulhrsw xm1, xm15 +%else + pmovsxbw xm1, [bufyq+xq] +%endif + + punpcklwd xm6, xm7 + punpcklwd xm8, xm9 + punpcklwd xm5, xm1 + pmaddwd xm6, [rsp+ 8*16] + pmaddwd xm8, [rsp+ 9*16] + pmaddwd xm5, [rsp+10*16] + paddd xm0, xm6 + paddd xm8, xm5 + paddd xm0, xm8 + paddd xm0, xm14 + + movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmovsxbw xm1, xm1 + pmaddwd xm2, xm1, [rsp+16*11] + pshufd xm3, xm2, q1111 + paddd xm2, xm3 ; left+cur + paddd xm2, xm0 ; add top + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw, we only care about one value + pslldq xm2, 6 + vpblendw xm1, xm2, 1000b + packsswb xm1, xm1 + pextrb [bufq+xq], xm1, 3 + psrldq xm1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +INIT_YMM avx2 +cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut + pcmpeqw m10, m10 + psrld m10, 24 + mov r7d, [fg_dataq+FGData.scaling_shift] + lea r8, [pb_mask] +%define base r8-pb_mask + vpbroadcastw m11, [base+mul_bits+r7*2-14] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + vpbroadcastw m12, [base+max+r7*4] + vpbroadcastw m13, [base+min+r7*2] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + + mov overlapd, [fg_dataq+FGData.overlap_flag] + movifnidn sbyd, sbym + test sbyd, sbyd + setnz r7b + test r7b, overlapb + jnz .vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + unused1, unused2, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + +.loop_x: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, overlap + + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, overlap + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + 
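    ; widen further to dwords: the pixel values double as vpgatherdd
    ; indices into the scaling look-up table below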
punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hd + jg .loop_y + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + test overlapd, overlapd + jz .loop_x + + ; r8m = sbym + movd xm15, [pb_27_17_17_27] + cmp dword r8m, 0 + jne .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) + movd xm14, [pw_1024] +.loop_x_h_overlap: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy + + lea left_offxyd, [offyd+32] ; previous column's offy*stride+offx + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + movd xm4, [grain_lutq+left_offxyq] + punpcklbw xm4, xm3 + pmaddubsw xm4, xm15, xm4 + pmulhrsw xm4, xm14 + packsswb xm4, xm4 + vpblendw xm4, xm3, 11111110b + vpblendd m3, m4, 00001111b + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hd + jg .loop_y_h_overlap + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + + ; r8m = sbym + cmp dword r8m, 0 + jne .loop_x_hv_overlap + jmp .loop_x_h_overlap + +.end: + RET + +.vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor 
seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + unused1, unused2, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + + vpbroadcastd m14, [pw_1024] +.loop_x_v_overlap: + vpbroadcastw m15, [pb_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, overlap, top_offxy + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + movu m4, [grain_lutq+top_offxyq] + punpckhbw m6, m4, m3 + punpcklbw m4, m3 + pmaddubsw m6, m15, m6 + pmaddubsw m4, m15, m4 + pmulhrsw m6, m14 + pmulhrsw m4, m14 + packsswb m3, m4, m6 + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq+srcq], m0 + + vpbroadcastw m15, [pb_27_17_17_27+2] ; swap weights for second v-overlap line + add srcq, strideq + add grain_lutq, 82 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: + add wq, 32 + jge .end_hv + lea srcq, [src_bakq+wq] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + + movd xm15, [pb_27_17_17_27] +.loop_x_hv_overlap: + vpbroadcastw m8, [pb_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+32] + lea left_offxyq, [offyq+32] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 
0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel + vpgatherdd m9, [scalingq+m4], m3 + pcmpeqw m3, m3 + vpgatherdd m4, [scalingq+m5], m3 + pcmpeqw m3, m3 + vpgatherdd m5, [scalingq+m6], m3 + pcmpeqw m3, m3 + vpgatherdd m6, [scalingq+m7], m3 + pand m9, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m9, m4 + packusdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + movu m6, [grain_lutq+top_offxyq] + movd xm4, [grain_lutq+left_offxyq] + movd xm7, [grain_lutq+topleft_offxyq] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw xm4, xm3 + punpcklbw xm7, xm6 + pmaddubsw xm4, xm15, xm4 + pmaddubsw xm7, xm15, xm7 + pmulhrsw xm4, xm14 + pmulhrsw xm7, xm14 + packsswb xm4, xm4 + packsswb xm7, xm7 + vpblendw xm4, xm3, 11111110b + vpblendw xm7, xm6, 11111110b + vpblendd m3, m4, 00001111b + vpblendd m6, m7, 00001111b + ; followed by v interpolation (top | cur -> cur) + punpckhbw m7, m6, m3 + punpcklbw m6, m3 + pmaddubsw m7, m8, m7 + pmaddubsw m6, m8, m6 + pmulhrsw m7, m14 + pmulhrsw m6, m14 + packsswb m3, m6, m7 + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m9 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq+srcq], m0 + + vpbroadcastw m8, [pb_27_17_17_27+2] ; swap weights for second v-overlap line + add srcq, strideq + add grain_lutq, 82 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: + add wq, 32 + lea srcq, [src_bakq+wq] + jl .loop_x_hv_overlap + +.end_hv: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id + pcmpeqw m10, m10 + psrld m10, 24 + mov r7d, [fg_dataq+FGData.scaling_shift] + lea r8, [pb_mask] +%define base r8-pb_mask + vpbroadcastw m11, [base+mul_bits+r7*2-14] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, dword is_idm + vpbroadcastw m13, [base+min+r7*2] + shlx r7d, r7d, r9d + vpbroadcastw m12, [base+max+r7*2] + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + +%if %1 + mov r7d, dword r11m + vpbroadcastb m0, [fg_dataq+FGData.uv_mult+r7*4] + vpbroadcastb m1, [fg_dataq+FGData.uv_luma_mult+r7*4] + punpcklbw m14, m1, m0 + vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4] +%else + vpbroadcastd m14, [pw_1024] +%if %2 + vpbroadcastd m15, [pb_23_22] 
+%else + vpbroadcastd xm15, [pb_27_17_17_27] +%endif +%endif + + mov overlapd, [fg_dataq+FGData.overlap_flag] + movifnidn sbyd, sbym + test sbyd, sbyd + setnz r7b + test r7b, overlapb + jnz %%vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused2, unused3, see, overlap, unused4, unused5, lstride + + mov lumaq, r9mp + lea r12, [srcq+wq] + lea r13, [dstq+wq] + lea r14, [lumaq+wq*(1+%2)] + mov r11mp, r12 + mov r12mp, r13 + mov lstrideq, r10mp + neg wq + +%%loop_x: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, unused1, unused2, lstride + + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, unused1, unused2, lstride + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y: + ; src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm6, [lumaq+lstrideq*0+16] + mova xm0, [srcq] + vpbroadcastd m7, [pb_1] + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 + vinserti128 m0, [srcq+strideq], 1 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + pxor m2, m2 + mova m4, [lumaq] + mova m0, [srcq] +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + + ; scaling[luma_src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] +%if %2 + movu xm3, [grain_lutq+offxyq+ 0] + vinserti128 m3, [grain_lutq+offxyq+82], 1 +%else + movu m3, [grain_lutq+offxyq] +%endif + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 + sub hb, 1+%2 + jg %%loop_y + + add wq, 32>>%2 + jge %%end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + test overlapd, overlapd + jz %%loop_x + + ; r8m 
= sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + + lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, lstride + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + ; src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm6, [lumaq+lstrideq*0+16] + mova xm0, [srcq] + vpbroadcastd m7, [pb_1] + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 + vinserti128 m0, [srcq+strideq], 1 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + + ; scaling[luma_src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] +%if %2 +%if %1 + vpbroadcastd m6, [pb_23_22] ; FIXME +%endif + movu xm3, [grain_lutq+offxyq+ 0] + movd xm4, [grain_lutq+left_offxyq+ 0] + vinserti128 m3, [grain_lutq+offxyq+82], 1 + vinserti128 m4, [grain_lutq+left_offxyq+82], 1 + punpcklbw m4, m3 +%if %1 + pmaddubsw m4, m6, m4 + pmulhrsw m4, [pw_1024] +%else + pmaddubsw m4, m15, m4 + pmulhrsw m4, m14 +%endif + packsswb m4, m4 + pcmpeqw m6, m6 ; FIXME + psrldq m6, 15 ; FIXME + vpblendvb m3, m3, m4, m6 +%else +%if %1 + vpbroadcastd xm6, [pb_27_17_17_27] +%endif + movu m3, [grain_lutq+offxyq] + movd xm4, [grain_lutq+left_offxyq] + punpcklbw xm4, xm3 +%if %1 + pmaddubsw xm4, xm6, xm4 + pmulhrsw xm4, [pw_1024] +%else + pmaddubsw xm4, xm15, xm4 + pmulhrsw xm4, xm14 +%endif + packsswb xm4, xm4 + pcmpeqw xm6, xm6 + psrldq xm6, 14 + vpblendvb m3, m3, m4, m6 +%endif + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, 
strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(1+%2) + sub hb, 1+%2 + jg %%loop_y_h_overlap + + add wq, 32>>%2 + jge %%end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap + +%%end: + RET + +%%vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, overlap, unused1, unused2, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused1, unused2, see, overlap, unused3, unused4, lstride + + mov lumaq, r9mp + lea r12, [srcq+wq] + lea r13, [dstq+wq] + lea r14, [lumaq+wq*(1+%2)] + mov r11mp, r12 + mov r12mp, r13 + mov lstrideq, r10mp + neg wq + +%%loop_x_v_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, top_offxy, unused, lstride + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy, unused, lstride + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +%if %2 == 0 + vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] +%endif +%%loop_y_v_overlap: + ; src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm6, [lumaq+lstrideq*0+16] + mova xm0, [srcq] + vpbroadcastd m7, [pb_1] + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 + vinserti128 m0, [srcq+strideq], 1 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + + ; scaling[luma_src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + +%if %2 + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word +%endif + + ; grain = grain_lut[offy+y][offx+x] +%if %3 == 0 +%if %2 + mova m6, [pb_8x_27_17_8x_17_27] + movu xm3, [grain_lutq+offxyq] + movu xm4, 
[grain_lutq+top_offxyq] + vinserti128 m3, [grain_lutq+offxyq+82], 1 + vinserti128 m4, [grain_lutq+top_offxyq+82], 1 +%else + movu m3, [grain_lutq+offxyq] + movu m4, [grain_lutq+top_offxyq] +%endif + punpckhbw m9, m4, m3 + punpcklbw m4, m3 +%if %2 + pmaddubsw m9, m6, m9 + pmaddubsw m4, m6, m4 +%else + pmaddubsw m9, m1, m9 + pmaddubsw m4, m1, m4 +%endif +%if %1 + pmulhrsw m9, [pw_1024] + pmulhrsw m4, [pw_1024] +%else + pmulhrsw m9, m14 + pmulhrsw m4, m14 +%endif + packsswb m3, m4, m9 +%else +%if %1 + vpbroadcastd m6, [pb_23_22] +%endif + movq xm3, [grain_lutq+offxyq] + movq xm4, [grain_lutq+top_offxyq] + vinserti128 m3, [grain_lutq+offxyq+8], 1 + vinserti128 m4, [grain_lutq+top_offxyq+8], 1 + punpcklbw m4, m3 +%if %1 + pmaddubsw m4, m6, m4 + pmulhrsw m4, [pw_1024] +%else + pmaddubsw m4, m15, m4 + pmulhrsw m4, m14 +%endif + packsswb m4, m4 + vpermq m4, m4, q3120 + ; only interpolate first line, insert second line unmodified + vinserti128 m3, m4, [grain_lutq+offxyq+82], 1 +%endif + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) +%if %2 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + pxor m6, m6 + punpckhbw m9, m0, m6 + punpcklbw m0, m6 ; m0-1: src as word + + paddw m0, m2 + paddw m9, m3 + pmaxsw m0, m13 + pmaxsw m9, m13 + pminsw m0, m12 + pminsw m9, m12 + packuswb m0, m9 + mova [dstq], m0 +%endif + + sub hb, 1+%2 + jl %%end_y_v_overlap +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 +%if %2 == 0 + vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] + btc hd, 16 + jnc %%loop_y_v_overlap +%endif + jmp %%loop_y + +%%end_y_v_overlap: + add wq, 32>>%2 + jge %%end_hv + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +%%loop_x_hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride + + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +%if %2 == 0 + vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] +%endif +%%loop_y_hv_overlap: + ; src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm6, [lumaq+lstrideq*0+16] + mova xm0, [srcq] + vpbroadcastd m7, [pb_1] + 
vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 + vinserti128 m0, [srcq+strideq], 1 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m9, m9 + pcmpeqw m3, m3 + vpgatherdd m8, [scalingq+m4], m9 + vpgatherdd m4, [scalingq+m5], m3 + pcmpeqw m9, m9 + pcmpeqw m3, m3 + vpgatherdd m5, [scalingq+m6], m9 + vpgatherdd m6, [scalingq+m7], m3 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + +%if %2 + ; unpack chroma source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word +%endif + + ; grain = grain_lut[offy+y][offx+x] +%if %1 +%if %2 + vpbroadcastd m9, [pb_23_22] +%else + vpbroadcastd xm9, [pb_27_17_17_27] +%endif +%endif + +%if %2 + movu xm3, [grain_lutq+offxyq] +%if %3 + movq xm6, [grain_lutq+top_offxyq] +%else + movu xm6, [grain_lutq+top_offxyq] +%endif + vinserti128 m3, [grain_lutq+offxyq+82], 1 +%if %3 + vinserti128 m6, [grain_lutq+top_offxyq+8], 1 +%else + vinserti128 m6, [grain_lutq+top_offxyq+82], 1 +%endif +%else + movu m3, [grain_lutq+offxyq] + movu m6, [grain_lutq+top_offxyq] +%endif + movd xm4, [grain_lutq+left_offxyq] + movd xm7, [grain_lutq+topleft_offxyq] +%if %2 + vinserti128 m4, [grain_lutq+left_offxyq+82], 1 +%if %3 == 0 + vinserti128 m7, [grain_lutq+topleft_offxyq+82], 1 +%endif +%endif + + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) +%if %2 + punpcklbw m4, m3 +%if %3 + punpcklbw xm7, xm6 +%else + punpcklbw m7, m6 +%endif + punpcklwd m4, m7 +%if %1 + pmaddubsw m4, m9, m4 + pmulhrsw m4, [pw_1024] +%else + pmaddubsw m4, m15, m4 + pmulhrsw m4, m14 +%endif + packsswb m4, m4 + pcmpeqw m9, m9 ; this is kind of ugly + psrldq m9, 15 + vpblendvb m3, m3, m4, m9 + psrldq m4, 1 +%if %3 + shufpd m9, m9, m9, 1110b ; clear upper lane +%endif + vpblendvb m6, m6, m4, m9 +%else + punpcklbw xm4, xm3 + punpcklbw xm7, xm6 + punpckldq xm4, xm7 +%if %1 + pmaddubsw xm4, xm9, xm4 + pmulhrsw xm4, [pw_1024] +%else + pmaddubsw xm4, xm15, xm4 + pmulhrsw xm4, xm14 +%endif + packsswb xm4, xm4 + pcmpeqw xm9, xm9 ; this is kind of ugly + psrldq xm9, 14 + vpblendvb m3, m3, m4, m9 + psrldq xm4, 2 + vpblendvb m6, m6, m4, m9 +%endif + + ; followed by v interpolation (top | cur -> cur) +%if %3 + vpermq m9, m3, q3120 + punpcklbw m6, m9 +%if %1 + vpbroadcastd m9, [pb_23_22] + pmaddubsw m6, m9, m6 + pmulhrsw m6, [pw_1024] +%else + pmaddubsw m6, m15, m6 + pmulhrsw m6, m14 +%endif + packsswb m6, m6 + vpermq m6, m6, q3120 + vpblendd m3, m3, m6, 00001111b +%else + punpckhbw m9, m6, m3 + punpcklbw m6, m3 +%if %2 + mova m3, [pb_8x_27_17_8x_17_27] + pmaddubsw m9, m3, m9 + pmaddubsw m6, m3, m6 +%else + pmaddubsw m9, m1, m9 + pmaddubsw m6, m1, m6 +%endif +%if %1 + pmulhrsw m9, [pw_1024] + pmulhrsw m6, [pw_1024] +%else + pmulhrsw m9, m14 + pmulhrsw m6, m14 +%endif + packsswb m3, m6, m9 +%endif + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + 
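    ; pmullw forms scaling*grain in 16 bits (|grain| < 128, scaling < 256,
    ; so no overflow); pmulhrsw against mul_bits (2^(15-shift)) then
    ; performs the rounded shift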
pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) +%if %2 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + pxor m6, m6 + punpckhbw m9, m0, m6 + punpcklbw m0, m6 ; m0-1: src as word + paddw m0, m2 + paddw m9, m3 + pmaxsw m0, m13 + pmaxsw m9, m13 + pminsw m0, m12 + pminsw m9, m12 + packuswb m0, m9 + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 + sub hb, 1+%2 +%if %2 + jg %%loop_y_h_overlap +%else + je %%end_y_hv_overlap + vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] + btc hd, 16 + jnc %%loop_y_hv_overlap + jmp %%loop_y_h_overlap +%endif + +%%end_y_hv_overlap: + add wq, 32>>%2 + jge %%end_hv + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + jmp %%loop_x_hv_overlap + +%%end_hv: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 + +%endif ; ARCH_X86_64 |
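For reference, the per-pixel operation that the fgy/fguv loops above vectorize can be read as the following scalar sketch (an illustrative summary of the code, not dav1d's C implementation; the names are made up):

    #include <stdint.h>

    /* round2() as used throughout the file: add half, then shift right. */
    static inline int round2(int x, int shift) {
        return (x + (1 << (shift - 1))) >> shift;
    }

    /* One row of grain application: noise = round2(scaling[src] * grain, shift),
     * dst = clip(src + noise); mn/mx depend on clip_to_restricted_range. */
    static void apply_grain_row(uint8_t *dst, const uint8_t *src, int w,
                                const uint8_t scaling[256], const int8_t *grain,
                                int scaling_shift, int mn, int mx)
    {
        for (int x = 0; x < w; x++) {
            int noise = round2(scaling[src[x]] * grain[x], scaling_shift);
            int px = src[x] + noise;
            dst[x] = (uint8_t)(px < mn ? mn : px > mx ? mx : px);
        }
    }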