diff options
Diffstat (limited to 'third_party/dav1d/src/x86/filmgrain16_avx2.asm')
-rw-r--r-- | third_party/dav1d/src/x86/filmgrain16_avx2.asm | 2248 |
1 files changed, 2248 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/filmgrain16_avx2.asm b/third_party/dav1d/src/x86/filmgrain16_avx2.asm new file mode 100644 index 0000000000..a1d4c41f27 --- /dev/null +++ b/third_party/dav1d/src/x86/filmgrain16_avx2.asm @@ -0,0 +1,2248 @@ +; Copyright © 2021-2022, VideoLAN and dav1d authors +; Copyright © 2021-2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 +pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0 +gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +gen_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +pw_27_17_17_27: dw 27, 17, 17, 27 +pw_23_22: dw 23, 22, 0, 32 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +gen_ar0_shift: times 4 db 128 + times 4 db 64 + times 4 db 32 + times 4 db 16 +pd_16: dd 16 +pd_m65536: dd -65536 +pb_1: times 4 db 1 +grain_max: times 2 dw 511 + times 2 dw 2047 +grain_min: times 2 dw -512 + times 2 dw -2048 +fg_max: times 2 dw 1023 + times 2 dw 4095 + times 2 dw 960 + times 2 dw 3840 + times 2 dw 940 + times 2 dw 3760 +fg_min: times 2 dw 0 + times 2 dw 64 + times 2 dw 256 +uv_offset_mul: dd 256 + dd 1024 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16, 8 +round_vals: dw 32, 64, 128, 256, 512, 1024 +pb_8_9_0_1: db 8, 9, 0, 1 + +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3 + +SECTION .text + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +INIT_YMM avx2 +cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax +%define base r4-generate_grain_y_16bpc_avx2_table + lea r4, [generate_grain_y_16bpc_avx2_table] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + mov r6d, [fg_dataq+FGData.grain_scale_shift] + movq xm1, [base+next_upperbit_mask] + mov r3, -73*82*2 + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + lea r7d, [bdmaxq+1] + movq xm4, [base+mul_bits] + shr r7d, 11 ; 0 for 10bpc, 2 for 12bpc + movq xm5, [base+hmul_bits] + sub r6, r7 + mova xm6, [base+pb_mask] + sub bufq, r3 + vpbroadcastw xm7, [base+round+r6*2-2] + lea r6, [gaussian_sequence] + movsxd r5, [r4+r5*4] +.loop: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pmulhuw xm0, xm5 + pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm2, xm0 ; aggregate each bit into next seed's high bit + por xm3, xm2 ; 4 next output seeds + pshuflw xm0, xm3, q3333 + psrlw xm3, 5 + pand xm2, xm0, xm1 + movq r7, xm3 + psrlw xm3, xm2, 10 + por xm2, xm3 + pmullw xm2, xm4 + pmulhuw xm0, xm5 + movzx r8d, r7w + pshufb xm3, xm6, xm2 + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm0, xm2 + movd xm2, [r6+r8*2] + rorx r8, r7, 32 + por xm3, xm0 + shr r7d, 16 + pinsrw xm2, [r6+r7*2], 1 + pshuflw xm0, xm3, q3333 + movzx r7d, r8w + psrlw xm3, 5 + pinsrw xm2, [r6+r7*2], 2 + shr r8d, 16 + movq r7, xm3 + pinsrw xm2, [r6+r8*2], 3 + movzx r8d, r7w + pinsrw xm2, [r6+r8*2], 4 + rorx r8, r7, 32 + shr r7d, 16 + pinsrw xm2, [r6+r7*2], 5 + movzx r7d, r8w + pinsrw xm2, [r6+r7*2], 6 + shr r8d, 16 + pinsrw xm2, [r6+r8*2], 7 + paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 + pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support + mova [bufq+r3], xm2 + add r3, 8*2 + jl .loop + + ; auto-regression code + add r5, r4 + jmp r5 + +.ar1: + DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_y] + DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 + pinsrb xm4, [base+pb_1], 3 + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -76 + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu xm0, [bufq+xq*2-82*2-2] ; top/left + psrldq xm2, xm0, 2 ; top + psrldq xm1, xm0, 4 ; top/right + punpcklwd xm0, xm2 + punpcklwd xm1, xm3 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xb, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 +.x_loop_ar1_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movq xm0, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11 + vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4 + vpbroadcastw xm10, [base+round_vals-12+shiftq*2] + pxor m1, m1 + punpcklwd xm10, xm1 + pcmpgtb m1, m0 + punpcklbw m0, m1 ; cf5-11,0-4 + vpermq m1, m0, q3333 ; cf4 + vbroadcasti128 m11, [base+gen_shufA] + pshufd m6, m0, q0000 ; cf[5,6], cf[0-1] + vbroadcasti128 m12, [base+gen_shufB] + pshufd m7, m0, q1111 ; cf[7,8], cf[2-3] + punpckhwd xm1, xm0 + pshufhw xm9, xm0, q2121 + pshufd xm8, xm1, q0000 ; cf[4,9] + sar bdmaxd, 1 + punpckhqdq xm9, xm9 ; cf[10,11] + movd xm4, bdmaxd ; max_grain + pcmpeqd xm5, xm5 + sub bufq, 2*(82*73-(82*3+79)) + pxor xm5, xm4 ; min_grain + DEFINE_ARGS buf, fg_data, h, x + mov hd, 70 +.y_loop_ar2: + mov xq, -76 +.x_loop_ar2: + vbroadcasti128 m2, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + vinserti128 m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5] + pshufb m0, m1, m11 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + pmaddwd m0, m6 + punpckhwd xm2, xm1 ; y=-2/-1 interleaved, x=[+2,+5] + pshufb m1, m12 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + pmaddwd m1, m7 + pmaddwd xm2, xm8 + paddd m0, m1 + vextracti128 xm1, m0, 1 + paddd xm0, xm10 + paddd xm2, xm0 + movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] + paddd xm2, xm1 + pmovsxwd xm1, [bufq+xq*2] ; in dwords, y=0,x=[0,3] +.x_loop_ar2_inner: + pmaddwd xm3, xm9, xm0 + psrldq xm0, 2 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; skip packssdw because we only care about one value + paddd xm3, xm1 + pminsd xm3, xm4 + psrldq xm1, 4 + pmaxsd xm3, xm5 + pextrw [bufq+xq*2], xm3, 0 + punpcklwd xm3, xm3 + pblendw xm0, xm3, 0010b + inc xq + jz .x_loop_ar2_end + test xb, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 +.x_loop_ar2_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + sar bdmaxd, 1 + movq xm7, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6 + movd xm0, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16 + pinsrb xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13 + pinsrb xm0, [base+pb_1], 3 ; cf14-16,pb_1 + movd xm1, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23 + vinserti128 m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13 + vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20 + vpbroadcastw xm11, [base+round_vals+shiftq*2-12] + movd xm12, bdmaxd ; max_grain + punpcklbw m7, m7 ; sign-extension + punpcklbw m0, m0 ; sign-extension + punpcklbw xm1, xm1 + REPX {psraw x, 8}, m7, m0, xm1 + pshufd m4, m7, q0000 ; cf[0,1] | cf[7,8] + pshufd m5, m7, q1111 ; cf[2,3] | cf[9,10] + pshufd m6, m7, q2222 ; cf[4,5] | cf[11,12] + pshufd xm7, xm7, q3333 ; cf[6,13] + pshufd m8, m0, q0000 ; cf[14,15] | cf[17,18] + pshufd m9, m0, q1111 ; cf[16],pw_1 | cf[19,20] + paddw xm0, xm11, xm11 + pcmpeqd xm13, xm13 + pblendw xm10, xm1, xm0, 00001000b + pxor xm13, xm12 ; min_grain + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 +.x_loop_ar3: + movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] + movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] + vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] + palignr m3, m1, m0, 2 ; y=-3/-2,x=[-2,+5] + palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] + punpckhwd m2, m0, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] + pmaddwd m0, m4 + pmaddwd m2, m6 + pmaddwd m3, m5 + paddd m0, m2 + movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] + paddd m0, m3 + psrldq m3, m2, 2 + punpcklwd m3, m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + pmaddwd m3, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4] + paddd m0, m3 + psrldq m3, m2, 4 + psrldq m2, 6 + vpblendd m2, m11, 0x0f ; rounding constant + punpcklwd m3, m2 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] + pmaddwd m3, m9 ; x=[+2/+3,+3/+4,+4/+5,+5,+6] + vextracti128 xm2, m1, 1 + punpcklwd xm1, xm2 + pmaddwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6] + paddd m0, m3 + vextracti128 xm2, m0, 1 + paddd xm0, xm1 + movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] + paddd xm0, xm2 +.x_loop_ar3_inner: + pmaddwd xm2, xm1, xm10 + pshuflw xm3, xm2, q1032 + paddd xm2, xm0 ; add top + paddd xm2, xm3 ; left+cur + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; skip packssdw because we only care about one value + pminsd xm2, xm12 + pmaxsd xm2, xm13 + pextrw [bufq+xq*2], xm2, 0 + pslldq xm2, 4 + psrldq xm1, 2 + pblendw xm1, xm2, 0100b + inc xq + jz .x_loop_ar3_end + test xb, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 +.x_loop_ar3_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar3 + RET + +%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y +INIT_XMM avx2 +cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax +%define base r8-generate_grain_uv_%1_16bpc_avx2_table + lea r8, [generate_grain_uv_%1_16bpc_avx2_table] + movifnidn bdmaxd, bdmaxm + vpbroadcastw xm0, [fg_dataq+FGData.seed] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + movq xm1, [base+next_upperbit_mask] + lea r6d, [bdmaxq+1] + movq xm4, [base+mul_bits] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + movq xm5, [base+hmul_bits] + sub r5, r6 + mova xm6, [base+pb_mask] + vpbroadcastd xm2, [base+pw_seed_xor+uvq*4] + vpbroadcastw xm7, [base+round+r5*2-2] + pxor xm0, xm2 + lea r6, [gaussian_sequence] +%if %2 + mov r7d, 73-35*%3 + add bufq, 44*2 +.loop_y: + mov r5, -44*2 +%else + mov r5, -82*73*2 + sub bufq, r5 +%endif +.loop_x: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pmulhuw xm0, xm5 + pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm2, xm0 ; aggregate each bit into next seed's high bit + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + movq r10, xm2 + movzx r9d, r10w + movd xm2, [r6+r9*2] + rorx r9, r10, 32 + shr r10d, 16 + pinsrw xm2, [r6+r10*2], 1 + movzx r10d, r9w + pinsrw xm2, [r6+r10*2], 2 + shr r9d, 16 + pinsrw xm2, [r6+r9*2], 3 + paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 + pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support + movq [bufq+r5], xm2 + add r5, 8 + jl .loop_x +%if %2 + add bufq, 82*2 + dec r7d + jg .loop_y +%endif + + ; auto-regression code + movsxd r6, [fg_dataq+FGData.ar_coeff_lag] + movsxd r6, [r8+r6*4] + add r6, r8 + jmp r6 + +INIT_YMM avx2 +.ar0: + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastb m0, [fg_dataq+FGData.ar_coeffs_uv+uvq] + sar bdmaxd, 1 + vpbroadcastd m4, [base+gen_ar0_shift-24+shiftq*4] + movd xm6, bdmaxd + pcmpeqw m7, m7 + pmaddubsw m4, m0 ; ar_coeff << (14 - shift) + vpbroadcastw m6, xm6 ; max_gain + pxor m7, m6 ; min_grain + DEFINE_ARGS buf, bufy, h, x +%if %2 + vpbroadcastw m5, [base+hmul_bits+2+%3*2] + sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) +%else + sub bufq, 2*(82*70-3) +%endif + add bufyq, 2*(3+82*3) + mov hd, 70-35*%3 +.y_loop_ar0: +%if %2 + ; first 32 pixels + movu xm0, [bufyq+16*0] + vinserti128 m0, [bufyq+16*2], 1 + movu xm1, [bufyq+16*1] + vinserti128 m1, [bufyq+16*3], 1 +%if %3 + movu xm2, [bufyq+82*2+16*0] + vinserti128 m2, [bufyq+82*2+16*2], 1 + movu xm3, [bufyq+82*2+16*1] + vinserti128 m3, [bufyq+82*2+16*3], 1 + paddw m0, m2 + paddw m1, m3 +%endif + phaddw m0, m1 + movu xm1, [bufyq+16*4] + vinserti128 m1, [bufyq+16*6], 1 + movu xm2, [bufyq+16*5] + vinserti128 m2, [bufyq+16*7], 1 +%if %3 + movu xm3, [bufyq+82*2+16*4] + vinserti128 m3, [bufyq+82*2+16*6], 1 + paddw m1, m3 + movu xm3, [bufyq+82*2+16*5] + vinserti128 m3, [bufyq+82*2+16*7], 1 + paddw m2, m3 +%endif + phaddw m1, m2 + pmulhrsw m0, m5 + pmulhrsw m1, m5 +%else + xor xd, xd +.x_loop_ar0: + movu m0, [bufyq+xq*2] + movu m1, [bufyq+xq*2+32] +%endif + paddw m0, m0 + paddw m1, m1 + pmulhrsw m0, m4 + pmulhrsw m1, m4 +%if %2 + paddw m0, [bufq+ 0] + paddw m1, [bufq+32] +%else + paddw m0, [bufq+xq*2+ 0] + paddw m1, [bufq+xq*2+32] +%endif + pminsw m0, m6 + pminsw m1, m6 + pmaxsw m0, m7 + pmaxsw m1, m7 +%if %2 + movu [bufq+ 0], m0 + movu [bufq+32], m1 + + ; last 6 pixels + movu xm0, [bufyq+32*4] + movu xm1, [bufyq+32*4+16] +%if %3 + paddw xm0, [bufyq+32*4+82*2] + paddw xm1, [bufyq+32*4+82*2+16] +%endif + phaddw xm0, xm1 + movu xm1, [bufq+32*2] + pmulhrsw xm0, xm5 + paddw xm0, xm0 + pmulhrsw xm0, xm4 + paddw xm0, xm1 + pminsw xm0, xm6 + pmaxsw xm0, xm7 + vpblendd xm0, xm1, 0x08 + movu [bufq+32*2], xm0 +%else + movu [bufq+xq*2+ 0], m0 + movu [bufq+xq*2+32], m1 + add xd, 32 + cmp xd, 64 + jl .x_loop_ar0 + + ; last 12 pixels + movu m0, [bufyq+64*2] + movu m1, [bufq+64*2] + paddw m0, m0 + pmulhrsw m0, m4 + paddw m0, m1 + pminsw m0, m6 + pmaxsw m0, m7 + vpblendd m0, m1, 0xc0 + movu [bufq+64*2], m0 +%endif + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar0 + RET + +INIT_XMM avx2 +.ar1: + DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 + DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd + vpbroadcastw xm6, [base+hmul_bits+2+%3*2] + vpbroadcastd xm3, xm3 +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu xm0, [bufq+xq*2-82*2-2] ; top/left +%if %2 + movu xm2, [bufyq+xq*4] +%else + movq xm2, [bufyq+xq*2] +%endif +%if %2 +%if %3 + phaddw xm2, [bufyq+xq*4+82*2] + punpckhqdq xm1, xm2, xm2 + paddw xm2, xm1 +%else + phaddw xm2, xm2 +%endif + pmulhrsw xm2, xm6 +%endif + psrldq xm1, xm0, 4 ; top/right + punpcklwd xm1, xm2 + psrldq xm2, xm0, 2 ; top + punpcklwd xm0, xm2 + pmaddwd xm1, xm5 + pmaddwd xm0, xm4 + paddd xm1, xm3 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xb, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 +.x_loop_ar1_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar1 + RET + +INIT_YMM avx2 +.ar2: +%if WIN64 + ; xmm6 and xmm7 already saved + %assign xmm_regs_used 13 + %2 + %assign stack_size_padded 136 + SUB rsp, stack_size_padded + movaps [rsp+16*2], xmm8 + movaps [rsp+16*3], xmm9 + movaps [rsp+16*4], xmm10 + movaps [rsp+16*5], xmm11 + movaps [rsp+16*6], xmm12 +%if %2 + movaps [rsp+16*7], xmm13 +%endif +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vbroadcasti128 m10, [base+gen_shufA] + sar bdmaxd, 1 + vbroadcasti128 m11, [base+gen_shufB] + movd xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5] + pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4 + pinsrb xm7, [base+pb_1], 5 + pinsrw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3 + movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13 + pmovsxbw m7, xm7 + movd xm8, bdmaxd ; max_grain + pshufd m4, m7, q0000 + vpbroadcastw xm12, [base+round_vals-12+shiftq*2] + pshufd m5, m7, q1111 + pcmpeqd xm9, xm9 + pshufd m6, m7, q2222 + pxor xm9, xm8 ; min_grain + pshufd xm7, xm7, q3333 + DEFINE_ARGS buf, bufy, fg_data, h, x +%if %2 + vpbroadcastw xm13, [base+hmul_bits+2+%3*2] + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) +.x_loop_ar2: + vbroadcasti128 m3, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + vinserti128 m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] + pshufb m0, m2, m10 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + pmaddwd m0, m4 + pshufb m1, m2, m11 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + pmaddwd m1, m5 + punpckhwd m2, m3 ; y=-2/-1 interleaved, x=[+2,+5] +%if %2 + movu xm3, [bufyq+xq*4] +%if %3 + paddw xm3, [bufyq+xq*4+82*2] +%endif + phaddw xm3, xm3 + pmulhrsw xm3, xm13 +%else + movq xm3, [bufyq+xq*2] +%endif + punpcklwd xm3, xm12 ; luma, round interleaved + vpblendd m2, m3, 0x0f + pmaddwd m2, m6 + paddd m1, m0 + movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] + paddd m2, m1 + vextracti128 xm1, m2, 1 + paddd xm2, xm1 + pshufd xm1, xm0, q3321 + pmovsxwd xm1, xm1 ; y=0,x=[0,3] in dword +.x_loop_ar2_inner: + pmaddwd xm3, xm7, xm0 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; we do not need to packssdw since we only care about one value + paddd xm3, xm1 + psrldq xm1, 4 + pminsd xm3, xm8 + pmaxsd xm3, xm9 + pextrw [bufq+xq*2], xm3, 0 + psrldq xm0, 2 + pslldq xm3, 2 + pblendw xm0, xm3, 00000010b + inc xq + jz .x_loop_ar2_end + test xb, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 +.x_loop_ar2_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar2 + RET + +.ar3: +%if WIN64 + ; xmm6 and xmm7 already saved + %assign stack_offset 32 + %assign xmm_regs_used 14 + %2 + %assign stack_size_padded 152 + SUB rsp, stack_size_padded + movaps [rsp+16*2], xmm8 + movaps [rsp+16*3], xmm9 + movaps [rsp+16*4], xmm10 + movaps [rsp+16*5], xmm11 + movaps [rsp+16*6], xmm12 + movaps [rsp+16*7], xmm13 +%if %2 + movaps [rsp+16*8], xmm14 +%endif +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm11, [base+round_vals-12+shiftq*2] + sar bdmaxd, 1 + movq xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma + movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7] + pmovsxbw m7, xm7 +%if %2 + vpbroadcastw xm14, [base+hmul_bits+2+%3*2] +%endif + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14] + pinsrb xm0, [base+pb_1], 3 + pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1 + pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2 + pmovsxbw m0, xm0 + movd xm12, bdmaxd ; max_grain + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pcmpeqd xm13, xm13 + punpckhqdq xm10, xm0, xm0 + pxor xm13, xm12 ; min_grain + pinsrw xm10, [base+round_vals-10+shiftq*2], 3 + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) +.x_loop_ar3: + movu xm2, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + vinserti128 m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] + movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] + vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] + palignr m3, m1, m2, 2 ; y=-3/-2,x=[-2,+5] + palignr m1, m2, 12 ; y=-3/-2,x=[+3,+6] + punpcklwd m0, m2, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] + punpckhwd m2, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] + shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] + pmaddwd m0, m4 + pmaddwd m2, m6 + pmaddwd m3, m5 + paddd m0, m2 + paddd m0, m3 + movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] +%if %2 + movu xm3, [bufyq+xq*4] +%if %3 + paddw xm3, [bufyq+xq*4+82*2] +%endif + phaddw xm3, xm3 + pmulhrsw xm3, xm14 +%else + movq xm3, [bufyq+xq*2] +%endif + punpcklwd m1, m3 + pmaddwd m1, m7 + paddd m0, m1 + psrldq m1, m2, 4 + psrldq m3, m2, 6 + vpblendd m3, m11, 0x0f ; rounding constant + punpcklwd m1, m3 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] + pmaddwd m1, m9 ; x=[+2/+3,+3/+4,+4/+5,+5,+6] + psrldq m3, m2, 2 + punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + pmaddwd m2, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4] + paddd m0, m1 + movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] + paddd m0, m2 + vextracti128 xm2, m0, 1 + paddd xm0, xm2 +.x_loop_ar3_inner: + pmaddwd xm2, xm1, xm10 + pshuflw xm3, xm2, q1032 + paddd xm2, xm0 ; add top + paddd xm2, xm3 ; left+cur + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + psrldq xm1, 2 + ; no need to packssdw since we only care about one value + pminsd xm2, xm12 + pmaxsd xm2, xm13 + pextrw [bufq+xq*2], xm2, 0 + pslldq xm2, 4 + pblendw xm1, xm2, 00000100b + inc xq + jz .x_loop_ar3_end + test xb, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 +.x_loop_ar3_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, unused, sby, see +%define base r11-grain_min + lea r11, [grain_min] + mov r6d, r9m ; bdmax + mov r9d, [fg_dataq+FGData.clip_to_restricted_range] + mov r7d, [fg_dataq+FGData.scaling_shift] + mov sbyd, sbym + vpbroadcastd m8, r9m + shr r6d, 11 ; is_12bpc + vpbroadcastd m9, [base+grain_min+r6*4] + shlx r10d, r9d, r6d + vpbroadcastd m10, [base+grain_max+r6*4] + lea r9d, [r6+r9*4] + vpbroadcastw m11, [base+mul_bits+r7*2-12] + vpbroadcastd m12, [base+fg_min+r10*4] + vpbroadcastd m13, [base+fg_max+r9*4] + test sbyd, sbyd + setnz r7b + vpbroadcastd m14, [base+pd_16] + test r7b, [fg_dataq+FGData.overlap_flag] + jnz .vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak + + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + +.loop_x: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y: + ; scaling[src] + mova m0, [srcq+ 0] + mova m1, [srcq+32] + pand m4, m8, m0 + psrld m3, m0, 16 + mova m6, m9 + vpgatherdd m2, [scalingq+m4-0], m9 + pand m3, m8 + mova m9, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pand m5, m8, m1 + mova m6, m9 + vpgatherdd m3, [scalingq+m5-0], m9 + pblendw m4, m2, 0x55 + psrld m2, m1, 16 + mova m9, m6 + pand m2, m8 + vpgatherdd m5, [scalingq+m2-2], m6 + pblendw m5, m3, 0x55 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m4, [grain_lutq+offxyq*2] + pmulhrsw m5, [grain_lutq+offxyq*2+32] + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hd + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je .loop_x + movq xm7, [pw_27_17_17_27] + cmp dword r8m, 0 ; sby + jne .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy + + lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y_h_overlap: + ; scaling[src] + mova m0, [srcq+ 0] + mova m1, [srcq+32] + pand m4, m8, m0 + psrld m3, m0, 16 + mova m6, m9 + vpgatherdd m2, [scalingq+m4-0], m9 + pand m3, m8 + mova m9, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pand m5, m8, m1 + mova m6, m9 + vpgatherdd m3, [scalingq+m5-0], m9 + pblendw m4, m2, 0x55 + psrld m2, m1, 16 + mova m9, m6 + pand m2, m8 + vpgatherdd m5, [scalingq+m2-2], m6 + pblendw m5, m3, 0x55 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] + movd xm6, [grain_lutq+left_offxyq*2] + punpcklwd xm6, xm3 + pmaddwd xm6, xm7 + paddd xm6, xm14 + psrad xm6, 5 + packssdw xm6, xm6 + pmaxsw xm6, xm9 + pminsw xm6, xm10 + vpblendd m3, m6, 0x01 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m4, m3 + pmulhrsw m5, [grain_lutq+offxyq*2+32] + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hd + jg .loop_y_h_overlap + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + cmp dword r8m, 0 ; sby + jne .loop_x_hv_overlap + jmp .loop_x_h_overlap + +.vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see, src_bak + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + +.loop_x_v_overlap: + vpbroadcastd m15, [pw_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, unused, top_offxy + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, unused, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +.loop_y_v_overlap: + ; scaling[src] + mova m0, [srcq+ 0] + mova m1, [srcq+32] + pand m4, m8, m0 + psrld m3, m0, 16 + mova m6, m9 + vpgatherdd m2, [scalingq+m4-0], m9 + pand m3, m8 + mova m9, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pand m5, m8, m1 + mova m6, m9 + vpgatherdd m3, [scalingq+m5-0], m9 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m9, m6 + pand m4, m8 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m6, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+top_offxyq*2] + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + pmaddwd m4, m15 + pmaddwd m5, m15 + movu m7, [grain_lutq+offxyq*2+32] + movu m6, [grain_lutq+top_offxyq*2+32] + paddd m4, m14 + paddd m5, m14 + psrad m4, 5 + psrad m5, 5 + packssdw m4, m5 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 + pmaddwd m5, m15 + pmaddwd m6, m15 + paddd m5, m14 + paddd m6, m14 + psrad m5, 5 + psrad m6, 5 + packssdw m5, m6 + pmaxsw m4, m9 + pmaxsw m5, m9 + pminsw m4, m10 + pminsw m5, m10 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m11 + pmaddubsw m3, m11 + paddw m2, m2 + paddw m3, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hb + jz .end_y_v_overlap + vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + add hd, 0x80000000 + jnc .loop_y_v_overlap + jmp .loop_y +.end_y_v_overlap: + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +.loop_x_hv_overlap: + vpbroadcastd m15, [pw_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyd, [top_offxyq+32] + lea left_offxyd, [offyq+32] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +.loop_y_hv_overlap: + ; scaling[src] + mova m0, [srcq+ 0] + mova m1, [srcq+32] + pand m4, m8, m0 + psrld m3, m0, 16 + mova m6, m9 + vpgatherdd m2, [scalingq+m4-0], m9 + pand m3, m8 + mova m9, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pand m5, m8, m1 + mova m6, m9 + vpgatherdd m3, [scalingq+m5-0], m9 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m9, m6 + pand m4, m8 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m7, [grain_lutq+offxyq*2] + movd xm6, [grain_lutq+left_offxyq*2] + movu m5, [grain_lutq+top_offxyq*2] + movd xm4, [grain_lutq+topleft_offxyq*2] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklwd xm6, xm7 + punpcklwd xm4, xm5 + punpcklqdq xm6, xm4 + movddup xm4, [pw_27_17_17_27] + pmaddwd xm6, xm4 + paddd xm6, xm14 + psrad xm6, 5 + packssdw xm6, xm6 + pmaxsw xm6, xm9 + pminsw xm6, xm10 + pshuflw xm4, xm6, q1032 + vpblendd m6, m7, 0xfe + vpblendd m4, m5, 0xfe + ; followed by v interpolation (top | cur -> cur) + punpckhwd m5, m7 + pmaddwd m5, m15 + punpcklwd m4, m6 + pmaddwd m4, m15 + movu m7, [grain_lutq+offxyq*2+32] + movu m6, [grain_lutq+top_offxyq*2+32] + paddd m5, m14 + paddd m4, m14 + psrad m5, 5 + psrad m4, 5 + packssdw m4, m5 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 + pmaddwd m5, m15 + pmaddwd m6, m15 + paddd m5, m14 + paddd m6, m14 + psrad m5, 5 + psrad m6, 5 + packssdw m5, m6 + pmaxsw m4, m9 + pmaxsw m5, m9 + pminsw m4, m10 + pminsw m5, m10 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m11 + pmaddubsw m3, m11 + paddw m2, m2 + paddw m3, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hb + jz .end_y_hv_overlap + vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + add hd, 0x80000000 + jnc .loop_y_hv_overlap + movq xm7, [pw_27_17_17_27] + jmp .loop_y_h_overlap +.end_y_hv_overlap: + add wq, 32 + lea srcq, [src_bakq+wq*2] + jl .loop_x_hv_overlap +.end: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r12-grain_min + lea r12, [grain_min] + mov r9d, r13m ; bdmax + mov r7d, [fg_dataq+FGData.scaling_shift] + mov r11d, is_idm + mov sbyd, sbym + vpbroadcastw m11, [base+mul_bits+r7*2-12] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + shr r9d, 11 ; is_12bpc + vpbroadcastd m8, [base+grain_min+r9*4] + shlx r10d, r6d, r9d + vpbroadcastd m9, [base+grain_max+r9*4] + vpbroadcastw m10, r13m + shlx r6d, r6d, r11d + vpbroadcastd m12, [base+fg_min+r10*4] + lea r6d, [r9+r6*2] + vpbroadcastd m13, [base+fg_max+r6*4] + test sbyd, sbyd + setnz r7b + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused, sby, see, overlap + +%if %1 + mov r6d, r11m + vpbroadcastd m0, [base+pb_8_9_0_1] + vpbroadcastd m1, [base+uv_offset_mul+r9*4] + vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4] + vpbroadcastd m15, [fg_dataq+FGData.uv_offset+r6*4] + pshufb m14, m0 ; { uv_luma_mult, uv_mult } + pmaddwd m15, m1 +%else +%if %2 + vpbroadcastq m15, [base+pw_23_22] +%else + vpbroadcastq m15, [base+pw_27_17_17_27] +%endif + vpbroadcastd m14, [base+pd_16] +%endif + test r7b, [fg_dataq+FGData.overlap_flag] + jnz %%vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused2, unused3, see, unused4, unused5, unused6, luma, lstride + + mov lumaq, r9mp + mov lstrideq, r10mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r9mp, r10 + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, unused2, unused3, luma, lstride + + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, unused2, unused3, luma, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y: + ; luma_src +%if %2 + mova xm2, [lumaq+lstrideq*0+ 0] + vinserti128 m2, [lumaq+lstrideq*0+32], 1 + mova xm4, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+48], 1 + mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] + vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 + mova xm5, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m2, m4 + phaddw m3, m5 + pxor m4, m4 + pavgw m2, m4 + pavgw m3, m4 +%elif %1 + mova m2, [lumaq+ 0] + mova m3, [lumaq+32] +%endif +%if %1 + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m4, m2, m5, m3 + REPX {paddd x, m15}, m4, m2, m5, m3 + REPX {psrad x, 6 }, m4, m2, m5, m3 + packusdw m2, m4 + packusdw m3, m5 + pminuw m2, m10 + pminuw m3, m10 ; clip_pixel() +%elif %2 + pand m2, m10 + pand m3, m10 +%else + pand m2, m10, [lumaq+ 0] + pand m3, m10, [lumaq+32] +%endif + + ; scaling[luma_src] + vpbroadcastd m7, [pd_m65536] + pandn m4, m7, m2 + mova m6, m7 + vpgatherdd m5, [scalingq+m4-0], m7 + psrld m2, 16 + mova m7, m6 + vpgatherdd m4, [scalingq+m2-2], m6 + pblendw m4, m5, 0x55 + pandn m5, m7, m3 + mova m6, m7 + vpgatherdd m2, [scalingq+m5-0], m7 + psrld m3, 16 + vpgatherdd m5, [scalingq+m3-2], m6 + pblendw m5, m2, 0x55 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m4, [grain_lutq+offxyq*2] +%if %2 + pmulhrsw m5, [grain_lutq+offxyq*2+82*2] +%else + pmulhrsw m5, [grain_lutq+offxyq*2+32] +%endif + + ; dst = clip_pixel(src, noise) +%if %1 + paddw m0, m4 + paddw m1, m5 +%else + paddw m0, m4, [srcq] +%if %2 + paddw m1, m5, [srcq+strideq] +%else + paddw m1, m5, [srcq+32] +%endif +%endif + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + sub hb, 2 +%else + dec hb +%endif + jg %%loop_y + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je %%loop_x + cmp dword r8m, 0 ; sby + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, luma, lstride + + lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, luma, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y_h_overlap: + ; luma_src +%if %2 + mova xm2, [lumaq+lstrideq*0+ 0] + vinserti128 m2, [lumaq+lstrideq*0+32], 1 + mova xm4, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+48], 1 + mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] + vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 + mova xm5, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m2, m4 + phaddw m3, m5 + pxor m4, m4 + pavgw m2, m4 + pavgw m3, m4 +%elif %1 + mova m2, [lumaq] + mova m3, [lumaq+32] +%endif +%if %1 + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m4, m2, m5, m3 + REPX {paddd x, m15}, m4, m2, m5, m3 + REPX {psrad x, 6 }, m4, m2, m5, m3 + packusdw m2, m4 + packusdw m3, m5 + pminuw m2, m10 ; clip_pixel() + pminuw m3, m10 +%elif %2 + pand m2, m10 + pand m3, m10 +%else + pand m2, m10, [lumaq+ 0] + pand m3, m10, [lumaq+32] +%endif + + ; scaling[luma_src] + vpbroadcastd m7, [pd_m65536] + pandn m4, m7, m2 + mova m6, m7 + vpgatherdd m5, [scalingq+m4-0], m7 + psrld m2, 16 + mova m7, m6 + vpgatherdd m4, [scalingq+m2-2], m6 + pblendw m4, m5, 0x55 + pandn m5, m7, m3 + mova m6, m7 + vpgatherdd m2, [scalingq+m5-0], m7 + psrld m3, 16 + vpgatherdd m5, [scalingq+m3-2], m6 + pblendw m5, m2, 0x55 + + ; grain = grain_lut[offy+y][offx+x] + movu m2, [grain_lutq+offxyq*2] +%if %2 + movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif + movd xm6, [grain_lutq+left_offxyq*2] +%if %2 + pinsrw xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} + punpckldq xm7, xm2, xm3 ; {cur0, cur1} + punpcklwd xm6, xm7 ; {left0, cur0, left1, cur1} +%else + punpcklwd xm6, xm2 +%endif +%if %1 +%if %2 + vpbroadcastq xm7, [pw_23_22] +%else + movq xm7, [pw_27_17_17_27] +%endif + pmaddwd xm6, xm7 + vpbroadcastd xm7, [pd_16] + paddd xm6, xm7 +%else + pmaddwd xm6, xm15 + paddd xm6, xm14 +%endif + psrad xm6, 5 + packssdw xm6, xm6 + pmaxsw xm6, xm8 + pminsw xm6, xm9 + vpblendd m2, m6, 0x01 +%if %2 + pshuflw xm6, xm6, q1032 + vpblendd m3, m6, 0x01 +%endif + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) +%if %1 + paddw m0, m2 + paddw m1, m3 +%else + paddw m0, m2, [srcq] +%if %2 + paddw m1, m3, [srcq+strideq] +%else + paddw m1, m3, [srcq+32] +%endif +%endif + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + add srcq, strideq + add dstq, strideq + add lumaq, r10mp +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + sub hb, 2 +%else + dec hb +%endif + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp dword r8m, 0 ; sby + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap + +%%vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, unused1, unused2, unused3, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, top_offxy, unused2, luma, lstride + + mov lumaq, r9mp + mov lstrideq, r10mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r9mp, r10 + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x_v_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, top_offxy, unused2, luma, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +%if %2 == 0 + lea r10, [pw_27_17_17_27] +%endif +%%loop_y_v_overlap: + ; luma_src +%if %2 + mova xm2, [lumaq+lstrideq*0+ 0] + vinserti128 m2, [lumaq+lstrideq*0+32], 1 + mova xm4, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+48], 1 + mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] + vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 + mova xm5, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m2, m4 + phaddw m3, m5 + pxor m4, m4 + pavgw m2, m4 + pavgw m3, m4 +%elif %1 + mova m2, [lumaq] + mova m3, [lumaq+32] +%endif +%if %1 + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m4, m2, m5, m3 + REPX {paddd x, m15}, m4, m2, m5, m3 + REPX {psrad x, 6 }, m4, m2, m5, m3 + packusdw m2, m4 + packusdw m3, m5 + pminuw m2, m10 ; clip_pixel() + pminuw m3, m10 +%elif %2 + pand m2, m10 + pand m3, m10 +%else + pand m2, m10, [lumaq+ 0] + pand m3, m10, [lumaq+32] +%endif + + ; scaling[luma_src] + vpbroadcastd m7, [pd_m65536] + pandn m4, m7, m2 + mova m6, m7 + vpgatherdd m5, [scalingq+m4-0], m7 + psrld m2, 16 + mova m7, m6 + vpgatherdd m4, [scalingq+m2-2], m6 + pblendw m4, m5, 0x55 + pandn m5, m7, m3 + mova m6, m7 + vpgatherdd m2, [scalingq+m5-0], m7 + psrld m3, 16 + vpgatherdd m5, [scalingq+m3-2], m6 + pblendw m5, m2, 0x55 + + ; grain = grain_lut[offy+y][offx+x] + movu m6, [grain_lutq+offxyq*2] + movu m3, [grain_lutq+top_offxyq*2] + punpcklwd m2, m3, m6 + punpckhwd m3, m6 ; { top, cur } +%if %3 + vpbroadcastd m0, [pw_23_22] +%elif %2 + vpbroadcastd m0, [pw_27_17_17_27] +%else + vpbroadcastd m0, [r10] +%endif + REPX {pmaddwd x, m0}, m2, m3 +%if %1 + vpbroadcastd m1, [pd_16] + REPX {paddd x, m1}, m2, m3 +%else + REPX {paddd x, m14}, m2, m3 +%endif + REPX {psrad x, 5}, m2, m3 + packssdw m2, m3 +%if %2 + movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif +%if %3 + pmaxsw m2, m8 + pminsw m2, m9 +%else +%if %2 + movu m7, [grain_lutq+top_offxyq*2+82*2] + punpckhwd m6, m3, m7 ; { cur, top } + punpcklwd m3, m7 +%else + movu m7, [grain_lutq+top_offxyq*2+32] + punpckhwd m6, m7, m3 + punpcklwd m3, m7, m3 ; { top, cur } +%endif + pmaddwd m6, m0 + pmaddwd m3, m0 +%if %1 + paddd m6, m1 + paddd m3, m1 +%else + paddd m6, m14 + paddd m3, m14 +%endif + psrad m6, 5 + psrad m3, 5 + packssdw m3, m6 + pmaxsw m2, m8 + pmaxsw m3, m8 + pminsw m2, m9 + pminsw m3, m9 +%endif + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m2, [srcq] +%if %2 + paddw m1, m3, [srcq+strideq] +%else + paddw m1, m3, [srcq+32] +%endif + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + sub hb, 2 +%else + mova [dstq+32], m1 + dec hb +%endif + jle %%end_y_v_overlap +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + jmp %%loop_y +%else + add hd, 0x80000000 + jc %%loop_y + add r10, 4 + jmp %%loop_y_v_overlap +%endif +%%end_y_v_overlap: + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +%%loop_x_hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + +%if %2 == 0 + lea r14, [pw_27_17_17_27] +%endif + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +%%loop_y_hv_overlap: + ; luma_src +%if %2 + mova xm2, [lumaq+lstrideq*0+ 0] + vinserti128 m2, [lumaq+lstrideq*0+32], 1 + mova xm4, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+48], 1 + mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] + vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 + mova xm5, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m2, m4 + phaddw m3, m5 + pxor m4, m4 + pavgw m2, m4 + pavgw m3, m4 +%elif %1 + mova m2, [lumaq] + mova m3, [lumaq+32] +%endif +%if %1 + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m4, m2, m5, m3 + REPX {paddd x, m15}, m4, m2, m5, m3 + REPX {psrad x, 6 }, m4, m2, m5, m3 + packusdw m2, m4 + packusdw m3, m5 + pminuw m2, m10 ; clip_pixel() + pminuw m3, m10 +%elif %2 + pand m2, m10 + pand m3, m10 +%else + pand m2, m10, [lumaq+ 0] + pand m3, m10, [lumaq+32] +%endif + + ; scaling[luma_src] + vpbroadcastd m7, [pd_m65536] + pandn m4, m7, m2 + mova m6, m7 + vpgatherdd m5, [scalingq+m4-0], m7 + psrld m2, 16 + mova m7, m6 + vpgatherdd m4, [scalingq+m2-2], m6 + pblendw m4, m5, 0x55 + pandn m5, m7, m3 + mova m6, m7 + vpgatherdd m2, [scalingq+m5-0], m7 + psrld m3, 16 + vpgatherdd m5, [scalingq+m3-2], m6 + pblendw m5, m2, 0x55 + + ; grain = grain_lut[offy+y][offx+x] + movu m0, [grain_lutq+offxyq*2] + movd xm2, [grain_lutq+left_offxyq*2] + movu m6, [grain_lutq+top_offxyq*2] +%if %2 + pinsrw xm2, [grain_lutq+left_offxyq*2+82*2], 2 + movu m3, [grain_lutq+offxyq*2+82*2] + punpckldq xm1, xm0, xm3 ; { cur0, cur1 } +%if %3 + vinserti128 m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } + vinserti128 m1, [grain_lutq+top_offxyq*2], 1 ; { cur0, cur1, top0 } +%else + vinserti128 m2, [grain_lutq+topleft_offxyq*2+82*2], 1 + vpbroadcastd m7, [grain_lutq+topleft_offxyq*2] + vpblendd m2, m7, 0x20 + movd xm7, [grain_lutq+top_offxyq*2+82*2] + punpckldq xm7, xm6 + vinserti128 m1, xm7, 1 + movu m7, [grain_lutq+top_offxyq*2+82*2] +%endif + punpcklwd m2, m1 ; { cur, left } +%if %1 + vpbroadcastq m1, [pw_23_22] + pmaddwd m2, m1 + vpbroadcastd m1, [pd_16] + paddd m2, m1 + psrad m2, 5 + packssdw m2, m2 + vpermq m2, m2, q3120 +%else + pmaddwd m2, m15 + paddd m2, m14 + psrad m2, 5 + vextracti128 xm1, m2, 1 + packssdw xm2, xm1 +%endif +%else + pinsrd xm2, [grain_lutq+topleft_offxyq*2], 1 + movu m3, [grain_lutq+offxyq*2+32] + movu m7, [grain_lutq+top_offxyq*2+32] + punpckldq xm1, xm0, xm6 + punpcklwd xm2, xm1 ; { cur, left } +%if %1 + movddup xm1, [pw_27_17_17_27] + pmaddwd xm2, xm1 + vpbroadcastd m1, [pd_16] + paddd xm2, xm1 +%else + pmaddwd xm2, xm15 + paddd xm2, xm14 +%endif + psrad xm2, 5 + packssdw xm2, xm2 +%endif + pmaxsw xm2, xm8 + pminsw xm2, xm9 + vpblendd m0, m2, 0x01 +%if %2 + pshufd xm2, xm2, q0321 + vpblendd m3, m2, 0x01 +%if %3 == 0 + pshufd xm2, xm2, q0321 + vpblendd m7, m2, 0x01 +%endif +%endif + pshuflw xm2, xm2, q1032 + vpblendd m2, m6, 0xfe + punpckhwd m6, m0 ; { top, cur } + punpcklwd m2, m0 +%if %3 + vpbroadcastd m0, [pw_23_22] +%elif %2 + vpbroadcastd m0, [pw_27_17_17_27] +%else + vpbroadcastd m0, [r14] +%endif + pmaddwd m6, m0 + pmaddwd m2, m0 +%if %1 + paddd m6, m1 + paddd m2, m1 +%else + paddd m6, m14 + paddd m2, m14 +%endif + psrad m6, 5 + psrad m2, 5 + packssdw m2, m6 + +%if %3 + pmaxsw m2, m8 + pminsw m2, m9 +%else +%if %2 + punpckhwd m6, m3, m7 + punpcklwd m3, m7 ; { cur, top } +%else + punpckhwd m6, m7, m3 + punpcklwd m3, m7, m3 ; { top, cur } +%endif + REPX {pmaddwd x, m0}, m6, m3 +%if %1 + REPX {paddd x, m1}, m6, m3 +%else + REPX {paddd x, m14}, m6, m3 +%endif + REPX {psrad x, 5}, m6, m3 + packssdw m3, m6 + pmaxsw m2, m8 + pmaxsw m3, m8 + pminsw m2, m9 + pminsw m3, m9 +%endif + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m2, [srcq] +%if %2 + paddw m1, m3, [srcq+strideq] +%else + paddw m1, m3, [srcq+32] +%endif + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + add srcq, strideq + add dstq, strideq + add lumaq, r10mp +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + sub hb, 2 + jg %%loop_y_h_overlap +%else + dec hb + jle %%end_y_hv_overlap + add hd, 0x80000000 + jc %%loop_y_h_overlap + add r14, 4 + jmp %%loop_y_hv_overlap +%endif +%%end_y_hv_overlap: + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + jmp %%loop_x_hv_overlap +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +.end: + RET +%endmacro + +GEN_GRAIN_UV_FN 420, 1, 1 +FGUV_FN 420, 1, 1 +GEN_GRAIN_UV_FN 422, 1, 0 +FGUV_FN 422, 1, 0 +GEN_GRAIN_UV_FN 444, 0, 0 +FGUV_FN 444, 0, 0 + +%endif ; ARCH_X86_64 |