; Copyright © 2019-2022, VideoLAN and dav1d authors
; Copyright © 2019-2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64

SECTION_RODATA 32
pb_mask:        db  0,128,128,  0,128,  0,  0,128,128,  0,  0,128,  0,128,128,  0
gen_shufE:      db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
gen_shufA:      db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
gen_shufB:      db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
gen_shufC:      db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
gen_shufD:      db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
; note: the order of (some of) the following constants matters
pb_27_17:       times 2 db 27, 17
byte_blend:     db  0,  0,  0, -1
pb_27_17_17_27: db 27, 17, 17, 27,  0, 32,  0, 32
pb_17_27:       times 2 db 17, 27
pb_1:           times 4 db 1
pb_23_22:       db 23, 22,  0, 32,  0, 32,  0, 32
next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
pw_seed_xor:    times 2 dw 0xb524
                times 2 dw 0x49d8
fg_min:         times 4 db 0
                times 4 db 16
fg_max:         times 4 db 255
                times 4 db 240
                times 4 db 235
pd_m65536:      dd -65536
pw_8:           times 2 dw 8
pw_1024:        times 2 dw 1024
hmul_bits:      dw 32768, 16384, 8192, 4096
round:          dw 2048, 1024, 512
mul_bits:       dw 256, 128, 64, 32, 16
round_vals:     dw 32, 64, 128, 256, 512
pw_1:           dw 1

%macro JMP_TABLE 2-*
    %1_8bpc_%2_table:
    %xdefine %%base %1_8bpc_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %rep %0 - 2
        dd %%prefix %+ .ar%3 - %%base
        %rotate 1
    %endrep
%endmacro

JMP_TABLE generate_grain_y,      avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
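
; The grain generators below vectorize the 16-bit LFSR from the AV1
; specification (get_random_number()); the taps at bits 0, 1, 3 and 12
; correspond to the 0x100B/0x2016/0x402C/0x8058 masks above, which step the
; generator 1-4 times at once so four seeds advance in parallel. A C sketch
; of the scalar reference (illustrative only, names are not part of this file):
;
;   static unsigned get_random_number(const int bits, unsigned *const seed) {
;       unsigned r = *seed;
;       unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
;       r = (r >> 1) | (bit << 15);          // 16-bit LFSR step
;       *seed = r;
;       return (r >> (16 - bits)) & ((1 << bits) - 1);   // top 'bits' bits
;   }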
SECTION .text

INIT_YMM avx2
cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
%define base r4-generate_grain_y_8bpc_avx2_table
    lea             r4, [generate_grain_y_8bpc_avx2_table]
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    movsxd          r5, [fg_dataq+FGData.ar_coeff_lag]
    movq            xm4, [base+mul_bits]
    movq            xm5, [base+hmul_bits]
    mov             r7, -73*82
    mova            xm6, [base+pb_mask]
    sub             bufq, r7
    vpbroadcastw    xm7, [base+round+r6*2]
    lea             r6, [gaussian_sequence]
    movsxd          r5, [r4+r5*4]
.loop:
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4            ; bits 0x0f00 are set
    pmulhuw         xm0, xm5
    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm2, xm0            ; aggregate each bit into next seed's high bit
    por             xm3, xm2            ; 4 next output seeds
    pshuflw         xm0, xm3, q3333
    psrlw           xm3, 5
    pand            xm2, xm0, xm1
    movq            r2, xm3
    psrlw           xm3, xm2, 10
    por             xm2, xm3
    pmullw          xm2, xm4
    pmulhuw         xm0, xm5
    movzx           r3d, r2w
    pshufb          xm3, xm6, xm2
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm0, xm2
    movd            xm2, [r6+r3*2]
    rorx            r3, r2, 32
    por             xm3, xm0
    shr             r2d, 16
    pinsrw          xm2, [r6+r2*2], 1
    pshuflw         xm0, xm3, q3333
    movzx           r2d, r3w
    psrlw           xm3, 5
    pinsrw          xm2, [r6+r2*2], 2
    shr             r3d, 16
    movq            r2, xm3
    pinsrw          xm2, [r6+r3*2], 3
    movzx           r3d, r2w
    pinsrw          xm2, [r6+r3*2], 4
    rorx            r3, r2, 32
    shr             r2d, 16
    pinsrw          xm2, [r6+r2*2], 5
    movzx           r2d, r3w
    pinsrw          xm2, [r6+r2*2], 6
    shr             r3d, 16
    pinsrw          xm2, [r6+r3*2], 7
    pmulhrsw        xm2, xm7
    packsswb        xm2, xm2
    movq            [bufq+r7], xm2
    add             r7, 8
    jl .loop

; auto-regression code
    add             r5, r4
    jmp             r5
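
; r5 now holds one of .ar0-.ar3 from the jump table; each target applies a
; lag-N auto-regressive filter over the 82x73 grain buffer. The inner
; *_inner loops are serialized 4 pixels at a time because each output feeds
; the next pixel's "left" input. A scalar sketch of what the SIMD computes
; (illustrative names; round2/iclip as in the AV1 spec):
;
;   for (int y = 3; y < 73; y++)
;       for (int x = 3; x < 3 + 76; x++) {
;           int sum = 0, n = 0;
;           for (int dy = -lag; dy <= 0; dy++)
;               for (int dx = -lag; dx <= lag; dx++) {
;                   if (!dy && !dx) break;   // only past samples contribute
;                   sum += coeffs[n++] * buf[y + dy][x + dx];
;               }
;           int g = buf[y][x] + round2(sum, ar_coeff_shift);
;           buf[y][x] = iclip(g, -128, 127); // grain_min/grain_max for 8 bpc
;       }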
.ar1:
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx           cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd            xm5, [fg_dataq+FGData.ar_coeffs_y]
    mova            xm2, [base+gen_shufC]
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
    pinsrb          xm5, [base+pb_1], 3
    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]   ; rnd
    pmovsxbw        xm5, xm5
    pshufd          xm4, xm5, q0000
    pshufd          xm5, xm5, q1111
    sub             bufq, 82*73-(82*3+79)
    mov             hd, 70
    mov             mind, -128
    mov             maxd, 127
.y_loop_ar1:
    mov             xq, -76
    movsx           val3d, byte [bufq+xq-1]
.x_loop_ar1:
    pmovsxbw        xm1, [bufq+xq-82-3]
    pshufb          xm0, xm1, xm2
    punpckhwd       xm1, xm3
    pmaddwd         xm0, xm4
    pmaddwd         xm1, xm5
    paddd           xm0, xm1
.x_loop_ar1_inner:
    movd            val0d, xm0
    psrldq          xm0, 4
    imul            val3d, cf3d
    add             val3d, val0d
    movsx           val0d, byte [bufq+xq]
    sarx            val3d, val3d, shiftd
    add             val3d, val0d
    cmp             val3d, maxd
    cmovns          val3d, maxd
    cmp             val3d, mind
    cmovs           val3d, mind
    mov             [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc             xq
    jz .x_loop_ar1_end
    test            xb, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add             bufq, 82
    dec             hd
    jg .y_loop_ar1
.ar0:
    RET

.ar2:
%if WIN64
    %assign stack_size_padded 168
    SUB             rsp, stack_size_padded
    WIN64_PUSH_XMM  16, 8
%endif
    DEFINE_ARGS buf, fg_data, h, x
    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
    movd            xm9, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
    vpbroadcastd    xm10, [base+round_vals-14+r6*2]
    movd            xm11, [base+byte_blend+1]
    pmovsxbw        xm9, xm9
    pshufd          xm4, xm7, q0000
    mova            xm12, [base+gen_shufA]
    pshufd          xm5, xm7, q3333
    mova            xm13, [base+gen_shufB]
    pshufd          xm6, xm7, q1111
    mova            xm14, [base+gen_shufC]
    pshufd          xm7, xm7, q2222
    mova            xm15, [base+gen_shufD]
    pshufd          xm8, xm9, q0000
    psrld           xm10, 16
    pshufd          xm9, xm9, q1111
    sub             bufq, 82*73-(82*3+79)
    mov             hd, 70
.y_loop_ar2:
    mov             xq, -76
.x_loop_ar2:
    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    pshufb          xm2, xm0, xm12
    pmaddwd         xm2, xm4
    pshufb          xm3, xm1, xm13
    pmaddwd         xm3, xm5
    paddd           xm2, xm3
    pshufb          xm3, xm0, xm14
    pmaddwd         xm3, xm6
    punpckhqdq      xm0, xm0
    punpcklwd       xm0, xm1
    pmaddwd         xm0, xm7
    pshufb          xm1, xm15
    pmaddwd         xm1, xm8
    paddd           xm2, xm10
    paddd           xm2, xm3
    paddd           xm0, xm1
    paddd           xm2, xm0
    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    pmovsxbw        xm1, xm0
    pmaddwd         xm3, xm9, xm1
    psrldq          xm1, 4                  ; y=0,x=0
    paddd           xm3, xm2
    psrldq          xm2, 4                  ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw           xm3, xm1
    packsswb        xm3, xm3
    pextrb          [bufq+xq], xm3, 0
    pslldq          xm3, 2
    vpblendvb       xm0, xm3, xm11
    psrldq          xm0, 1
    inc             xq
    jz .x_loop_ar2_end
    test            xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add             bufq, 82
    dec             hd
    jg .y_loop_ar2
    RET

INIT_YMM avx2
.ar3:
%if WIN64
    ALLOC_STACK     16*14
    %assign stack_size stack_size - 16*4
    WIN64_PUSH_XMM  12, 8
%else
    ALLOC_STACK     16*12
%endif
    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    movd            xm11, [base+byte_blend]
    pmovsxbw        m1, [fg_dataq+FGData.ar_coeffs_y+ 0]    ; cf0-15
    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
    pshufd          m0, m1, q0000
    mova            [rsp+16* 0], m0
    pshufd          m0, m1, q1111
    mova            [rsp+16* 2], m0
    pshufd          m0, m1, q2222
    mova            [rsp+16* 4], m0
    pshufd          m1, m1, q3333
    mova            [rsp+16* 6], m1
    pshufd          xm0, xm2, q0000
    mova            [rsp+16* 8], xm0
    pshufd          xm0, xm2, q1111
    mova            [rsp+16* 9], xm0
    psrldq          xm7, xm2, 10
    mova            m8, [base+gen_shufA]
    pinsrw          xm2, [base+pw_1], 5
    mova            m9, [base+gen_shufC]
    pshufd          xm2, xm2, q2222
    movu            m10, [base+gen_shufE]
    vpbroadcastw    xm6, [base+round_vals-12+r6*2]
    pinsrw          xm7, [base+round_vals+r6*2-10], 3
    mova            [rsp+16*10], xm2
    DEFINE_ARGS buf, fg_data, h, x
    sub             bufq, 82*73-(82*3+79)
    mov             hd, 70
.y_loop_ar3:
    mov             xq, -76
.x_loop_ar3:
    movu            xm5, [bufq+xq-82*3-3]       ; y=-3,x=[-3,+12]
    vinserti128     m5, [bufq+xq-82*2-3], 1     ; y=-2,x=[-3,+12]
    movu            xm4, [bufq+xq-82*1-3]       ; y=-1,x=[-3,+12]
    punpcklbw       m3, m5, m5
    punpckhwd       m5, m4
    psraw           m3, 8
    punpcklbw       m5, m5
    psraw           m5, 8
    punpcklbw       xm4, xm4
    psraw           xm4, 8
    pshufb          m0, m3, m8
    pmaddwd         m0, [rsp+16*0]
    pshufb          m1, m3, m9
    pmaddwd         m1, [rsp+16*2]
    shufps          m2, m3, m5, q1032
    paddd           m0, m1
    pshufb          m1, m2, m8
    vperm2i128      m3, m4, 0x21
    pmaddwd         m1, [rsp+16*4]
    shufps          xm2, xm3, q1021
    vpblendd        m2, m3, 0xf0
    pshufb          m2, m10
    paddd           m0, m1
    pmaddwd         m2, [rsp+16*6]
    pshufb          xm1, xm4, xm9
    pmaddwd         xm1, [rsp+16*8]
    shufps          xm4, xm5, q1132
    paddd           m0, m2
    pshufb          xm2, xm4, xm8
    pshufd          xm4, xm4, q2121
    pmaddwd         xm2, [rsp+16*9]
    punpcklwd       xm4, xm6
    pmaddwd         xm4, [rsp+16*10]
    vextracti128    xm3, m0, 1
    paddd           xm0, xm1
    movq            xm1, [bufq+xq-3]            ; y=0,x=[-3,+4]
    paddd           xm2, xm4
    paddd           xm0, xm2
    paddd           xm0, xm3
.x_loop_ar3_inner:
    pmovsxbw        xm2, xm1
    pmaddwd         xm2, xm7
    pshufd          xm3, xm2, q1111
    paddd           xm2, xm0                    ; add top
    paddd           xm2, xm3                    ; left+cur
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb        xm2, xm2
    pextrb          [bufq+xq], xm2, 0
    pslldq          xm2, 3
    vpblendvb       xm1, xm2, xm11
    psrldq          xm1, 1
    inc             xq
    jz .x_loop_ar3_end
    test            xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add             bufq, 82
    dec             hd
    jg .y_loop_ar3
    RET

%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
INIT_XMM avx2
cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
%define base r4-generate_grain_uv_%1_8bpc_avx2_table
    lea             r4, [generate_grain_uv_%1_8bpc_avx2_table]
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    movq            xm4, [base+mul_bits]
    movq            xm5, [base+hmul_bits]
    mova            xm6, [base+pb_mask]
    vpbroadcastw    xm7, [base+round+r6*2]
    vpbroadcastd    xm2, [base+pw_seed_xor+uvq*4]
    pxor            xm0, xm2
    lea             r6, [gaussian_sequence]
%if %2
    mov             r7d, 73-35*%3
    add             bufq, 44
.loop_y:
    mov             r5, -44
%else
    mov             r5, -73*82
    sub             bufq, r5
%endif
.loop:
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4            ; bits 0x0f00 are set
    pmulhuw         xm0, xm5
    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm2, xm0            ; aggregate each bit into next seed's high bit
    por             xm2, xm3            ; 4 next output seeds
    pshuflw         xm0, xm2, q3333
    psrlw           xm2, 5
    movq            r8, xm2
    movzx           r9d, r8w
    movd            xm2, [r6+r9*2]
    rorx            r9, r8, 32
    shr             r8d, 16
    pinsrw          xm2, [r6+r8*2], 1
    movzx           r8d, r9w
    pinsrw          xm2, [r6+r8*2], 2
    shr             r9d, 16
    pinsrw          xm2, [r6+r9*2], 3
    pmulhrsw        xm2, xm7
    packsswb        xm2, xm2
    movd            [bufq+r5], xm2
    add             r5, 4
    jl .loop
%if %2
    add             bufq, 82
    dec             r7d
    jg .loop_y
%endif

; auto-regression code
    movsxd          r6, [fg_dataq+FGData.ar_coeff_lag]
    movsxd          r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4]
    add             r6, r4
    jmp             r6

INIT_YMM avx2
.ar0:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    imul            uvd, 28
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd            xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    movd            xm3, [base+hmul_bits+shiftq*2]
    DEFINE_ARGS buf, bufy, h
    pmovsxbw        xm2, xm2
%if %2
    vpbroadcastd    m7, [base+pb_1]
    vpbroadcastw    m6, [base+hmul_bits+2+%3*2]
%endif
    vpbroadcastw    m2, xm2
    vpbroadcastw    m3, xm3
    pxor            m12, m12
%if %2
    sub             bufq, 82*(73-35*%3)+82-(82*3+41)
%else
    sub             bufq, 82*70-3
%endif
    add             bufyq, 3+82*3
    mov             hd, 70-35*%3
.y_loop_ar0:
%if %2
    ; first 32 pixels
    movu            xm4, [bufyq]
    vinserti128     m4, [bufyq+32], 1
%if %3
    movu            xm0, [bufyq+82]
    vinserti128     m0, [bufyq+82+32], 1
%endif
    movu            xm5, [bufyq+16]
    vinserti128     m5, [bufyq+48], 1
%if %3
    movu            xm1, [bufyq+82+16]
    vinserti128     m1, [bufyq+82+48], 1
%endif
    pmaddubsw       m4, m7, m4
%if %3
    pmaddubsw       m0, m7, m0
%endif
    pmaddubsw       m5, m7, m5
%if %3
    pmaddubsw       m1, m7, m1
    paddw           m4, m0
    paddw           m5, m1
%endif
    pmulhrsw        m4, m6
    pmulhrsw        m5, m6
%else
    xor             r3d, r3d
    ; first 32x2 pixels
.x_loop_ar0:
    movu            m4, [bufyq+r3]
    pcmpgtb         m0, m12, m4
    punpckhbw       m5, m4, m0
    punpcklbw       m4, m0
%endif
    pmullw          m4, m2
    pmullw          m5, m2
    pmulhrsw        m4, m3
    pmulhrsw        m5, m3
%if %2
    movu            m1, [bufq]
%else
    movu            m1, [bufq+r3]
%endif
    pcmpgtb         m8, m12, m1
    punpcklbw       m0, m1, m8
    punpckhbw       m1, m8
    paddw           m0, m4
    paddw           m1, m5
    packsswb        m0, m1
%if %2
    movu            [bufq], m0
%else
    movu            [bufq+r3], m0
    add             r3d, 32
    cmp             r3d, 64
    jl .x_loop_ar0
%endif

    ; last 6/12 pixels
    movu            xm4, [bufyq+32*2]
%if %2
%if %3
    movu            xm5, [bufyq+32*2+82]
%endif
    pmaddubsw       xm4, xm7, xm4
%if %3
    pmaddubsw       xm5, xm7, xm5
    paddw           xm4, xm5
%endif
    movq            xm0, [bufq+32]
    pmulhrsw        xm4, xm6
    pmullw          xm4, xm2
    pmulhrsw        xm4, xm3
    pcmpgtb         xm5, xm12, xm0
    punpcklbw       xm5, xm0, xm5
    paddw           xm4, xm5
    packsswb        xm4, xm4
    pblendw         xm0, xm4, xm0, 1000b
    movq            [bufq+32], xm0
%else
    movu            xm0, [bufq+64]
    pcmpgtb         xm1, xm12, xm4
    punpckhbw       xm5, xm4, xm1
    punpcklbw       xm4, xm1
    pmullw          xm5, xm2
    pmullw          xm4, xm2
    vpblendd        xm1, xm3, xm12, 0x0c
    pmulhrsw        xm5, xm1
    pmulhrsw        xm4, xm3
    pcmpgtb         xm1, xm12, xm0
    punpckhbw       xm8, xm0, xm1
    punpcklbw       xm0, xm1
    paddw           xm5, xm8
    paddw           xm0, xm4
    packsswb        xm0, xm5
    movu            [bufq+64], xm0
%endif
    add             bufq, 82
    add             bufyq, 82<<%3
    dec             hd
    jg .y_loop_ar0
    RET
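
; For the subsampled layouts the chroma AR filters below mix in a collocated
; luma term: luma is box-averaged onto the chroma grid (pmaddubsw against
; pb_1 sums horizontal byte pairs; the pmulhrsw against hmul_bits+2+%3*2
; divides the 2- or 4-sample sums back down with rounding) before being
; weighted by the extra per-plane coefficient.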
INIT_XMM avx2
.ar1:
    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
    imul            uvd, 28
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx           cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
    pmovsxbw        xm4, xm4
    pshufd          xm5, xm4, q1111
    pshufd          xm4, xm4, q0000
    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]   ; rnd
%if %2
    vpbroadcastd    xm7, [base+pb_1]
    vpbroadcastw    xm6, [base+hmul_bits+2+%3*2]
%endif
    vpbroadcastd    xm3, xm3
%if %2
    sub             bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub             bufq, 82*70-(82-3)
%endif
    add             bufyq, 79+82*3
    mov             hd, 70-35*%3
    mov             mind, -128
    mov             maxd, 127
.y_loop_ar1:
    mov             xq, -(76>>%2)
    movsx           val3d, byte [bufq+xq-1]
.x_loop_ar1:
    pmovsxbw        xm0, [bufq+xq-82-1]   ; top/left
%if %2
    movq            xm8, [bufyq+xq*2]
%if %3
    movq            xm9, [bufyq+xq*2+82]
%endif
%endif
    psrldq          xm2, xm0, 2           ; top
    psrldq          xm1, xm0, 4           ; top/right
%if %2
    pmaddubsw       xm8, xm7, xm8
%if %3
    pmaddubsw       xm9, xm7, xm9
    paddw           xm8, xm9
%endif
    pmulhrsw        xm8, xm6
%else
    pmovsxbw        xm8, [bufyq+xq]
%endif
    punpcklwd       xm0, xm2
    punpcklwd       xm1, xm8
    pmaddwd         xm0, xm4
    pmaddwd         xm1, xm5
    paddd           xm0, xm1
    paddd           xm0, xm3
.x_loop_ar1_inner:
    movd            val0d, xm0
    psrldq          xm0, 4
    imul            val3d, cf3d
    add             val3d, val0d
    sarx            val3d, val3d, shiftd
    movsx           val0d, byte [bufq+xq]
    add             val3d, val0d
    cmp             val3d, maxd
    cmovns          val3d, maxd
    cmp             val3d, mind
    cmovs           val3d, mind
    mov             byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc             xq
    jz .x_loop_ar1_end
    test            xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add             bufq, 82
    add             bufyq, 82<<%3
    dec             hd
    jg .y_loop_ar1
    RET

.ar2:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    vpbroadcastw    xm13, [base+round_vals-12+shiftq*2]
    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
    pinsrw          xm0, [base+pw_1], 5
%if %2
    vpbroadcastw    xm12, [base+hmul_bits+2+%3*2]
    vpbroadcastd    xm11, [base+pb_1]
%endif
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    pshufd          xm4, xm7, q0000
    pshufd          xm5, xm7, q3333
    pshufd          xm6, xm7, q1111
    pshufd          xm7, xm7, q2222
    pshufd          xm8, xm0, q0000
    pshufd          xm9, xm0, q1111
    pshufd          xm10, xm0, q2222
%if %2
    sub             bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub             bufq, 82*70-(82-3)
%endif
    add             bufyq, 79+82*3
    mov             hd, 70-35*%3
.y_loop_ar2:
    mov             xq, -(76>>%2)
.x_loop_ar2:
    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    pshufb          xm2, xm0, [base+gen_shufA]
    pmaddwd         xm2, xm4
    pshufb          xm3, xm1, [base+gen_shufB]
    pmaddwd         xm3, xm5
    paddd           xm2, xm3
    pshufb          xm3, xm0, [base+gen_shufC]
    pmaddwd         xm3, xm6
    punpckhqdq      xm0, xm0                ; y=-2,x=[+2,+5]
    punpcklwd       xm0, xm1
    pmaddwd         xm0, xm7
    pshufb          xm1, [gen_shufD]
    pmaddwd         xm1, xm8
    paddd           xm2, xm3
    paddd           xm0, xm1
    paddd           xm2, xm0
%if %2
    movq            xm0, [bufyq+xq*2]
%if %3
    movq            xm3, [bufyq+xq*2+82]
%endif
    pmaddubsw       xm0, xm11, xm0
%if %3
    pmaddubsw       xm3, xm11, xm3
    paddw           xm0, xm3
%endif
    pmulhrsw        xm0, xm12
%else
    pmovsxbw        xm0, [bufyq+xq]
%endif
    punpcklwd       xm0, xm13
    pmaddwd         xm0, xm10
    paddd           xm2, xm0
    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    pmovsxbw        xm0, xm0
    pmaddwd         xm3, xm0, xm9
    psrldq          xm0, 2
    paddd           xm3, xm2
    psrldq          xm2, 4                  ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    pslldq          xm3, 2
    paddw           xm3, xm0
    pblendw         xm0, xm3, 00000010b
    packsswb        xm0, xm0
    pextrb          [bufq+xq], xm0, 1
    inc             xq
    jz .x_loop_ar2_end
    test            xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add             bufq, 82
    add             bufyq, 82<<%3
    dec             hd
    jg .y_loop_ar2
    RET
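
; Lag-3 chroma: 24 spatial coefficients (rows y=-3..-1 over x=[-3,+3] plus
; x=[-3,-1] on the current row) plus cf24 applied to the downsampled luma
; sample. The row taps are gathered with the gen_shuf* patterns so that each
; pmaddwd covers two taps per lane.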
INIT_YMM avx2
.ar3:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    pmovsxbw        m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-15
    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]  ; cf16-23
    vpbroadcastb    xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24]  ; cf24 [luma]
    movd            xm13, [base+round_vals-10+shiftq*2]
    vpbroadcastd    xm14, [base+round_vals-14+shiftq*2]
    pshufd          m6, m0, q0000
    pshufd          m7, m0, q1111
    pshufd          m8, m0, q2222
    pshufd          m9, m0, q3333
    pshufd          xm10, xm1, q0000
    pshufd          xm11, xm1, q1111
    pshufhw         xm12, xm1, q0000
    psraw           xm2, 8
    palignr         xm13, xm1, 10
    punpckhwd       xm12, xm2           ; interleave luma cf
    psrld           xm14, 16
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
    vpbroadcastw    xm15, [base+hmul_bits+2+%3*2]
    sub             bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub             bufq, 82*70-(82-3)
%endif
    add             bufyq, 79+82*3
    mov             hd, 70-35*%3
.y_loop_ar3:
    mov             xq, -(76>>%2)
.x_loop_ar3:
    vbroadcasti128  m3, [bufq+xq-82*2-3]            ; y=-2,x=[-3,+12]
    palignr         xm1, xm3, [bufq+xq-82*3-9], 6   ; y=-3,x=[-3,+12]
    vbroadcasti128  m4, [bufq+xq-82*1-3]            ; y=-1,x=[-3,+12]
    vpblendd        m3, m1, 0x0f
    pxor            m0, m0
    pcmpgtb         m2, m0, m3
    pcmpgtb         m0, m4
    punpcklbw       m1, m3, m2
    punpckhbw       m3, m2
    punpcklbw       m2, m4, m0
    punpckhbw       xm4, xm0
    pshufb          m0, m1, [base+gen_shufA]
    pmaddwd         m0, m6
    pshufb          m5, m1, [base+gen_shufC]
    pmaddwd         m5, m7
    shufps          m1, m3, q1032
    paddd           m0, m5
    pshufb          m5, m1, [base+gen_shufA]
    pmaddwd         m5, m8
    shufps          xm1, xm3, q2121
    vpblendd        m1, m2, 0xf0
    pshufb          m1, [base+gen_shufE]
    pmaddwd         m1, m9
    paddd           m0, m5
    pshufb          xm3, xm2, [base+gen_shufC]
    paddd           m0, m1
    pmaddwd         xm3, xm10
    palignr         xm1, xm4, xm2, 2
    punpckhwd       xm1, xm2, xm1
    pmaddwd         xm1, xm11
    palignr         xm4, xm2, 12
    paddd           xm3, xm1
%if %2
    vpbroadcastd    xm5, [base+pb_1]
    movq            xm1, [bufyq+xq*2]
    pmaddubsw       xm1, xm5, xm1
%if %3
    movq            xm2, [bufyq+xq*2+82]
    pmaddubsw       xm5, xm2
    paddw           xm1, xm5
%endif
    pmulhrsw        xm1, xm15
%else
    pmovsxbw        xm1, [bufyq+xq]
%endif
    punpcklwd       xm4, xm1
    pmaddwd         xm4, xm12
    movq            xm1, [bufq+xq-3]                ; y=0,x=[-3,+4]
    vextracti128    xm2, m0, 1
    paddd           xm0, xm14
    paddd           xm3, xm4
    paddd           xm0, xm3
    paddd           xm0, xm2
.x_loop_ar3_inner:
    pmovsxbw        xm1, xm1
    pmaddwd         xm2, xm13, xm1
    pshuflw         xm3, xm2, q1032
    paddd           xm2, xm0            ; add top
    paddd           xm2, xm3            ; left+cur
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    psrldq          xm1, 2
    ; don't packssdw, we only care about one value
    punpckldq       xm2, xm2
    pblendw         xm1, xm2, 0100b
    packsswb        xm1, xm1
    pextrb          [bufq+xq], xm1, 2
    inc             xq
    jz .x_loop_ar3_end
    test            xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add             bufq, 82
    add             bufyq, 82<<%3
    dec             hd
    jg .y_loop_ar3
    RET
%endmacro
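
; Grain application for luma. Per 32-pixel chunk the kernel computes, as in
; the AV1 spec (C sketch; round2/iclip as usual, names illustrative):
;
;   noise  = round2(scaling[src[x]] * grain[offy + y][offx + x], scaling_shift);
;   dst[x] = iclip(src[x] + noise, fg_min, fg_max);
;
; The scaling[] lookups use vpgatherdd: even and odd bytes of each lane are
; fetched in two gathers (the -0/-2 address bias aligns the wanted byte) and
; merged with pblendw 0xaa. The gather mask register m8 (pd_m65536) is
; cleared by vpgatherdd, hence the mova m6/m8 save/restore around each pair.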
INIT_YMM avx2
cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, h, sby, see, overlap
%define base r9-pd_m65536
    lea             r9, [pd_m65536]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             sbyd, sbym
    mov             overlapd, [fg_dataq+FGData.overlap_flag]
    vpbroadcastd    m8, [base+pd_m65536]
    vpbroadcastw    m9, [base+mul_bits+r6*2-14]
    vpbroadcastd    m10, [base+fg_min+r7*4]
    vpbroadcastd    m11, [base+fg_max+r7*8]
    vpbroadcastd    m12, [base+pw_1024]
    movq            xm13, [base+pb_27_17_17_27]
    test            sbyd, sbyd
    setnz           r7b
    pxor            m7, m7
    test            r7b, overlapb
    jnz .vertical_overlap

    imul            seed, sbyd, (173 << 24) | 37
    add             seed, (105 << 24) | 178
    rorx            seed, seed, 24
    movzx           seed, seew
    xor             seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap
    lea             src_bakq, [srcq+wq]
    neg             wq
    sub             dstq, srcq
.loop_x:
    rorx            r6, seeq, 1
    or              seed, 0xEFF4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d               ; updated seed
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164
    lea             offyd, [offyq+offxq*2+747]  ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap
    mov             hd, hm
    mov             grain_lutq, grain_lutmp
.loop_y:
    ; src
    mova            m2, [srcq]
    punpcklbw       m0, m2, m7
    punpckhbw       m1, m2, m7

    ; scaling[src]
    pandn           m4, m8, m0
    mova            m6, m8
    vpgatherdd      m2, [scalingq+m4-0], m8
    psrld           m3, m0, 16
    mova            m8, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pandn           m5, m8, m1
    mova            m6, m8
    vpgatherdd      m3, [scalingq+m5-0], m8
    pblendw         m2, m4, 0xaa
    psrld           m4, m1, 16
    mova            m8, m6
    vpgatherdd      m5, [scalingq+m4-2], m6
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    movu            m5, [grain_lutq+offxyq]
    punpcklbw       m4, m5, m7
    punpckhbw       m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m5
    pmulhrsw        m2, m9
    pmulhrsw        m3, m9

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    pmaxub          m0, m10
    pminub          m0, m11
    mova            [dstq+srcq], m0

    add             srcq, strideq
    add             grain_lutq, 82
    dec             hd
    jg .loop_y

    add             wq, 32
    jge .end
    lea             srcq, [src_bakq+wq]
    test            overlapd, overlapd
    jz .loop_x

    ; r8m = sbym
    cmp             dword r8m, 0
    jne .loop_x_hv_overlap

; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
    rorx            r6, seeq, 1
    or              seed, 0xEFF4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy
    lea             left_offxyd, [offyq+32]     ; previous column's offy*stride+offx
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164
    lea             offyd, [offyq+offxq*2+747]  ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy
    mov             grain_lutq, grain_lutmp
    mov             hd, hm
.loop_y_h_overlap:
    ; src
    mova            m2, [srcq]
    punpcklbw       m0, m2, m7
    punpckhbw       m1, m2, m7

    ; scaling[src]
    pandn           m4, m8, m0
    mova            m6, m8
    vpgatherdd      m2, [scalingq+m4-0], m8
    psrld           m3, m0, 16
    mova            m8, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pandn           m5, m8, m1
    mova            m6, m8
    vpgatherdd      m3, [scalingq+m5-0], m8
    pblendw         m2, m4, 0xaa
    psrld           m4, m1, 16
    mova            m8, m6
    vpgatherdd      m5, [scalingq+m4-2], m6
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    movu            m5, [grain_lutq+offxyq]
    movd            xm4, [grain_lutq+left_offxyq]
    punpcklbw       xm4, xm5
    pmaddubsw       xm4, xm13, xm4
    pmulhrsw        xm4, xm12
    packsswb        xm4, xm4
    vpblendd        m4, m5, 0xfe
    punpckhbw       m5, m7
    punpcklbw       m4, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m5
    pmulhrsw        m2, m9
    pmulhrsw        m3, m9

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    pmaxub          m0, m10
    pminub          m0, m11
    mova            [dstq+srcq], m0

    add             srcq, strideq
    add             grain_lutq, 82
    dec             hd
    jg .loop_y_h_overlap

    add             wq, 32
    jge .end
    lea             srcq, [src_bakq+wq]

    ; r8m = sbym
    cmp             dword r8m, 0
    jne .loop_x_hv_overlap
    jmp .loop_x_h_overlap
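
; Block overlap: the first two rows and/or columns of each 32x32 block are
; cross-faded with the neighbouring block's grain, old*27 + new*17 for the
; outer line and old*17 + new*27 for the inner one (23/22 where chroma is
; subsampled). pmaddubsw pairs the two grain bytes with the weight bytes,
; and pmulhrsw against pw_1024 is round2(x, 5), since (x*1024+16384)>>15
; equals (x+16)>>5.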
.vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused, sby, see, overlap
    movzx           sbyd, sbyb
    imul            seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul            sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             sbyd, 0xff00ff00
    xor             seed, r7d
    xor             seed, sbyd              ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap
    lea             src_bakq, [srcq+wq]
    neg             wq
    sub             dstq, srcq
.loop_x_v_overlap:
    vpbroadcastd    m14, [pb_27_17]

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy
    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
.loop_y_v_overlap:
    ; src
    mova            m2, [srcq]
    punpcklbw       m0, m2, m7
    punpckhbw       m1, m2, m7

    ; scaling[src]
    pandn           m4, m8, m0
    mova            m6, m8
    vpgatherdd      m2, [scalingq+m4-0], m8
    psrld           m3, m0, 16
    mova            m8, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pandn           m5, m8, m1
    mova            m6, m8
    vpgatherdd      m3, [scalingq+m5-0], m8
    pblendw         m2, m4, 0xaa
    psrld           m4, m1, 16
    mova            m8, m6
    vpgatherdd      m5, [scalingq+m4-2], m6
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    movu            m6, [grain_lutq+offxyq]
    movu            m4, [grain_lutq+top_offxyq]
    punpcklbw       m5, m4, m6
    punpckhbw       m4, m6
    pmaddubsw       m5, m14, m5
    pmaddubsw       m4, m14, m4
    pmulhrsw        m5, m12
    pmulhrsw        m4, m12
    packsswb        m5, m4
    punpcklbw       m4, m5, m7
    punpckhbw       m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m5
    pmulhrsw        m2, m9
    pmulhrsw        m3, m9

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    pmaxub          m0, m10
    pminub          m0, m11
    mova            [dstq+srcq], m0

    add             srcq, strideq
    add             grain_lutq, 82
    dec             hb
    jz .end_y_v_overlap
    vpbroadcastd    m14, [pb_17_27]     ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    add             hd, 0x80000000
    jnc .loop_y_v_overlap
    jmp .loop_y
.end_y_v_overlap:
    add             wq, 32
    jge .end
    lea             srcq, [src_bakq+wq]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
.loop_x_hv_overlap:
    vpbroadcastd    m14, [pb_27_17]

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy
    lea             topleft_offxyd, [top_offxyq+32]
    lea             left_offxyd, [offyq+32]
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy
    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
.loop_y_hv_overlap:
    ; src
    mova            m2, [srcq]
    punpcklbw       m0, m2, m7
    punpckhbw       m1, m2, m7

    ; scaling[src]
    pandn           m4, m8, m0
    mova            m6, m8
    vpgatherdd      m2, [scalingq+m4-0], m8
    psrld           m3, m0, 16
    mova            m8, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pandn           m5, m8, m1
    mova            m6, m8
    vpgatherdd      m3, [scalingq+m5-0], m8
    pblendw         m2, m4, 0xaa
    psrld           m4, m1, 16
    mova            m8, m6
    vpgatherdd      m5, [scalingq+m4-2], m6
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    movu            m6, [grain_lutq+offxyq]
    movd            xm7, [grain_lutq+left_offxyq]
    movu            m4, [grain_lutq+top_offxyq]
    movd            xm5, [grain_lutq+topleft_offxyq]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       xm7, xm6
    punpcklbw       xm5, xm4
    pmaddubsw       xm7, xm13, xm7
    pmaddubsw       xm5, xm13, xm5
    pmulhrsw        xm7, xm12
    pmulhrsw        xm5, xm12
    packsswb        xm7, xm7
    packsswb        xm5, xm5
    vpblendd        m7, m6, 0xfe
    vpblendd        m5, m4, 0xfe
    ; followed by v interpolation (top | cur -> cur)
    punpckhbw       m4, m6
    punpcklbw       m5, m7
    pmaddubsw       m4, m14, m4
    pmaddubsw       m5, m14, m5
    pmulhrsw        m4, m12
    pmulhrsw        m5, m12
    pxor            m7, m7
    packsswb        m5, m4
    punpcklbw       m4, m5, m7
    punpckhbw       m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m5
    pmulhrsw        m2, m9
    pmulhrsw        m3, m9

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    pmaxub          m0, m10
    pminub          m0, m11
    mova            [dstq+srcq], m0

    add             srcq, strideq
    add             grain_lutq, 82
    dec             hb
    jz .end_y_hv_overlap
    vpbroadcastd    m14, [pb_17_27]     ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    add             hd, 0x80000000
    jnc .loop_y_hv_overlap
    jmp .loop_y_h_overlap
.end_y_hv_overlap:
    add             wq, 32
    lea             srcq, [src_bakq+wq]
    jl .loop_x_hv_overlap
.end:
    RET
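
; Chroma grain application. When chroma_scaling_from_luma is unset the
; scaling index is a blend of luma and chroma, per the spec (C sketch,
; illustrative names):
;
;   int merged = iclip(((luma * uv_luma_mult + chroma * uv_mult) >> 6)
;                      + uv_offset, 0, 255);   // "pack+unpack = clip" below
;   int noise  = round2(scaling[merged] * grain, scaling_shift);
;   dst        = iclip(chroma + noise, fg_min, fg_max);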
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                          grain_lut, h, sby, luma, overlap, uv_pl, is_id
%define base r11-pd_m65536
    lea             r11, [pd_m65536]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r9d, is_idm
    mov             sbyd, sbym
    mov             overlapd, [fg_dataq+FGData.overlap_flag]
    vpbroadcastd    m8, [base+pd_m65536]
    vpbroadcastw    m9, [base+mul_bits+r6*2-14]
    vpbroadcastd    m10, [base+fg_min+r7*4]
    shlx            r7d, r7d, r9d
    vpbroadcastd    m11, [base+fg_max+r7*4]
    vpbroadcastd    m12, [base+pw_1024]
    pxor            m7, m7
    test            sbyd, sbyd
    setnz           r7b
    cmp             byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl
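
; %%FGUV_32x32xN_LOOP is expanded twice at the bottom of FGUV_FN: once with
; %1=1 (fall-through path, uv_mult/uv_offset blending) and once with %1=0 at
; .csfl, where the scaling index is taken from (averaged) luma alone.
; %2/%3 carry the horizontal/vertical subsampling of the instantiation.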
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, sby, see, overlap, uv_pl
%if %1
    mov             r6d, uv_plm
    vpbroadcastd    m0, [base+pw_8]
    vbroadcasti128  m14, [fg_dataq+FGData.uv_mult+r6*4]
    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
    pshufb          m14, m0                 ; uv_luma_mult, uv_mult
%elif %2
    vpbroadcastq    m15, [base+pb_23_22]
%else
    vpbroadcastq    xm15, [base+pb_27_17_17_27]
%endif
%if %3
    vpbroadcastw    m13, [base+pb_23_22]
%elif %2
    pshufd          m13, [base+pb_27_17], q0000    ; 8x27_17, 8x17_27
%endif
    test            r7b, overlapb
    jnz %%vertical_overlap

    imul            seed, sbyd, (173 << 24) | 37
    add             seed, (105 << 24) | 178
    rorx            seed, seed, 24
    movzx           seed, seew
    xor             seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused2, unused3, see, overlap, unused4, unused5, lstride
    mov             lumaq, r9mp
    lea             r12, [srcq+wq]
    lea             r13, [dstq+wq]
    lea             r14, [lumaq+wq*(1+%2)]
    mov             r11mp, r12
    mov             r12mp, r13
    mov             lstrideq, r10mp
    neg             wq
%%loop_x:
    rorx            r6, seeq, 1
    or              seed, 0xEFF4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, unused1, unused2, lstride
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164>>%3
    lea             offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]    ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, unused1, unused2, lstride
    mov             grain_lutq, grain_lutmp
    mov             hd, hm
%%loop_y:
    ; src
%if %2
    mova            xm3, [lumaq+lstrideq*0+ 0]
    vinserti128     m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd    m2, [pb_1]
    mova            xm0, [lumaq+lstrideq*0+16]
    vinserti128     m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova            xm1, [srcq]
    vinserti128     m1, [srcq+strideq], 1
    pmaddubsw       m3, m2
    pmaddubsw       m0, m2
    pavgw           m3, m7
    pavgw           m0, m7
%else
    mova            m2, [lumaq]
    mova            m1, [srcq]
%endif
%if %1
%if %2
    packuswb        m2, m3, m0              ; luma
%endif
    punpckhbw       m3, m2, m1
    punpcklbw       m2, m1                  ; { luma, chroma }
    pmaddubsw       m3, m14
    pmaddubsw       m2, m14
    psraw           m3, 6
    psraw           m2, 6
    paddw           m3, m15
    paddw           m2, m15
    packuswb        m2, m3                  ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw       m3, m2, m7
    punpckhbw       m0, m2, m7
%endif

    ; scaling[luma_src]
    pandn           m4, m8, m3
    mova            m6, m8
    vpgatherdd      m2, [scalingq+m4-0], m8
    psrld           m3, 16
    mova            m8, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pandn           m5, m8, m0
    mova            m6, m8
    vpgatherdd      m3, [scalingq+m5-0], m8
    psrld           m0, 16
    mova            m8, m6
    vpgatherdd      m5, [scalingq+m0-2], m6
    pblendw         m2, m4, 0xaa
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movu            xm5, [grain_lutq+offxyq+ 0]
    vinserti128     m5, [grain_lutq+offxyq+82], 1
%else
    movu            m5, [grain_lutq+offxyq]
%endif
    punpcklbw       m4, m5, m7
    punpckhbw       m5, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m5
    pmulhrsw        m2, m9
    pmulhrsw        m3, m9

    ; unpack chroma_source
    punpcklbw       m0, m1, m7
    punpckhbw       m1, m7

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    pmaxub          m0, m10
    pminub          m0, m11
%if %2
    mova            [dstq], xm0
    vextracti128    [dstq+strideq], m0, 1
%else
    mova            [dstq], m0
%endif

%if %2
    lea             srcq, [srcq+strideq*2]
    lea             dstq, [dstq+strideq*2]
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add             srcq, strideq
    add             dstq, strideq
    add             lumaq, lstrideq
%endif
    add             grain_lutq, 82<<%2
    sub             hb, 1+%2
    jg %%loop_y

    add             wq, 32>>%2
    jge .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r14+wq*(1+%2)]
    add             srcq, wq
    add             dstq, wq
    test            overlapd, overlapd
    jz %%loop_x

    ; r8m = sbym
    cmp             dword r8m, 0
    jne %%loop_x_hv_overlap

; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    rorx            r6, seeq, 1
    or              seed, 0xEFF4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride
    lea             left_offxyd, [offyq+(32>>%2)]   ; previous column's offy*stride+offx
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164>>%3
    lea             offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]    ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, lstride
    mov             grain_lutq, grain_lutmp
    mov             hd, hm
%%loop_y_h_overlap:
    ; src
%if %2
    mova            xm3, [lumaq+lstrideq*0+ 0]
    vinserti128     m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd    m2, [pb_1]
    mova            xm0, [lumaq+lstrideq*0+16]
    vinserti128     m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova            xm1, [srcq]
    vinserti128     m1, [srcq+strideq], 1
    pmaddubsw       m3, m2
    pmaddubsw       m0, m2
    pavgw           m3, m7
    pavgw           m0, m7
%else
    mova            m2, [lumaq]
    mova            m1, [srcq]
%endif
%if %1
%if %2
    packuswb        m2, m3, m0              ; luma
%endif
    punpckhbw       m3, m2, m1
    punpcklbw       m2, m1                  ; { luma, chroma }
    pmaddubsw       m3, m14
    pmaddubsw       m2, m14
    psraw           m3, 6
    psraw           m2, 6
    paddw           m3, m15
    paddw           m2, m15
    packuswb        m2, m3                  ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw       m3, m2, m7
    punpckhbw       m0, m2, m7
%endif

    ; scaling[luma_src]
    pandn           m4, m8, m3
    mova            m6, m8
    vpgatherdd      m2, [scalingq+m4-0], m8
    psrld           m3, 16
    mova            m8, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pandn           m5, m8, m0
    mova            m6, m8
    vpgatherdd      m3, [scalingq+m5-0], m8
    psrld           m0, 16
    mova            m8, m6
    vpgatherdd      m5, [scalingq+m0-2], m6
    pblendw         m2, m4, 0xaa
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movu            xm5, [grain_lutq+offxyq+ 0]
    vinserti128     m5, [grain_lutq+offxyq+82], 1
    movd            xm4, [grain_lutq+left_offxyq+ 0]
    vinserti128     m4, [grain_lutq+left_offxyq+82], 1
    punpcklbw       m4, m5
%if %1
    vpbroadcastq    m0, [pb_23_22]
    pmaddubsw       m4, m0, m4
%else
    pmaddubsw       m4, m15, m4
%endif
    pmulhrsw        m4, m12
    packsswb        m4, m4
    vpblendd        m4, m5, 0xee
%else
    movu            m5, [grain_lutq+offxyq]
    movd            xm4, [grain_lutq+left_offxyq]
    punpcklbw       xm4, xm5
%if %1
    movq            xm0, [pb_27_17_17_27]
    pmaddubsw       xm4, xm0, xm4
%else
    pmaddubsw       xm4, xm15, xm4
%endif
    pmulhrsw        xm4, xm12
    packsswb        xm4, xm4
    vpblendd        m4, m5, 0xfe
%endif
    punpckhbw       m5, m7
    punpcklbw       m4, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m5
    pmulhrsw        m2, m9
    pmulhrsw        m3, m9

    ; unpack chroma_source
    punpcklbw       m0, m1, m7
    punpckhbw       m1, m7

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    pmaxub          m0, m10
    pminub          m0, m11
%if %2
    mova            [dstq], xm0
    vextracti128    [dstq+strideq], m0, 1
%else
    mova            [dstq], m0
%endif

%if %2
    lea             srcq, [srcq+strideq*2]
    lea             dstq, [dstq+strideq*2]
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add             srcq, strideq
    add             dstq, strideq
    add             lumaq, lstrideq
%endif
    add             grain_lutq, 82*(1+%2)
    sub             hb, 1+%2
    jg %%loop_y_h_overlap

    add             wq, 32>>%2
    jge .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r14+wq*(1+%2)]
    add             srcq, wq
    add             dstq, wq

    ; r8m = sbym
    cmp             dword r8m, 0
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap
%%vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
                sby, see, overlap, unused1, unused2, lstride
    movzx           sbyd, sbyb
    imul            seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul            sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             sbyd, 0xff00ff00
    xor             seed, r7d
    xor             seed, sbyd              ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused1, unused2, see, overlap, unused3, unused4, lstride
    mov             lumaq, r9mp
    lea             r12, [srcq+wq]
    lea             r13, [dstq+wq]
    lea             r14, [lumaq+wq*(1+%2)]
    mov             r11mp, r12
    mov             r12mp, r13
    mov             lstrideq, r10mp
    neg             wq
%%loop_x_v_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, top_offxy, unused, lstride
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy, unused, lstride
    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
%if %2 == 0
    vpbroadcastd    m13, [pb_27_17]
%endif
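
; Vertical overlap per layout: with ss_ver (4:2:0) only the first chroma row
; of the block is blended, using the 23/22 weights; 4:2:2 blends two rows
; with 27/17 and 17/27 packed per lane; 4:4:4 mirrors the luma path and
; swaps m13 to pb_17_27 for the second line.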
%%loop_y_v_overlap:
    ; src
%if %2
    mova            xm3, [lumaq+lstrideq*0+ 0]
    vinserti128     m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd    m2, [pb_1]
    mova            xm0, [lumaq+lstrideq*0+16]
    vinserti128     m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova            xm1, [srcq]
    vinserti128     m1, [srcq+strideq], 1
    pmaddubsw       m3, m2
    pmaddubsw       m0, m2
    pavgw           m3, m7
    pavgw           m0, m7
%else
    mova            m2, [lumaq]
    mova            m1, [srcq]
%endif
%if %1
%if %2
    packuswb        m2, m3, m0              ; luma
%endif
    punpckhbw       m3, m2, m1
    punpcklbw       m2, m1                  ; { luma, chroma }
    pmaddubsw       m3, m14
    pmaddubsw       m2, m14
    psraw           m3, 6
    psraw           m2, 6
    paddw           m3, m15
    paddw           m2, m15
    packuswb        m2, m3                  ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw       m3, m2, m7
    punpckhbw       m0, m2, m7
%endif

    ; scaling[luma_src]
    pandn           m4, m8, m3
    mova            m6, m8
    vpgatherdd      m2, [scalingq+m4-0], m8
    psrld           m3, 16
    mova            m8, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pandn           m5, m8, m0
    mova            m6, m8
    vpgatherdd      m3, [scalingq+m5-0], m8
    psrld           m0, 16
    mova            m8, m6
    vpgatherdd      m5, [scalingq+m0-2], m6
    pblendw         m2, m4, 0xaa
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %3 == 0
%if %2
    movu            xm0, [grain_lutq+offxyq]
    vinserti128     m0, [grain_lutq+offxyq+82], 1
    movu            xm4, [grain_lutq+top_offxyq]
    vinserti128     m4, [grain_lutq+top_offxyq+82], 1
%else
    movu            m0, [grain_lutq+offxyq]
    movu            m4, [grain_lutq+top_offxyq]
%endif
    punpcklbw       m5, m4, m0
    punpckhbw       m4, m0
    pmaddubsw       m5, m13, m5
    pmaddubsw       m4, m13, m4
    pmulhrsw        m5, m12
    pmulhrsw        m4, m12
    packsswb        m5, m4
%else
    movq            xm4, [grain_lutq+offxyq]
    vinserti128     m4, [grain_lutq+offxyq+8], 1
    movq            xm5, [grain_lutq+top_offxyq]
    vinserti128     m5, [grain_lutq+top_offxyq+8], 1
    punpcklbw       m5, m4
    pmaddubsw       m5, m13, m5
    pmulhrsw        m5, m12
    vextracti128    xm4, m5, 1
    packsswb        xm5, xm4
    ; only interpolate first line, insert second line unmodified
    vinserti128     m5, [grain_lutq+offxyq+82], 1
%endif
    punpcklbw       m4, m5, m7
    punpckhbw       m5, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m5
    pmulhrsw        m2, m9
    pmulhrsw        m3, m9

    ; unpack chroma_source
    punpcklbw       m0, m1, m7
    punpckhbw       m1, m7

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    pmaxub          m0, m10
    pminub          m0, m11
%if %2
    mova            [dstq], xm0
    vextracti128    [dstq+strideq], m0, 1
%else
    mova            [dstq], m0
%endif

    sub             hb, 1+%2
    jle %%end_y_v_overlap
%if %2
    lea             srcq, [srcq+strideq*2]
    lea             dstq, [dstq+strideq*2]
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add             srcq, strideq
    add             dstq, strideq
    add             lumaq, lstrideq
%endif
    add             grain_lutq, 82<<%2
%if %2 == 0
    vpbroadcastd    m13, [pb_17_27]
    add             hd, 0x80000000
    jnc %%loop_y_v_overlap
%endif
    jmp %%loop_y
%%end_y_v_overlap:
    add             wq, 32>>%2
    jge .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r14+wq*(1+%2)]
    add             srcq, wq
    add             dstq, wq

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
%%loop_x_hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
    lea             topleft_offxyd, [top_offxyq+(32>>%2)]
    lea             left_offxyd, [offyq+(32>>%2)]
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
%if %2 == 0
    vpbroadcastd    m13, [pb_27_17]
%endif
%%loop_y_hv_overlap:
    ; src
%if %2
    mova            xm3, [lumaq+lstrideq*0+ 0]
    vinserti128     m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd    m2, [pb_1]
    mova            xm0, [lumaq+lstrideq*0+16]
    vinserti128     m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova            xm1, [srcq]
    vinserti128     m1, [srcq+strideq], 1
    pmaddubsw       m3, m2
    pmaddubsw       m0, m2
    pavgw           m3, m7
    pavgw           m0, m7
%else
    mova            m2, [lumaq]
    mova            m1, [srcq]
%endif
%if %1
%if %2
    packuswb        m2, m3, m0              ; luma
%endif
    punpckhbw       m3, m2, m1
    punpcklbw       m2, m1                  ; { luma, chroma }
    pmaddubsw       m3, m14
    pmaddubsw       m2, m14
    psraw           m3, 6
    psraw           m2, 6
    paddw           m3, m15
    paddw           m2, m15
    packuswb        m2, m3                  ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw       m3, m2, m7
    punpckhbw       m0, m2, m7
%endif

    ; scaling[luma_src]
    pandn           m4, m8, m3
    mova            m6, m8
    vpgatherdd      m2, [scalingq+m4-0], m8
    psrld           m3, 16
    mova            m8, m6
    vpgatherdd      m4, [scalingq+m3-2], m6
    pandn           m5, m8, m0
    mova            m6, m8
    vpgatherdd      m3, [scalingq+m5-0], m8
    psrld           m0, 16
    mova            m8, m6
    vpgatherdd      m5, [scalingq+m0-2], m6
    pblendw         m2, m4, 0xaa
    pblendw         m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movu            xm4, [grain_lutq+offxyq]
    vinserti128     m4, [grain_lutq+offxyq+82], 1
    movd            xm0, [grain_lutq+left_offxyq]
    vinserti128     m0, [grain_lutq+left_offxyq+82], 1
    movd            xm6, [grain_lutq+topleft_offxyq]
%if %3
    movq            xm5, [grain_lutq+top_offxyq]
    vinserti128     m5, [grain_lutq+top_offxyq+8], 1
%else
    vinserti128     m6, [grain_lutq+topleft_offxyq+82], 1
    movu            xm5, [grain_lutq+top_offxyq]
    vinserti128     m5, [grain_lutq+top_offxyq+82], 1
%endif

    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       m0, m4
%if %3
    punpcklbw       xm6, xm5
%else
    punpcklbw       m6, m5
%endif
    punpcklqdq      m0, m6
%if %1
    vpbroadcastq    m6, [pb_23_22]
    pmaddubsw       m0, m6, m0
%else
    pmaddubsw       m0, m15, m0
%endif
    pmulhrsw        m0, m12
    packsswb        m0, m0
    vpblendd        m4, m0, 0x11
%if %3
    pshuflw         xm0, xm0, q1032
    vpblendd        m5, m0, 0x01
%else
    pshuflw         m0, m0, q1032
    vpblendd        m5, m0, 0x11
%endif
%else
    movu            m4, [grain_lutq+offxyq]
    movd            xm0, [grain_lutq+left_offxyq]
    movu            m5, [grain_lutq+top_offxyq]
    movd            xm6, [grain_lutq+topleft_offxyq]
    punpcklbw       xm0, xm4
    punpcklbw       xm6, xm5
    punpcklqdq      xm0, xm6
%if %1
    vpbroadcastq    xm6, [pb_27_17_17_27]
    pmaddubsw       xm0, xm6, xm0
%else
    pmaddubsw       xm0, xm15, xm0
%endif
    pmulhrsw        xm0, xm12
    packsswb        xm0, xm0
    vpblendd        m4, m0, 0x01
    pshuflw         xm0, xm0, q1032
    vpblendd        m5, m0, 0x01
%endif

    ; followed by v interpolation (top | cur -> cur)
%if %3
    vpermq          m0, m4, q3120
    punpcklbw       m5, m0
    pmaddubsw       m5, m13, m5
    pmulhrsw        m5, m12
    vextracti128    xm0, m5, 1
    packsswb        xm5, xm0
    vpblendd        m5, m4, 0xf0
%else
    punpckhbw       m0, m5, m4
    punpcklbw       m5, m4
    pmaddubsw       m4, m13, m0
    pmaddubsw       m5, m13, m5
    pmulhrsw        m4, m12
    pmulhrsw        m5, m12
    packsswb        m5, m4
%endif
    punpcklbw       m4, m5, m7
    punpckhbw       m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw       m2, m4
    pmaddubsw       m3, m5
    pmulhrsw        m2, m9
    pmulhrsw        m3, m9

    ; unpack chroma source
    punpcklbw       m0, m1, m7
    punpckhbw       m1, m7

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m1, m3
    packuswb        m0, m1
    pmaxub          m0, m10
    pminub          m0, m11
%if %2
    mova            [dstq], xm0
    vextracti128    [dstq+strideq], m0, 1
%else
    mova            [dstq], m0
%endif

%if %2
    lea             srcq, [srcq+strideq*2]
    lea             dstq, [dstq+strideq*2]
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add             srcq, strideq
    add             dstq, strideq
    add             lumaq, lstrideq
%endif
    add             grain_lutq, 82<<%2
    sub             hb, 1+%2
%if %2
    jg %%loop_y_h_overlap
%else
    je %%end_y_hv_overlap
    vpbroadcastd    m13, [pb_17_27]
    add             hd, 0x80000000
    jnc %%loop_y_hv_overlap
    jmp %%loop_y_h_overlap
%endif
%%end_y_hv_overlap:
    add             wq, 32>>%2
    jge .end
    mov             srcq, r11mp
    mov             dstq, r12mp
    lea             lumaq, [r14+wq*(1+%2)]
    add             srcq, wq
    add             dstq, wq
    jmp %%loop_x_hv_overlap
%endmacro

    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

GEN_GRAIN_UV_FN 420, 1, 1
FGUV_FN         420, 1, 1
GEN_GRAIN_UV_FN 422, 1, 0
FGUV_FN         422, 1, 0
GEN_GRAIN_UV_FN 444, 0, 0
FGUV_FN         444, 0, 0

%endif ; ARCH_X86_64