path: root/third_party/dav1d/src/x86/filmgrain16_sse.asm
Diffstat (limited to 'third_party/dav1d/src/x86/filmgrain16_sse.asm')
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_sse.asm  3421
1 file changed, 3421 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/filmgrain16_sse.asm b/third_party/dav1d/src/x86/filmgrain16_sse.asm
new file mode 100644
index 0000000000..6b0daaac0b
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain16_sse.asm
@@ -0,0 +1,3421 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+SECTION_RODATA 16
+pd_16: times 4 dd 16
+pw_1: times 8 dw 1
+pw_16384: times 8 dw 16384
+pw_8192: times 8 dw 8192
+pw_23_22: dw 23, 22
+ times 3 dw 0, 32
+pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
+pw_27_17_17_27: dw 27, 17, 17, 27
+ times 2 dw 0, 32
+rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+pb_1: times 4 db 1
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512, 1024
+max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
+min: dw 0, 16*4, 16*16
+; these two should be next to each other
+pw_4: times 2 dw 4
+pw_16: times 2 dw 16
+
+%macro JMP_TABLE 1-*
+ %xdefine %1_table %%table
+ %xdefine %%base %1_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %%table:
+ %rep %0 - 1
+ dd %%prefix %+ .ar%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
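+; Each table above holds 32-bit offsets of that generator's .ar0-.ar3 labels
+; relative to the table itself; the generators index it with
+; FGData.ar_coeff_lag, add the table address back and jump there (see the
+; dispatch right after each main grain loop).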
+
+SECTION .text
+
+%if ARCH_X86_32
+%undef base
+%define PIC_ptr(a) base+a
+%else
+%define PIC_ptr(a) a
+%endif
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
+%assign %%idx 0
+%define %%tmp %2
+%if %0 == 8
+%define %%tmp %8
+%endif
+%rep (%6/2)
+%if %%idx == 0
+ movd %5 %+ d, %2
+ pshuflw %%tmp, %2, q3232
+%else
+ movd %5 %+ d, %%tmp
+%if %6 == 8
+%if %%idx == 2
+ punpckhqdq %%tmp, %%tmp
+%elif %%idx == 4
+ psrlq %%tmp, 32
+%endif
+%endif
+%endif
+ movzx %4 %+ d, %5 %+ w
+ shr %5 %+ d, 16
+
+%if %%idx == 0
+ movd %1, [%3+%4*%7]
+%else
+ pinsrw %1, [%3+%4*%7], %%idx + 0
+%endif
+ pinsrw %1, [%3+%5*%7], %%idx + 1
+%assign %%idx %%idx+2
+%endrep
+%endmacro
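+; The macro above emulates a word-granularity gather with scalar loads, since
+; SSSE3 has no gather instruction. Roughly, in C (illustrative sketch only):
+;   for (int i = 0; i < cnt; i++)
+;       dst.w[i] = *(const uint16_t *)((const uint8_t *)base + src.w[i] * stride);
+; two GPRs are used to peel the 16-bit indices out of the index vector.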
+
+%macro SPLATD 2 ; dst, src
+%ifnidn %1, %2
+ movd %1, %2
+%endif
+ pshufd %1, %1, q0000
+%endmacro
+
+%macro SPLATW 2 ; dst, src
+%ifnidn %1, %2
+ movd %1, %2
+%endif
+ pshuflw %1, %1, q0000
+ punpcklqdq %1, %1
+%endmacro
+
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax
+ lea r4, [pb_mask]
+%define base r4-pb_mask
+%else
+cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
+ LEA r4, $$
+%define base r4-$$
+%endif
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r3d, [fg_dataq+FGData.grain_scale_shift]
+ lea r5d, [bdmaxq+1]
+ shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc
+ sub r3, r5
+ SPLATW m6, [base+round+r3*2-2]
+ mova m5, [base+pb_mask]
+ SPLATW m0, [fg_dataq+FGData.seed]
+ mov r3, -73*82*2
+ sub bufq, r3
+%if ARCH_X86_64
+ lea r6, [gaussian_sequence]
+%endif
+.loop:
+ pand m2, m0, m1
+ psrlw m3, m2, 10
+ por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m2, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
+ psllq m2, m3, 30
+ por m2, m3
+ psllq m3, m2, 15
+ por m2, m3 ; aggregate each bit into next seed's high bit
+ pmulhuw m3, m0, m7
+ por m2, m3 ; 4 next output seeds
+ pshuflw m0, m2, q3333
+ psrlw m2, 5
+%if ARCH_X86_64
+ vpgatherdw m3, m2, r6, r5, r7, 4, 2
+%else
+ vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2
+%endif
+ paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0
+ ; shifts by 0, which pmulhrsw does not support
+ pmulhrsw m3, m6
+ movq [bufq+r3], m3
+ add r3, 4*2
+ jl .loop
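+ ; Per output sample the loop above roughly implements (illustrative sketch):
+ ;   seed   = update_lfsr(seed)                        ; the masked xor/shift feedback, 4 seeds in parallel
+ ;   buf[i] = round2(gaussian_sequence[seed >> 5], shift)
+ ; where shift is derived from grain_scale_shift and the bit depth.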
+
+ ; auto-regression code
+ movsxd r3, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4]
+ lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table]
+ jmp r3
+
+.ar1:
+%if WIN64
+ DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0
+ lea bufq, [r0-2*(82*73-(82*3+79))]
+ PUSH r8
+%else
+%if ARCH_X86_64
+ DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
+%else ; x86-32
+ DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0
+ PUSH r6
+%define shiftd r1d
+%endif
+ sub bufq, 2*(82*73-(82*3+79))
+%endif
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd m4, [fg_dataq+FGData.ar_coeffs_y]
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+%if WIN64
+ DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0
+%elif ARCH_X86_64
+ DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
+%else ; x86-32
+%undef shiftd
+ DEFINE_ARGS buf, shift, min, val3, x, cf3, val0
+%define hd dword r0m
+%define maxd dword minm
+%endif
+%if cpuflag(sse4)
+ pmovsxbw m4, m4
+%else
+ pxor m3, m3
+ pcmpgtb m3, m4
+ punpcklbw m4, m3
+%endif
+ pinsrw m4, [base+pw_1], 3
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd
+ mov hd, 70
+ sar maxd, 1
+ mov mind, maxd
+ xor mind, -1
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu m0, [bufq+xq*2-82*2-2] ; top/left
+ psrldq m2, m0, 2 ; top
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar1
+%if WIN64
+ POP r8
+%elif ARCH_X86_32
+ POP r6
+%undef maxd
+%undef hd
+%endif
+.ar0:
+ RET
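+ ; The .ar1 path above applies, per pixel, roughly (illustrative sketch):
+ ;   g = (cf[0]*buf[y-1][x-1] + cf[1]*buf[y-1][x] + cf[2]*buf[y-1][x+1]
+ ;        + cf[3]*left + (1 << (shift - 1))) >> shift
+ ;   left = buf[y][x] = clamp(buf[y][x] + g, min_grain, max_grain)
+ ; the serial dependency on 'left' is why the inner loop steps one pixel at a time.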
+
+.ar2:
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -16*8
+%endif
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m0, [base+round_vals-12+shiftq*2]
+ pshuflw m0, m0, q0000
+ movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11
+ pxor m2, m2
+ punpcklwd m0, m2
+ pcmpgtb m2, m6
+ punpckhbw m3, m6, m2
+ punpcklbw m6, m2
+ pshufd m2, m6, q3333
+ pshufd m1, m6, q2222
+ pshufd m7, m6, q1111
+ pshufd m6, m6, q0000
+ pshufd m4, m3, q1111
+ pshufd m3, m3, q0000
+%if ARCH_X86_64
+ SWAP 0, 12
+ SWAP 1, 8
+ SWAP 2, 9
+ SWAP 3, 10
+ SWAP 4, 11
+%else
+%define m12 [rsp+0*16]
+%define m8 [rsp+1*16]
+%define m9 [rsp+2*16]
+%define m10 [rsp+3*16]
+%define m11 [rsp+4*16]
+ mova m12, m0
+ mova m8, m1
+ mova m9, m2
+ mova m10, m3
+ mova m11, m4
+ mov bdmaxd, bdmaxm
+%endif
+ sar bdmaxd, 1
+ SPLATW m0, bdmaxd ; max_grain
+ pcmpeqw m1, m1
+%if !cpuflag(sse4)
+ pcmpeqw m2, m2
+ psrldq m2, 14
+ pslldq m2, 2
+ pxor m2, m1
+%endif
+ pxor m1, m0 ; min_grain
+%if ARCH_X86_64
+ SWAP 0, 13
+ SWAP 1, 14
+ SWAP 2, 15
+%else
+%define m13 [rsp+5*16]
+%define m14 [rsp+6*16]
+ mova m13, m0
+ mova m14, m1
+%if !cpuflag(sse4)
+%define m15 [rsp+7*16]
+ mova m15, m2
+%endif
+%endif
+ sub bufq, 2*(82*73-(82*3+79))
+ DEFINE_ARGS buf, fg_data, h, x
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+
+.x_loop_ar2:
+ movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
+ movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
+ psrldq m2, m0, 2
+ psrldq m3, m0, 4
+ psrldq m4, m0, 6
+ psrldq m5, m0, 8
+ punpcklwd m0, m2
+ punpcklwd m3, m4
+ punpcklwd m5, m1
+ psrldq m2, m1, 2
+ psrldq m4, m1, 4
+ punpcklwd m2, m4
+ psrldq m4, m1, 6
+ psrldq m1, 8
+ punpcklwd m4, m1
+ pmaddwd m0, m6
+ pmaddwd m3, m7
+ pmaddwd m5, m8
+ pmaddwd m2, m9
+ pmaddwd m4, m10
+ paddd m0, m3
+ paddd m5, m2
+ paddd m0, m4
+ paddd m0, m5 ; accumulated top 2 rows
+ paddd m0, m12
+
+ movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ pshufd m4, m1, q3321
+ pxor m2, m2
+ pcmpgtw m2, m4
+ punpcklwd m4, m2 ; in dwords, y=0,x=[0,3]
+.x_loop_ar2_inner:
+ pmaddwd m2, m1, m11
+ paddd m2, m0
+ psrldq m0, 4 ; shift top to next pixel
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ paddd m2, m4
+ packssdw m2, m2
+ pminsw m2, m13
+ pmaxsw m2, m14
+ psrldq m4, 4
+ pslldq m2, 2
+ psrldq m1, 2
+%if cpuflag(sse4)
+ pblendw m1, m2, 00000010b
+%else
+ pand m1, m15
+ pandn m3, m15, m2
+ por m1, m3
+%endif
+ ; overwrite previous pixel, this should be ok
+ movd [bufq+xq*2-2], m1
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar2
+%if ARCH_X86_32
+%undef m8
+%undef m9
+%undef m10
+%undef m11
+%undef m12
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
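+ ; .ar2 is the lag-2 equivalent: 12 coefficients covering the two rows above
+ ; (x = -2..+2) plus the two pixels to the left of the current one; the left
+ ; taps are resolved serially in .x_loop_ar2_inner since each output feeds the
+ ; next.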
+
+.ar3:
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+%if WIN64
+ mov r6, rsp
+ and rsp, ~15
+ sub rsp, 64
+ %define tmp rsp
+%elif ARCH_X86_64
+ %define tmp rsp+stack_offset-72
+%else
+%assign stack_offset stack_offset_old
+ ALLOC_STACK -16*12
+ %define tmp rsp
+ mov bdmaxd, bdmaxm
+%endif
+ sar bdmaxd, 1
+ SPLATW m7, bdmaxd ; max_grain
+ pcmpeqw m6, m6
+%if !cpuflag(sse4)
+ pcmpeqw m4, m4
+ psrldq m4, 14
+ pslldq m4, 4
+ pxor m4, m6
+%endif
+ pxor m6, m7 ; min_grain
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+
+%if ARCH_X86_64
+ SWAP 6, 14
+ SWAP 7, 15
+%else
+%define m14 [rsp+10*16]
+%define m15 [esp+11*16]
+ mova m14, m6
+ mova m15, m7
+%endif
+
+ ; build cf0-1 until 18-19 in m5-12 and r0/1
+ pxor m1, m1
+ movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+
+%if cpuflag(sse4)
+ pshufd m4, m2, q3333
+%else
+ pshufd m5, m2, q3333
+ mova [tmp+48], m5
+%endif
+ pshufd m3, m2, q2222
+ pshufd m1, m2, q0000
+ pshufd m2, m2, q1111
+ pshufd m7, m0, q2222
+ pshufd m6, m0, q1111
+ pshufd m5, m0, q0000
+ pshufd m0, m0, q3333
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 4, 12
+%else
+%define m8 [rsp+4*16]
+%define m9 [esp+5*16]
+%define m10 [rsp+6*16]
+%define m11 [esp+7*16]
+%define m12 [rsp+8*16]
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+ mova m12, m4
+%endif
+
+ ; build cf20,round in r2
+ ; build cf21-23,round*2 in m13
+ pxor m1, m1
+ movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
+ pcmpgtb m1, m0
+ punpcklbw m0, m1
+ pshufd m1, m0, q0000
+ pshufd m2, m0, q1111
+ mova [tmp+ 0], m1
+ mova [tmp+16], m2
+ psrldq m3, m0, 10
+ pinsrw m3, [base+round_vals+shiftq*2-10], 3
+
+%if ARCH_X86_64
+ SWAP 3, 13
+%else
+%define m13 [esp+9*16]
+ mova m13, m3
+%endif
+
+ pinsrw m0, [base+round_vals+shiftq*2-12], 5
+ pshufd m3, m0, q2222
+ mova [tmp+32], m3
+
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 2*(82*73-(82*3+79))
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+
+.x_loop_ar3:
+ movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6]
+ palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5]
+ palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6]
+ punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+
+ pmaddwd m0, m5
+ pmaddwd m2, m6
+ pmaddwd m3, m7
+ paddd m0, m2
+ paddd m0, m3
+ ; m0 = top line first 6 multiplied by cf, m1 = top line last entry
+
+ movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4]
+ movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6]
+ punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
+ palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5]
+ palignr m3, m3, m2, 4 ; y=-2,x=[-1,+6]
+ punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+
+ pmaddwd m1, m8
+ pmaddwd m4, m9
+ pmaddwd m3, m10
+ pmaddwd m2, m11
+ paddd m1, m4
+ paddd m3, m2
+ paddd m0, m1
+ paddd m0, m3
+ ; m0 = top 2 lines multiplied by cf
+
+ movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6]
+ palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5]
+ palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6]
+ punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ punpcklwd m2, [base+pw_1]
+
+%if cpuflag(sse4)
+ pmaddwd m1, m12
+%else
+ pmaddwd m1, [tmp+48]
+%endif
+ pmaddwd m3, [tmp+ 0]
+ pmaddwd m4, [tmp+16]
+ pmaddwd m2, [tmp+32]
+ paddd m1, m3
+ paddd m4, m2
+ paddd m0, m1
+ paddd m0, m4
+ ; m0 = top 3 lines multiplied by cf plus rounding for downshift
+
+ movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+ pmaddwd m2, m1, m13
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ packssdw m2, m2
+ pminsw m2, m15
+ pmaxsw m2, m14
+ pslldq m2, 4
+ psrldq m1, 2
+%if cpuflag(sse4)
+ pblendw m1, m2, 00000100b
+%else
+ pand m1, m12
+ pandn m3, m12, m2
+ por m1, m3
+%endif
+ ; overwrite a couple of pixels, should be ok
+ movq [bufq+xq*2-4], m1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar3
+%if WIN64
+ mov rsp, r6
+%elif ARCH_X86_32
+%undef m8
+%undef m9
+%undef m10
+%undef m11
+%undef m12
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
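+ ; .ar3 is the lag-3 filter: 24 coefficients, three rows of seven (x = -3..+3)
+ ; above the current pixel plus its three left neighbours, with the left taps
+ ; again resolved serially per pixel.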
+
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg
+%define base r8-pb_mask
+ lea r8, [pb_mask]
+ movifnidn bdmaxd, bdmaxm
+ lea r6d, [bdmaxq+1]
+%else
+cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
+%define base r2-$$
+ LEA r2, $$
+ mov fg_dataq, r2m
+ mov r6d, r4m
+ inc r6d
+%endif
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r5d, [fg_dataq+FGData.grain_scale_shift]
+ shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc
+ sub r5, r6
+ SPLATW m6, [base+round+r5*2-2]
+ mova m5, [base+pb_mask]
+ SPLATW m0, [fg_dataq+FGData.seed]
+%if ARCH_X86_64
+ SPLATW m2, [base+pw_seed_xor+uvq*4]
+%else
+ mov r5d, r3m
+ SPLATW m2, [base+pw_seed_xor+r5*4]
+%endif
+ pxor m0, m2
+%if ARCH_X86_64
+ lea r6, [gaussian_sequence]
+%endif
+%if %2
+ mov hd, 73-35*%3
+ add bufq, 44*2
+.loop_y:
+ mov xq, -44
+%else
+ mov xq, -82*73
+ add bufq, 82*73*2
+%endif
+.loop_x:
+ pand m2, m0, m1
+ psrlw m3, m2, 10
+ por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m2, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
+ psllq m2, m3, 30
+ por m2, m3
+ psllq m3, m2, 15
+ por m2, m3 ; aggregate each bit into next seed's high bit
+ pmulhuw m3, m0, m7
+ por m2, m3 ; 4 next output seeds
+ pshuflw m0, m2, q3333
+ psrlw m2, 5
+%if ARCH_X86_64
+ vpgatherdw m3, m2, r6, r9, r10, 4, 2
+%else
+ vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2
+%endif
+ paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0
+ ; shifts by 0, which pmulhrsw does not support
+ pmulhrsw m3, m6
+ movq [bufq+xq*2], m3
+ add xq, 4
+ jl .loop_x
+%if %2
+ add bufq, 82*2
+ dec hd
+ jg .loop_y
+%endif
+
+ ; auto-regression code
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4]
+ lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table]
+ jmp r5
+
+.ar0:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -16*2
+ mov bufyq, r1m
+ mov uvd, r3m
+%endif
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ SPLATW m3, [base+hmul_bits+shiftq*2-10]
+%if ARCH_X86_64
+ sar bdmaxd, 1
+ SPLATW m1, bdmaxd ; max_grain
+%else
+ SPLATW m1, r4m
+ psraw m1, 1
+%endif
+ pcmpeqw m7, m7
+ pxor m7, m1 ; min_grain
+%if ARCH_X86_64
+ SWAP 1, 14
+ DEFINE_ARGS buf, bufy, h, x
+%else
+%define m14 [rsp+0*16]
+ mova m14, m1
+ DEFINE_ARGS buf, bufy, pic_reg, h, x
+%endif
+ pxor m5, m5
+ pcmpgtb m5, m4
+ punpcklbw m4, m5
+%if %2
+ SPLATW m6, [base+hmul_bits+2+%3*2]
+%endif
+ SPLATW m4, m4
+ pxor m5, m5
+%if %2
+%if !cpuflag(sse4)
+ pcmpeqw m2, m2
+ pslldq m2, 12
+%if ARCH_X86_64
+ SWAP 2, 12
+%else
+%define m12 [rsp+1*16]
+ mova m12, m2
+%endif
+%endif
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+82-(82*3+41))
+%else
+ sub bufq, 2*(82*70-3)
+%endif
+ add bufyq, 2*(3+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar0:
+ ; first 32 pixels
+ xor xd, xd
+.x_loop_ar0:
+ movu m0, [bufyq+xq*(2<<%2)]
+%if %2
+%if %3
+ movu m2, [bufyq+xq*4+82*2]
+ paddw m0, m2
+%endif
+ movu m1, [bufyq+xq*4 +16]
+%if %3
+ movu m2, [bufyq+xq*4+82*2+16]
+ paddw m1, m2
+%endif
+ phaddw m0, m1
+ pmulhrsw m0, m6
+%endif
+ punpckhwd m1, m0, m5
+ punpcklwd m0, m5
+ REPX {pmaddwd x, m4}, m0, m1
+ REPX {psrad x, 5}, m0, m1
+ packssdw m0, m1
+ pmulhrsw m0, m3
+ movu m1, [bufq+xq*2]
+ paddw m0, m1
+ pminsw m0, m14
+ pmaxsw m0, m7
+ cmp xd, 72-40*%2
+ je .end
+ movu [bufq+xq*2], m0
+ add xd, 8
+ jmp .x_loop_ar0
+
+ ; last 6/4 pixels
+.end:
+%if %2
+%if cpuflag(sse4)
+ pblendw m0, m1, 11000000b
+%else
+ pand m1, m12
+ pandn m2, m12, m0
+ por m0, m1, m2
+%endif
+ movu [bufq+xq*2], m0
+%else
+ movq [bufq+xq*2], m0
+%endif
+
+ add bufq, 82*2
+ add bufyq, 82*(2<<%3)
+ dec hd
+ jg .y_loop_ar0
+%if ARCH_X86_32
+%undef m12
+%undef m14
+%endif
+ RET
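+ ; For lag 0 the chroma grain is the base grain plus a single coefficient
+ ; applied to the collocated (and, if subsampled, averaged) luma grain;
+ ; roughly (illustrative sketch):
+ ;   buf[y][x] = clamp(buf[y][x] + round2(ar_coeffs_uv[uv][0] * luma_avg,
+ ;                                        ar_coeff_shift), min_grain, max_grain)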
+
+.ar1:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x
+%else
+%assign stack_offset stack_offset_old
+%xdefine rstk rsp
+%assign stack_size_padded 0
+ DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3
+ mov bufyq, r1m
+ mov uvd, r3m
+%endif
+ imul uvd, 28
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+%if WIN64
+ DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0
+%if %2
+ lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))]
+%else
+ lea bufq, [r0-2*(82*69+3)]
+%endif
+%else
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0
+%else
+ DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3
+%define hd dword r1m
+%define mind dword r3m
+%define maxd dword r4m
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+%endif
+%if ARCH_X86_64
+ mov shiftd, [r2+FGData.ar_coeff_shift]
+%else
+ mov shiftd, [r3+FGData.ar_coeff_shift]
+%endif
+ pxor m5, m5
+ pcmpgtb m5, m4
+ punpcklbw m4, m5 ; cf0-4 in words
+ pshuflw m4, m4, q2100
+ psrldq m4, 2 ; cf0-3,4 in words
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ movd m3, [base+round_vals+shiftq*2-12] ; rnd
+ pxor m6, m6
+ punpcklwd m3, m6
+%if %2
+ SPLATW m6, [base+hmul_bits+2+%3*2]
+%endif
+ SPLATD m3, m3
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+ sar maxd, 1
+%if ARCH_X86_64
+ mov mind, maxd
+ xor mind, -1
+%else
+ DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3
+ mov r2, maxd
+ xor r2, -1
+ mov mind, r2
+%endif
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu m0, [bufq+xq*2-82*2-2] ; top/left
+%if %2
+ movu m7, [bufyq+xq*4]
+%if %3
+ movu m1, [bufyq+xq*4+82*2]
+ phaddw m7, m1
+%else
+ phaddw m7, m7
+%endif
+%else
+ movq m7, [bufyq+xq*2]
+%endif
+ psrldq m2, m0, 2 ; top
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m0, m2
+%if %2
+%if %3
+ pshufd m2, m7, q3232
+ paddw m7, m2
+%endif
+ pmulhrsw m7, m6
+%endif
+ punpcklwd m1, m7
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+ paddd m0, m3
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar1
+%if ARCH_X86_32
+%undef maxd
+%undef mind
+%undef hd
+%endif
+ RET
+
+.ar2:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
+ ALLOC_STACK -16*8
+ mov bufyq, r1m
+ mov uvd, r3m
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+%if ARCH_X86_64
+ sar bdmaxd, 1
+ SPLATW m5, bdmaxd ; max_grain
+%else
+ SPLATW m5, r4m
+ psraw m5, 1
+%endif
+ pcmpeqw m6, m6
+%if !cpuflag(sse4)
+ pcmpeqw m7, m7
+ psrldq m7, 14
+ pslldq m7, 2
+ pxor m7, m6
+%endif
+ pxor m6, m5 ; min_grain
+%if %2 && cpuflag(sse4)
+ SPLATW m7, [base+hmul_bits+2+%3*2]
+%endif
+
+%if ARCH_X86_64
+ SWAP 5, 13
+ SWAP 6, 14
+ SWAP 7, 15
+%else
+%define m13 [rsp+5*16]
+%define m14 [rsp+6*16]
+%define m15 [rsp+7*16]
+ mova m13, m5
+ mova m14, m6
+ mova m15, m7
+%endif
+
+ ; coef values
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pinsrw m2, [base+round_vals-12+shiftq*2], 5
+
+ pshufd m6, m0, q0000
+ pshufd m7, m0, q1111
+ pshufd m1, m0, q3333
+ pshufd m0, m0, q2222
+ pshufd m3, m2, q1111
+ pshufd m4, m2, q2222
+ pshufd m2, m2, q0000
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 4, 12
+%else
+%define m8 [rsp+0*16]
+%define m9 [rsp+1*16]
+%define m10 [rsp+2*16]
+%define m11 [rsp+3*16]
+%define m12 [rsp+4*16]
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+ mova m12, m4
+%endif
+
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, h, x
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+
+.x_loop_ar2:
+ movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
+ movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
+ psrldq m4, m0, 2 ; y=-2,x=[-1,+5]
+ psrldq m1, m0, 4 ; y=-2,x=[-0,+5]
+ psrldq m3, m0, 6 ; y=-2,x=[+1,+5]
+ psrldq m2, m0, 8 ; y=-2,x=[+2,+5]
+ punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1]
+ pmaddwd m0, m6
+ pmaddwd m1, m7
+ pmaddwd m2, m8
+ paddd m0, m1
+ paddd m0, m2
+ psrldq m3, m5, 2 ; y=-1,x=[-1,+5]
+ psrldq m1, m5, 4 ; y=-1,x=[-0,+5]
+ psrldq m4, m5, 6 ; y=-1,x=[+1,+5]
+ psrldq m2, m5, 8 ; y=-1,x=[+2,+5]
+ punpcklwd m3, m1
+ punpcklwd m4, m2
+ pmaddwd m3, m9
+ pmaddwd m4, m10
+ paddd m3, m4
+ paddd m0, m3
+
+ ; luma component & rounding
+%if %2
+ movu m1, [bufyq+xq*4]
+%if %3
+ movu m2, [bufyq+xq*4+82*2]
+ phaddw m1, m2
+ pshufd m2, m1, q3232
+ paddw m1, m2
+%else
+ phaddw m1, m1
+%endif
+%if cpuflag(sse4)
+ pmulhrsw m1, m15
+%elif %3
+ pmulhrsw m1, [base+pw_8192]
+%else
+ pmulhrsw m1, [base+pw_16384]
+%endif
+%else
+ movq m1, [bufyq+xq*2]
+%endif
+ punpcklwd m1, [base+pw_1]
+ pmaddwd m1, m12
+ paddd m0, m1
+
+ movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ pshufd m2, m1, q3321
+ pxor m3, m3
+ pcmpgtw m3, m2
+ punpcklwd m2, m3 ; y=0,x=[0,3] in dword
+.x_loop_ar2_inner:
+ pmaddwd m3, m1, m11
+ paddd m3, m0
+ psrldq m0, 4 ; shift top to next pixel
+ psrad m3, [fg_dataq+FGData.ar_coeff_shift]
+ ; we do not need to packssdw since we only care about one value
+ paddd m3, m2
+ packssdw m3, m3
+ pminsw m3, m13
+ pmaxsw m3, m14
+ psrldq m1, 2
+ pslldq m3, 2
+ psrldq m2, 4
+%if cpuflag(sse4)
+ pblendw m1, m3, 00000010b
+%else
+ pand m1, m15
+ pandn m4, m15, m3
+ por m1, m4
+%endif
+ ; overwrite previous pixel, should be ok
+ movd [bufq+xq*2-2], m1
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar2
+%if ARCH_X86_32
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+
+.ar3:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+%if WIN64
+ mov r6, rsp
+ and rsp, ~15
+ sub rsp, 96
+ %define tmp rsp
+%else
+ %define tmp rsp+stack_offset-120
+%endif
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
+%assign stack_offset stack_offset_old
+ ALLOC_STACK -16*14
+ mov bufyq, r1m
+ mov uvd, r3m
+ %define tmp rsp
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ SPLATW m4, [base+round_vals-12+shiftq*2]
+ pxor m5, m5
+ pcmpgtw m5, m4
+ punpcklwd m4, m5
+%if ARCH_X86_64
+ sar bdmaxd, 1
+ SPLATW m6, bdmaxd ; max_grain
+%else
+ SPLATW m6, r4m
+ psraw m6, 1
+%endif
+ pcmpeqw m7, m7
+%if !cpuflag(sse4)
+ pcmpeqw m3, m3
+ psrldq m3, 14
+ pslldq m3, 4
+ pxor m3, m7
+%endif
+ pxor m7, m6 ; min_grain
+%if %2 && cpuflag(sse4)
+ SPLATW m3, [base+hmul_bits+2+%3*2]
+%endif
+
+%if ARCH_X86_64
+ SWAP 3, 11
+ SWAP 4, 12
+ SWAP 6, 14
+ SWAP 7, 15
+%else
+%define m11 [rsp+ 9*16]
+%define m12 [rsp+10*16]
+%define m14 [rsp+12*16]
+%define m15 [rsp+13*16]
+ mova m11, m3
+ mova m12, m4
+ mova m14, m6
+ mova m15, m7
+%endif
+
+ ; cf from y=-3,x=-3 until y=-1,x=-2
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pshufd m1, m0, q0000
+ pshufd m3, m0, q1111
+ pshufd m4, m0, q2222
+ pshufd m0, m0, q3333
+ pshufd m5, m2, q0000
+ pshufd m6, m2, q1111
+ mova [tmp+16*0], m1
+ mova [tmp+16*1], m3
+ mova [tmp+16*2], m4
+ mova [tmp+16*3], m0
+ mova [tmp+16*4], m5
+ mova [tmp+16*5], m6
+ pshufd m6, m2, q2222
+ pshufd m7, m2, q3333
+
+ ; cf from y=-1,x=-1 to y=0,x=-1 + luma component
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1 ; luma
+ punpcklbw m0, m1
+ pshufd m3, m0, q3232
+ psrldq m5, m0, 10
+ ; y=0,x=[-3 to -1] + "1.0" for current pixel
+ pinsrw m5, [base+round_vals-10+shiftq*2], 3
+ ; y=-1,x=[-1 to +2]
+ pshufd m1, m0, q0000
+ pshufd m0, m0, q1111
+ ; y=-1,x=+3 + luma
+ punpcklwd m3, m2
+ pshufd m3, m3, q0000
+
+%if ARCH_X86_64
+ SWAP 1, 8
+ SWAP 0, 9
+ SWAP 3, 10
+ SWAP 5, 13
+ DEFINE_ARGS buf, bufy, fg_data, h, x
+%else
+%define m8 [rsp+ 6*16]
+%define m9 [rsp+ 7*16]
+%define m10 [rsp+ 8*16]
+%define m13 [rsp+11*16]
+ mova m8, m1
+ mova m9, m0
+ mova m10, m3
+ mova m13, m5
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+
+.x_loop_ar3:
+ ; first line
+ movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6]
+ palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5]
+ palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6]
+ punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+
+ pmaddwd m0, [tmp+0*16]
+ pmaddwd m2, [tmp+1*16]
+ pmaddwd m3, [tmp+2*16]
+ paddd m0, m2
+ paddd m0, m3 ; first 6 x of top y
+
+ ; second line [m0/1 are busy]
+ movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4]
+ movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6]
+ punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
+ palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5]
+ palignr m3, m3, m2, 4 ; y=-2,x=[-1,+6]
+ punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ shufps m3, m4, m5, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ pmaddwd m1, [tmp+3*16]
+ pmaddwd m4, [tmp+4*16]
+ pmaddwd m3, [tmp+5*16]
+ pmaddwd m5, m6
+ paddd m1, m4
+ paddd m3, m5
+ paddd m0, m1
+ paddd m0, m3 ; top 2 lines
+
+ ; third line [m0 is busy] & luma + round
+ movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6]
+%if %2
+ movu m5, [bufyq+xq*4]
+%if %3
+ movu m4, [bufyq+xq*4+82*2]
+ phaddw m5, m4
+%else
+ phaddw m5, m5
+%endif
+%else
+ movq m5, [bufyq+xq*2]
+%endif
+ palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5]
+ palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6]
+%if %3
+ pshufd m4, m5, q3232
+ paddw m5, m4
+%endif
+%if %2
+%if cpuflag(sse4)
+ pmulhrsw m5, m11
+%elif %3
+ pmulhrsw m5, [base+pw_8192]
+%else
+ pmulhrsw m5, [base+pw_16384]
+%endif
+%endif
+ punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ punpcklwd m2, m5
+ pmaddwd m1, m7
+ pmaddwd m3, m8
+ pmaddwd m4, m9
+ pmaddwd m2, m10
+ paddd m1, m3
+ paddd m4, m2
+ paddd m0, m12 ; += round
+ paddd m1, m4
+ paddd m0, m1
+
+ movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+ pmaddwd m2, m1, m13
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ packssdw m2, m2
+ pminsw m2, m14
+ pmaxsw m2, m15
+ pslldq m2, 4
+ psrldq m1, 2
+%if cpuflag(sse4)
+ pblendw m1, m2, 00000100b
+%else
+ pand m1, m11
+ pandn m3, m11, m2
+ por m1, m3
+%endif
+ ; overwrite previous pixels, should be ok
+ movq [bufq+xq*2-4], m1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar3
+%if WIN64
+ mov rsp, r6
+%elif ARCH_X86_32
+%undef m8
+%undef m9
+%undef m10
+%undef m11
+%undef m12
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+%endmacro
+
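+; Instantiate the chroma grain generators: 4:2:0 (ss_x=1, ss_y=1),
+; 4:2:2 (ss_x=1, ss_y=0) and 4:4:4 (no subsampling).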
+generate_grain_uv_fn 420, 1, 1
+generate_grain_uv_fn 422, 1, 0
+generate_grain_uv_fn 444, 0, 0
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
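+; SCRATCH keeps a value in a high XMM register on x86-64 (a plain SWAP), but on
+; x86-32, where only xmm0-xmm7 exist, it spills the value to stack slot %3 and
+; redefines m%2 as that memory operand.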
+
+INIT_XMM ssse3
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+ ; copy stack arguments to new position post-alignment, so that we
+ ; don't have to keep the old stack location in a separate register
+ mov r0, r0m
+ mov r1, r2m
+ mov r2, r4m
+ mov r3, r6m
+ mov r4, r7m
+ mov r5, r8m
+
+%define r0m [rsp+8*mmsize+ 3*gprsize]
+%define r2m [rsp+8*mmsize+ 5*gprsize]
+%define r4m [rsp+8*mmsize+ 7*gprsize]
+%define r6m [rsp+8*mmsize+ 9*gprsize]
+%define r7m [rsp+8*mmsize+10*gprsize]
+%define r8m [rsp+8*mmsize+11*gprsize]
+
+ mov r0m, r0
+ mov r2m, r1
+ mov r4m, r2
+ mov r6m, r3
+ mov r7m, r4
+ mov r8m, r5
+%else
+cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+%endif
+ mov srcq, srcm
+ mov scalingq, r5m
+ mov fg_dataq, r3m
+%if STACK_ALIGNMENT < mmsize
+ mov r6, r9m
+
+%define r9m [rsp+8*mmsize+ 4*gprsize]
+%define r3m [rsp+8*mmsize+ 6*gprsize]
+%define r5m [rsp+8*mmsize+ 8*gprsize]
+
+ mov r9m, r6
+%endif
+ LEA r5, $$
+%define base r5-$$
+ mov r5m, picptrq
+%else
+cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
+ lea r8, [pb_mask]
+%define base r8-pb_mask
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ SPLATW m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+%if ARCH_X86_32
+ DECLARE_REG_TMP 0, 3
+%else
+ DECLARE_REG_TMP 9, 10
+%endif
+ mov t0d, r9m ; bdmax
+ sar t0d, 11 ; is_12bpc
+ inc t0d
+ mov t1d, r6d
+ imul t1d, t0d
+ dec t0d
+ SPLATW m5, [base+min+t1*2]
+ lea t0d, [t0d*3]
+ lea t0d, [r6d*2+t0d]
+ SPLATW m4, [base+max+t0*2]
+ SPLATW m2, r9m
+
+ pcmpeqw m1, m1
+ psraw m7, m2, 1 ; max_grain
+ pxor m1, m7 ; min_grain
+ SPLATD m6, [base+pd_16]
+
+ SCRATCH 1, 9, 0
+ SCRATCH 2, 10, 1
+ SCRATCH 3, 11, 2
+ SCRATCH 4, 12, 3
+ SCRATCH 5, 13, 4
+ SCRATCH 6, 14, 5
+ SCRATCH 7, 15, 6
+
+ mova m6, [base+pw_27_17_17_27] ; for horizontal filter
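+ ; The overlap blends below combine neighbouring grain with fixed weights, roughly:
+ ;   blended = clamp((27*g_outer + 17*g_inner + 16) >> 5, min_grain, max_grain)
+ ; with (17, 27) for the second overlapping row/column ((23, 22) in the chroma
+ ; vertical case); pd_16 supplies the +16 rounding term.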
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
+ DECLARE_REG_TMP 0
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
+ sby, see
+ DECLARE_REG_TMP 7
+%endif
+
+ mov sbyd, r8m
+ movzx t0d, byte [fg_dataq+FGData.overlap_flag]
+ test t0d, t0d
+ jz .no_vertical_overlap
+ test sbyd, sbyd
+ jnz .vertical_overlap
+.no_vertical_overlap:
+ mov dword r8m, t0d
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused1, unused2, see, src_bak
+%endif
+
+ lea src_bakq, [srcq+wq*2]
+ mov r9mp, src_bakq
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r4m, wq
+%endif
+
+.loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak
+%endif
+
+.loop_x_odd:
+ movzx hd, word r7m
+ mov grain_lutq, grain_lutmp
+.loop_y:
+ ; src
+ pand m0, m10, [srcq+ 0]
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4
+ vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4
+%else
+ vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4
+ vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4
+%endif
+ REPX {psrlw x, 8}, m2, m3
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq*2]
+ movu m5, [grain_lutq+offxyq*2+16]
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m2, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp ; src += stride
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y
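+ ; Each iteration of .loop_y above is, per pixel, roughly (illustrative sketch):
+ ;   noise = round2(scaling[src] * grain_lut[offy + y][offx + x], scaling_shift)
+ ;   dst   = clamp(src + noise, min, max)   ; range depends on clip_to_restricted_range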
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ btc dword r8m, 2
+ jc .next_blk
+ add offxyd, 16
+ test dword r8m, 2
+ jz .loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r12d, 16 ; top_offxy += 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.next_blk:
+ test dword r8m, 1
+ jz .loop_x
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jnz .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+%if ARCH_X86_32
+ add offxyd, 16
+ mov [rsp+8*mmsize+0*gprsize], offxyd
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+ mov seed, r3m
+%endif
+
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy
+%endif
+
+ mov hd, dword r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_h_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m5, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+0*gprsize]
+ movd m4, [grain_lutq+r5*2]
+%else
+ movd m4, [grain_lutq+left_offxyq*2]
+%endif
+ punpcklwd m4, m5
+ pmaddwd m4, m6
+ paddd m4, m14
+ psrad m4, 5
+ packssdw m4, m4
+ pminsw m4, m15
+ pmaxsw m4, m9
+ shufps m4, m5, q3210
+
+ ; src
+ pand m0, m10, [srcq+ 0]
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5
+ vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5
+%else
+ vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5
+ vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5
+%endif
+ REPX {psrlw x, 8}, m2, m3
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ movu m5, [grain_lutq+offxyq*2+16]
+ REPX {pmullw x, m11}, m2, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y_h_overlap
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ or dword r8m, 4
+ add offxyd, 16
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jz .loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r12d, 16 ; top_offxy += 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end:
+ RET
+
+.vertical_overlap:
+ or t0d, 2
+ mov r8m, t0d
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
+ sby, see
+%endif
+
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+ DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul t0d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add t0d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and t0d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, t0d
+%if ARCH_X86_32
+ xor sbyd, seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused1, unused2, see, src_bak
+%endif
+
+ lea src_bakq, [srcq+wq*2]
+ mov r9mp, src_bakq
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r4m, wq
+%endif
+
+.loop_x_v_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+ SPLATD m7, [base+pw_27_17_17_27]
+ mov seed, r3m
+%else
+ SPLATD m7, [pw_27_17_17_27]
+%endif
+
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, unused, top_offxy
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, unused, top_offxy
+%endif
+
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+.loop_x_odd_v_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
+ mov hd, dword r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_v_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+1*gprsize]
+ movu m2, [grain_lutq+r5*2]
+%else
+ movu m2, [grain_lutq+top_offxyq*2]
+%endif
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ REPX {pmaddwd x, m7}, m4, m2
+ REPX {paddd x, m14}, m4, m2
+ REPX {psrad x, 5}, m4, m2
+ packssdw m2, m4
+ pminsw m2, m15
+ pmaxsw m2, m9
+ movu m4, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m3, [grain_lutq+r5*2+16]
+%else
+ movu m3, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpckhwd m5, m3, m4
+ punpcklwd m3, m4
+ REPX {pmaddwd x, m7}, m5, m3
+ REPX {paddd x, m14}, m5, m3
+ REPX {psrad x, 5}, m5, m3
+ packssdw m3, m5
+ pminsw m3, m15
+ pmaxsw m3, m9
+
+ ; src
+ pand m0, m10, [srcq+ 0] ; m0-1: src as word
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5
+%else
+ vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5
+%endif
+ psrlw m4, 8
+ pmullw m4, m11
+ pmulhrsw m4, m2
+%if ARCH_X86_32
+ vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2
+%else
+ vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2
+%endif
+ psrlw m5, 8
+ pmullw m5, m11
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp
+ add grain_lutq, 82*2
+ dec hw
+ jz .end_y_v_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
+ xor hd, 0x10000
+ test hd, 0x10000
+ jnz .loop_y_v_overlap
+ jmp .loop_y
+
+.end_y_v_overlap:
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ btc dword r8m, 2
+ jc .next_blk_v
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ add offxyd, 16
+ jmp .loop_x_odd_v_overlap
+
+.next_blk_v:
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+
+.loop_x_hv_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r0, [rsp+8*mmsize+1*gprsize]
+ add r3, 16
+ add r0, 16
+ mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
+ mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy
+
+ mov seed, r3m
+ xor r0, r0
+%else
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+%endif
+
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
+
+ movzx hd, word r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m2, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
+ mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
+ movu m4, [grain_lutq+r0*2]
+ movd m5, [grain_lutq+r5*2]
+ mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
+ movd m3, [grain_lutq+r5*2]
+%else
+ movu m4, [grain_lutq+top_offxyq*2]
+ movd m5, [grain_lutq+left_offxyq*2]
+ movd m3, [grain_lutq+topleft_offxyq*2]
+%endif
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklwd m5, m2
+ punpcklwd m3, m4
+ REPX {pmaddwd x, m6}, m5, m3
+ REPX {paddd x, m14}, m5, m3
+ REPX {psrad x, 5}, m5, m3
+ packssdw m5, m3
+ pminsw m5, m15
+ pmaxsw m5, m9
+ shufps m3, m5, m2, q3210
+ shufps m5, m4, q3232
+ ; followed by v interpolation (top | cur -> cur)
+ movu m0, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m1, [grain_lutq+r0*2+16]
+%else
+ movu m1, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpcklwd m2, m5, m3
+ punpckhwd m5, m3
+ punpcklwd m3, m1, m0
+ punpckhwd m1, m0
+ REPX {pmaddwd x, m7}, m2, m5, m3, m1
+ REPX {paddd x, m14}, m2, m5, m3, m1
+ REPX {psrad x, 5}, m2, m5, m3, m1
+ packssdw m2, m5
+ packssdw m3, m1
+ REPX {pminsw x, m15}, m2, m3
+ REPX {pmaxsw x, m9}, m2, m3
+
+ ; src
+ pand m0, m10, [srcq+ 0]
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5
+%else
+ vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5
+%endif
+ psrlw m4, 8
+ pmullw m4, m11
+ pmulhrsw m2, m4
+%if ARCH_X86_32
+ vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4
+%else
+ vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4
+%endif
+ psrlw m5, 8
+ pmullw m5, m11
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp
+ add grain_lutq, 82*2
+ dec hw
+ jz .end_y_hv_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
+ xor hd, 0x10000
+ test hd, 0x10000
+ jnz .loop_y_hv_overlap
+ jmp .loop_y_h_overlap
+
+.end_y_hv_overlap:
+ or dword r8m, 4
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov r5, r5m
+ add offxyd, 16
+ add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ add offxyd, 16
+ add top_offxyd, 16
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end_hv:
+ RET
+%if ARCH_X86_32
+ DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+%endif
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+INIT_XMM ssse3
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \
+ tmp, src, scaling, h, fg_data, picptr, unused
+ mov r0, r0m
+ mov r1, r1m
+ mov r2, r2m
+ mov r4, r3m
+ mov r3, r4m
+ mov r5, r5m
+%define r0m [rsp+8*mmsize+ 3*gprsize]
+%define r1m [rsp+8*mmsize+ 4*gprsize]
+%define r2m [rsp+8*mmsize+ 5*gprsize]
+%define r3m [rsp+8*mmsize+ 6*gprsize]
+%define r4m [rsp+8*mmsize+ 7*gprsize]
+%define r5m [rsp+8*mmsize+ 8*gprsize]
+ mov r0m, r0
+ mov r2m, r2
+ mov r4m, r3
+ mov r5m, r5
+
+ mov r0, r6m
+ mov r2, r7m
+ mov r3, r8m
+ mov r5, r9m
+%define r6m [rsp+8*mmsize+ 9*gprsize]
+%define r7m [rsp+8*mmsize+10*gprsize]
+%define r8m [rsp+8*mmsize+11*gprsize]
+%define r9m [rsp+8*mmsize+12*gprsize]
+ mov r6m, r0
+ mov r7m, r2
+ mov r8m, r3
+ mov r9m, r5
+
+ mov r2, r10m
+ mov r3, r11m
+ mov r5, r12m
+ mov r0, r13m
+%define r10m [rsp+8*mmsize+13*gprsize]
+%define r11m [rsp+8*mmsize+14*gprsize]
+%define r12m [rsp+8*mmsize+15*gprsize]
+ mov r10m, r2
+ mov r11m, r3
+ mov r12m, r5
+
+ SPLATW m2, r13m
+%else
+cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
+ tmp, src, scaling, h, fg_data, picptr, unused
+ mov srcq, srcm
+ mov fg_dataq, r3m
+%endif
+ LEA r5, $$
+%define base r5-$$
+
+ DECLARE_REG_TMP 0, 2, 3
+%else
+cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%define base r8-pb_mask
+ lea r8, [pb_mask]
+
+ DECLARE_REG_TMP 9, 10, 11
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ SPLATW m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+%if STACK_ALIGNMENT >= mmsize
+ mov t0d, r13m ; bdmax
+%endif
+ sar t0d, 11 ; is_12bpc
+ inc t0d
+ mov t1d, r6d
+ imul t1d, t0d
+ dec t0d
+ SPLATW m5, [base+min+t1*2]
+ lea t1d, [t0d*3]
+ mov t2d, r12m
+ inc t2d
+ imul r6d, t2d
+ add t1d, r6d
+ SPLATW m4, [base+max+t1*2]
+%if STACK_ALIGNMENT >= mmsize
+ SPLATW m2, r13m
+%endif
+
+ SCRATCH 2, 10, 2
+ SCRATCH 3, 11, 3
+ SCRATCH 4, 12, 4
+ SCRATCH 5, 13, 5
+
+%define mzero m7
+
+%if %3
+ SPLATD m2, [base+pw_23_22]
+%endif
+
+%if ARCH_X86_32
+ mov scalingq, r5m
+ mov r5m, r5
+%else
+ mov r13mp, strideq
+%endif
+
+ pcmpeqw m0, m0
+ psraw m1, m10, 1
+ pxor m0, m1
+
+ SCRATCH 0, 8, 0
+ SCRATCH 1, 9, 1
+
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+
+ DECLARE_REG_TMP 0
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+
+ DECLARE_REG_TMP 9
+%endif
+
+%if %1
+ mov r6d, r11m
+ SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4]
+ SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
+ punpcklwd m6, m1, m0
+ SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4]
+ SPLATD m7, [base+pw_4+t0*4]
+ pmullw m5, m7
+%else
+ SPLATD m6, [base+pd_16]
+%if %2
+ mova m5, [base+pw_23_22]
+%else
+ mova m5, [base+pw_27_17_17_27]
+%endif
+%endif
+
+ SCRATCH 6, 14, 6
+ SCRATCH 5, 15, 7
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 0
+%else
+ DECLARE_REG_TMP 7
+%endif
+
+ mov sbyd, r8m
+ mov t0d, [fg_dataq+FGData.overlap_flag]
+ test t0d, t0d
+ jz %%no_vertical_overlap
+ test sbyd, sbyd
+ jnz %%vertical_overlap
+
+%%no_vertical_overlap:
+ mov r8m, t0d
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
+
+ mov dstq, r0mp
+ mov lumaq, r9mp
+ mov wq, r4m
+ lea r3, [srcq+wq*2]
+ mov r1mp, r3
+ lea r3, [dstq+wq*2]
+ mov r11mp, r3
+ lea r3, [lumaq+wq*(2<<%2)]
+ mov r12mp, r3
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused2, unused3, see, unused4, unused5, unused6, luma, lstride
+
+ mov lstrideq, r10mp
+%if %3
+ add lstrideq, lstrideq
+%endif
+ mov lumaq, r9mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r10mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+%endif
+ neg wq
+%if ARCH_X86_32
+ mov r4mp, wq
+%endif
+
+%%loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, unused2, unused3, luma, lstride
+
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, unused2, unused3, luma, lstride
+%endif
+
+%if %2 == 0
+%%loop_x_odd:
+%endif
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y:
+ ; src
+ mova m0, [srcq]
+ mova m1, [srcq+16] ; m0-1: src as word
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+
+ mov lumaq, r9m
+%endif
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+(16<<%2)]
+%if %2
+ phaddw m4, [lumaq+16]
+ phaddw m6, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9m, lumaq
+%endif
+%if %2
+ pavgw m4, mzero
+ pavgw m6, mzero
+%endif
+
+%if %1
+ punpckhwd m3, m4, m0
+ punpcklwd m4, m0
+ punpckhwd m5, m6, m1
+ punpcklwd m6, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m3, m4, m5, m6
+ REPX {psrad x, 6}, m3, m4, m5, m6
+ packssdw m4, m3
+ packssdw m6, m5
+ REPX {paddw x, m15}, m4, m6
+ REPX {pmaxsw x, mzero}, m4, m6
+ REPX {pminsw x, m10}, m4, m6 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m4, m6
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1
+%else
+ vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1
+%endif
+ REPX {psrlw x, 8}, m3, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq*2]
+ movu m6, [grain_lutq+offxyq*2+16]
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m3, m5
+ pmulhrsw m4, m3
+ pmulhrsw m6, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m6
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+ dec hd
+ jg %%loop_y
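+ ; Same per-pixel model as the luma function: noise = round2(scaling[val] * grain,
+ ; scaling_shift) and dst = clamp(src + noise, min, max). The scaling index 'val'
+ ; comes from the (averaged, if subsampled) luma plane, and when
+ ; chroma_scaling_from_luma is disabled it is first remapped, roughly:
+ ;   val = clip_pixel(((luma*uv_luma_mult + chroma*uv_mult) >> 6)
+ ;                    + uv_offset * (4 for 10 bpc, 16 for 12 bpc))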
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma
+
+ mov wq, r4mp
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov r0m, dstq
+ mov r9m, lumaq
+ mov r4m, wq
+%endif
+%if %2 == 0
+ btc dword r8m, 2
+ jc %%next_blk
+ add offxyd, 16
+ test dword r8m, 2
+ jz %%loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%%next_blk:
+%endif
+ test dword r8m, 1
+ je %%loop_x
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jnz %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+%if ARCH_X86_32
+ add offxyd, 16
+ mov [rsp+8*mmsize+0*gprsize], offxyd
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
+
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, luma, lstride
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, luma, lstride
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_h_overlap:
+ mova m0, [srcq]
+ mova m1, [srcq+16]
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+ mov lumaq, r9m
+%endif
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+(16<<%2)]
+%if %2
+ phaddw m4, [lumaq+16]
+ phaddw m6, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9m, lumaq
+%endif
+%if %2
+ pavgw m4, mzero
+ pavgw m6, mzero
+%endif
+
+%if %1
+ punpckhwd m3, m4, m0
+ punpcklwd m4, m0
+ punpckhwd m5, m6, m1
+ punpcklwd m6, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m3, m4, m5, m6
+ REPX {psrad x, 6}, m3, m4, m5, m6
+ packssdw m4, m3
+ packssdw m6, m5
+ REPX {paddw x, m15}, m4, m6
+ REPX {pmaxsw x, mzero}, m4, m6
+ REPX {pminsw x, m10}, m4, m6 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m4, m6
+%endif
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m7, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+0*gprsize]
+ movd m5, [grain_lutq+r5*2]
+%else
+ movd m5, [grain_lutq+left_offxyq*2+ 0]
+%endif
+ punpcklwd m5, m7 ; {left0, cur0}
+%if %1
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %2
+ pmaddwd m5, [PIC_ptr(pw_23_22)]
+%else
+ pmaddwd m5, [PIC_ptr(pw_27_17_17_27)]
+%endif
+ paddd m5, [PIC_ptr(pd_16)]
+%else
+ pmaddwd m5, m15
+ paddd m5, m14
+%endif
+ psrad m5, 5
+ packssdw m5, m5
+ pmaxsw m5, m8
+ pminsw m5, m9
+ shufps m5, m7, q3210
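+ ; m5 = current grain row 0-7 with the leftmost column(s) blended against the
+ ; previous block: round2(left*w0 + cur*w1, 5); the 0/32 weight pairs pass the
+ ; remaining columns through unchanged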
+ movu m3, [grain_lutq+offxyq*2+16]
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1
+%else
+ vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1
+ vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1
+%endif
+ REPX {psrlw x, 8}, m7, m4
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m7, m4
+ pmulhrsw m5, m7
+ pmulhrsw m3, m4
+
+ ; dst = clip_pixel(src + noise)
+ paddw m0, m5
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+ dec hd
+ jg %%loop_y_h_overlap
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
+ mov wq, r4mp
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov r0mp, dstq
+ mov r9mp, lumaq
+ mov r4m, wq
+%endif
+
+%if %2
+ ; r8m holds the overlap flags; bit 1 = vertical (sby) overlap
+ test dword r8m, 2
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+%else
+ or dword r8m, 4
+ add offxyd, 16
+
+ ; r8m holds the overlap flags; bit 1 = vertical (sby) overlap
+ test dword r8m, 2
+ jz %%loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxy += 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%end:
+ RET
+
+%%vertical_overlap:
+ or t0d, 2
+ mov r8m, t0d
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
+ sby, see, unused1, unused2, unused3, lstride
+%endif
+
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+
+ DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul t0d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add t0d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and t0d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, t0d
+%if ARCH_X86_32
+ xor sbyd, seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
+
+ mov r3m, seed
+ mov dstq, r0mp
+ mov lumaq, r9mp
+ mov wq, r4m
+ lea r3, [srcq+wq*2]
+ mov r1mp, r3
+ lea r3, [dstq+wq*2]
+ mov r11mp, r3
+ lea r3, [lumaq+wq*(2<<%2)]
+ mov r12mp, r3
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused1, unused2, see, unused3, unused4, unused5, luma, lstride
+
+ mov lstrideq, r10mp
+%if %3
+ add lstrideq, lstrideq
+%endif
+ mov lumaq, r9mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r10mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+%endif
+ neg wq
+%if ARCH_X86_32
+ mov r4m, wq
+%endif
+
+%%loop_x_v_overlap:
+%if ARCH_X86_32
+ mov seed, r3m
+ xor t0d, t0d
+%else
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
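+ ; both 16-bit seeds (current row in the high word, top row in the low word)
+ ; are advanced through the LFSR in one pass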
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, top_offxy, unused2, luma, lstride
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, top_offxy, unused2, luma, lstride
+%endif
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+%if %2 == 0
+%%loop_x_odd_v_overlap:
+%endif
+%if %3 == 0
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)]
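+ ; m2 = {27, 17} vertical overlap weights for the first overlapped row;
+ ; swapped to {17, 27} further down for the second row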
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_v_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy
+ movu m5, [grain_lutq+r0*2]
+%else
+ movu m5, [grain_lutq+top_offxyq*2]
+%endif
+ punpckhwd m7, m5, m3
+ punpcklwd m5, m3 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m7, m5
+%if %1
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
+%else
+ REPX {paddd x, m14}, m7, m5
+%endif
+ REPX {psrad x, 5}, m7, m5
+ packssdw m3, m5, m7
+ pmaxsw m3, m8
+ pminsw m3, m9
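+ ; m3 = vertically blended grain for the first 8 columns:
+ ; round2(top*w0 + cur*w1, 5), clamped to the grain min/max in m8/m9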
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m5, [grain_lutq+r0*2+16]
+%else
+ movu m5, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpckhwd m7, m5, m4
+ punpcklwd m5, m4 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m7, m5
+%if %1
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
+%else
+ REPX {paddd x, m14}, m7, m5
+%endif
+ REPX {psrad x, 5}, m7, m5
+ packssdw m4, m5, m7
+ pmaxsw m4, m8
+ pminsw m4, m9
+
+ ; src
+ mova m0, [srcq]
+ mova m1, [srcq+16]
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+
+ mov lumaq, r9mp
+%endif
+ mova m5, [lumaq+ 0]
+ mova m6, [lumaq+(16<<%2)]
+%if %2
+ phaddw m5, [lumaq+16]
+ phaddw m6, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+%if %2
+ pavgw m5, mzero
+ pavgw m6, mzero
+%endif
+
+%if %1
+ punpckhwd m7, m5, m0
+ punpcklwd m5, m0
+ REPX {pmaddwd x, m14}, m7, m5
+ REPX {psrad x, 6}, m7, m5
+ packssdw m5, m7
+ punpckhwd m7, m6, m1
+ punpcklwd m6, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m7, m6
+ REPX {psrad x, 6}, m7, m6
+ packssdw m6, m7
+ pxor mzero, mzero
+ REPX {paddw x, m15}, m5, m6
+ REPX {pmaxsw x, mzero}, m5, m6
+ REPX {pminsw x, m10}, m5, m6 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m5, m6
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1
+%else
+ vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m7, m5
+ pmulhrsw m3, m7
+ pmulhrsw m4, m5
+
+ ; dst = clip_pixel(src + noise)
+ paddw m0, m3
+ paddw m1, m4
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+ dec hw
+ jle %%end_y_v_overlap
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+%if %3
+ jmp %%loop_y
+%else
+ btc hd, 16
+ jc %%loop_y
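+ ; bit 16 of hd marks that the first overlapped row is done: the second row
+ ; reruns this loop with the swapped {17, 27} weights, after which the plain
+ ; %%loop_y takes over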
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4]
+ jmp %%loop_y_v_overlap
+%endif
+
+%%end_y_v_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov r0mp, dstq
+ mov r9mp, lumaq
+ mov r4m, wq
+%endif
+
+%if %2
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to %%loop_x_v_overlap, and instead always fall through to
+ ; h+v overlap
+%else
+ btc dword r8m, 2
+ jc %%loop_x_hv_overlap
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%loop_x_hv_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut
+
+ mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy
+ add offxyd, 16
+ add t0d, 16
+ mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd
+ mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
+
+ mov seed, r3m
+ xor t0d, t0d
+%else
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+%endif
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+%if %3 == 0
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)]
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
+ mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
+ movd m5, [grain_lutq+r5*2]
+%else
+ movd m5, [grain_lutq+left_offxyq*2]
+%endif
+ movu m7, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+2*gprsize]
+ movu m4, [grain_lutq+r0*2]
+%if %2
+ pinsrw m5, [grain_lutq+r5*2], 2
+%else
+ movd m3, [grain_lutq+r5*2]
+%endif
+%else
+ movu m4, [grain_lutq+top_offxyq*2]
+%if %2
+ pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
+%else
+ movd m3, [grain_lutq+topleft_offxyq*2]
+%endif
+%endif
+%if %2 == 0
+ punpckldq m5, m3
+%endif
+ punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 }
+ punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 }
+%if %1
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %2
+ movddup m0, [PIC_ptr(pw_23_22)]
+%else
+ movddup m0, [PIC_ptr(pw_27_17_17_27)]
+%endif
+%else
+ pshufd m0, m15, q1010
+%endif
+ pmaddwd m5, m0
+%if %1
+ paddd m5, [PIC_ptr(pd_16)]
+%else
+ paddd m5, m14
+%endif
+ psrad m5, 5
+ packssdw m5, m5
+ pmaxsw m5, m8
+ pminsw m5, m9
+ shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3
+ shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter
+ shufps m5, m4, q3231 ; top0-7 post-h_filter
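+ ; the single pmaddwd above blended left into cur0 and topleft into top0 in
+ ; one go; m3/m5 now hold the horizontally filtered current and top grain
+ ; rows, which get blended vertically below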
+
+ punpckhwd m7, m5, m3
+ punpcklwd m5, m3 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m7, m5
+%if %1
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7
+%else
+ REPX {paddd x, m14}, m5, m7
+%endif
+ REPX {psrad x, 5}, m5, m7
+ packssdw m3, m5, m7
+ pmaxsw m3, m8
+ pminsw m3, m9
+
+ ; right half
+ movu m4, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m0, [grain_lutq+r0*2+16]
+%else
+ movu m0, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpckhwd m1, m0, m4
+ punpcklwd m0, m4 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m1, m0
+%if %1
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0
+%else
+ REPX {paddd x, m14}, m1, m0
+%endif
+ REPX {psrad x, 5}, m1, m0
+ packssdw m4, m0, m1
+ pmaxsw m4, m8
+ pminsw m4, m9
+
+ ; src
+ mova m0, [srcq]
+ mova m1, [srcq+16]
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+
+ mov lumaq, r9mp
+%endif
+ mova m6, [lumaq+ 0]
+ mova m5, [lumaq+(16<<%2)]
+%if %2
+ phaddw m6, [lumaq+16]
+ phaddw m5, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+%if %2
+ pavgw m6, mzero
+ pavgw m5, mzero
+%endif
+
+%if %1
+ punpckhwd m7, m6, m0
+ punpcklwd m6, m0
+ REPX {pmaddwd x, m14}, m7, m6
+ REPX {psrad x, 6}, m7, m6
+ packssdw m6, m7
+ punpckhwd m7, m5, m1
+ punpcklwd m5, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m7, m5
+ REPX {psrad x, 6}, m7, m5
+ packssdw m5, m7
+ pxor mzero, mzero
+ REPX {paddw x, m15}, m6, m5
+ REPX {pmaxsw x, mzero}, m6, m5
+ REPX {pminsw x, m10}, m6, m5 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m6, m5
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1
+%else
+%if %3 == 0
+ ; register shortage :)
+ push r12
+%endif
+ vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1
+ vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1
+%if %3 == 0
+ pop r12
+%endif
+%endif
+ REPX {psrlw x, 8}, m7, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m7, m6
+ pmulhrsw m3, m7
+ pmulhrsw m4, m6
+
+ ; dst = clip_pixel(src + noise)
+ paddw m0, m3
+ paddw m1, m4
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+ dec hw
+%if %3
+ jg %%loop_y_h_overlap
+%else
+ jle %%end_y_hv_overlap
+ btc hd, 16
+ jc %%loop_y_h_overlap
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4]
+ jmp %%loop_y_hv_overlap
+%%end_y_hv_overlap:
+%endif
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov dstmp, dstq
+ mov r9mp, lumaq
+ mov r4m, wq
+%endif
+%if %2
+ jmp %%loop_x_hv_overlap
+%else
+ or dword r8m, 4
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxy += 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%end_hv:
+ RET
+%endmacro
+
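+ ; the loop body is instantiated twice: the default entry (%1 = 1) merges
+ ; luma and chroma into the scaling index, while the .csfl entry (%1 = 0)
+ ; is used when chroma scaling is taken from the luma plane directly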
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0